Diffstat (limited to 'contrib/llvm/lib/Target/PowerPC')
81 files changed, 60643 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
new file mode 100644
index 0000000..220c70a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -0,0 +1,1971 @@
+//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static const MCPhysReg RRegs[32] = {
+  PPC::R0, PPC::R1, PPC::R2, PPC::R3,
+  PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+  PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static const MCPhysReg RRegsNoR0[32] = {
+  PPC::ZERO,
+  PPC::R1, PPC::R2, PPC::R3,
+  PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+  PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static const MCPhysReg XRegs[32] = {
+  PPC::X0, PPC::X1, PPC::X2, PPC::X3,
+  PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+  PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static const MCPhysReg XRegsNoX0[32] = {
+  PPC::ZERO8,
+  PPC::X1, PPC::X2, PPC::X3,
+  PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+  PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static const MCPhysReg FRegs[32] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+static const MCPhysReg VRegs[32] = {
+  PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+  PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+  PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+static const MCPhysReg VSRegs[64] = {
+  PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3,
+  PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7,
+  PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11,
+  PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+  PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+  PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+  PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+  PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+  PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3,
+  PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7,
+  PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11,
+  PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15,
+  PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19,
+  PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23,
+  PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
+  PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
+};
+static const MCPhysReg VSFRegs[64] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+  PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+  PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static const MCPhysReg VSSRegs[64] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+  PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+  PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static unsigned QFRegs[32] = {
+  PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
+  PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
+  PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11,
+  PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15,
+  PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19,
+  PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23,
+  PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27,
+  PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
+};
+static const MCPhysReg CRBITRegs[32] = {
+  PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+  PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+  PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+  PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+  PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+  PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+  PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+  PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+static const MCPhysReg CRRegs[8] = {
+  PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+  PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+// Evaluate an expression containing condition register
+// or condition register field symbols. Returns positive
+// value on success, or -1 on error.
+static int64_t
+EvaluateCRExpr(const MCExpr *E) {
+  switch (E->getKind()) {
+  case MCExpr::Target:
+    return -1;
+
+  case MCExpr::Constant: {
+    int64_t Res = cast<MCConstantExpr>(E)->getValue();
+    return Res < 0 ? -1 : Res;
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    StringRef Name = SRE->getSymbol().getName();
+
+    if (Name == "lt") return 0;
+    if (Name == "gt") return 1;
+    if (Name == "eq") return 2;
+    if (Name == "so") return 3;
+    if (Name == "un") return 3;
+
+    if (Name == "cr0") return 0;
+    if (Name == "cr1") return 1;
+    if (Name == "cr2") return 2;
+    if (Name == "cr3") return 3;
+    if (Name == "cr4") return 4;
+    if (Name == "cr5") return 5;
+    if (Name == "cr6") return 6;
+    if (Name == "cr7") return 7;
+
+    return -1;
+  }
+
+  case MCExpr::Unary:
+    return -1;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    int64_t LHSVal = EvaluateCRExpr(BE->getLHS());
+    int64_t RHSVal = EvaluateCRExpr(BE->getRHS());
+    int64_t Res;
+
+    if (LHSVal < 0 || RHSVal < 0)
+      return -1;
+
+    switch (BE->getOpcode()) {
+    default: return -1;
+    case MCBinaryExpr::Add: Res = LHSVal + RHSVal; break;
+    case MCBinaryExpr::Mul: Res = LHSVal * RHSVal; break;
+    }
+
+    return Res < 0 ? -1 : Res;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
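A worked example of the evaluation rules above (an illustrative sketch, not part of the patch): for an operand written as "4*cr3+eq", EvaluateCRExpr resolves the symbols cr3 and eq to 3 and 2, folds the multiply and the add, and returns bit number 14, which getCRBit() later uses to index CRBITRegs[] and select PPC::CR3EQ.

// Illustrative sketch only: the arithmetic EvaluateCRExpr performs
// for the operand "4*cr3+eq".
#include <cassert>
int main() {
  int cr3 = 3, eq = 2;     // SymbolRef values per the table above
  int bit = 4 * cr3 + eq;  // MCBinaryExpr::Mul, then MCBinaryExpr::Add
  assert(bit == 14);       // CRBITRegs[14] == PPC::CR3EQ
  return 0;
}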
+
+namespace {
+
+struct PPCOperand;
+
+class PPCAsmParser : public MCTargetAsmParser {
+  const MCInstrInfo &MII;
+  bool IsPPC64;
+  bool IsDarwin;
+
+  void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); }
+
+  bool isPPC64() const { return IsPPC64; }
+  bool isDarwin() const { return IsDarwin; }
+
+  bool MatchRegisterName(const AsmToken &Tok,
+                         unsigned &RegNo, int64_t &IntVal);
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
+                                        PPCMCExpr::VariantKind &Variant);
+  const MCExpr *FixupVariantKind(const MCExpr *E);
+  bool ParseExpression(const MCExpr *&EVal);
+  bool ParseDarwinExpression(const MCExpr *&EVal);
+
+  bool ParseOperand(OperandVector &Operands);
+
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+  bool ParseDirectiveTC(unsigned Size, SMLoc L);
+  bool ParseDirectiveMachine(SMLoc L);
+  bool ParseDarwinDirectiveMachine(SMLoc L);
+  bool ParseDirectiveAbiVersion(SMLoc L);
+  bool ParseDirectiveLocalEntry(SMLoc L);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  void ProcessInstruction(MCInst &Inst, const OperandVector &Ops);
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "PPCGenAsmMatcher.inc"
+
+  /// }
+
+
+public:
+  PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &,
+               const MCInstrInfo &MII, const MCTargetOptions &Options)
+    : MCTargetAsmParser(Options, STI), MII(MII) {
+    // Check for 64-bit vs. 32-bit pointer mode.
+    Triple TheTriple(STI.getTargetTriple());
+    IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+               TheTriple.getArch() == Triple::ppc64le);
+    IsDarwin = TheTriple.isMacOSX();
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                    MCSymbolRefExpr::VariantKind,
+                                    MCContext &Ctx) override;
+};
+
+/// PPCOperand - Instances of this class represent a parsed PowerPC machine
+/// instruction.
+struct PPCOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Immediate,
+    ContextImmediate,
+    Expression,
+    TLSRegister
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  bool IsPPC64;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct ImmOp {
+    int64_t Val;
+  };
+
+  struct ExprOp {
+    const MCExpr *Val;
+    int64_t CRVal;     // Cached result of EvaluateCRExpr(Val)
+  };
+
+  struct TLSRegOp {
+    const MCSymbolRefExpr *Sym;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct ImmOp Imm;
+    struct ExprOp Expr;
+    struct TLSRegOp TLSReg;
+  };
+
+  PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
+  PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    IsPPC64 = o.IsPPC64;
+    switch (Kind) {
+    case Token:
+      Tok = o.Tok;
+      break;
+    case Immediate:
+    case ContextImmediate:
+      Imm = o.Imm;
+      break;
+    case Expression:
+      Expr = o.Expr;
+      break;
+    case TLSRegister:
+      TLSReg = o.TLSReg;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  /// isPPC64 - True if this operand is for an instruction in 64-bit mode.
+  bool isPPC64() const { return IsPPC64; }
+
+  int64_t getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+  int64_t getImmS16Context() const {
+    assert((Kind == Immediate || Kind == ContextImmediate) && "Invalid access!");
+    if (Kind == Immediate)
+      return Imm.Val;
+    return static_cast<int16_t>(Imm.Val);
+  }
+  int64_t getImmU16Context() const {
+    assert((Kind == Immediate || Kind == ContextImmediate) && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getExpr() const {
+    assert(Kind == Expression && "Invalid access!");
+    return Expr.Val;
+  }
+
+  int64_t getExprCRVal() const {
+    assert(Kind == Expression && "Invalid access!");
+    return Expr.CRVal;
+  }
+
+  const MCExpr *getTLSReg() const {
+    assert(Kind == TLSRegister && "Invalid access!");
+    return TLSReg.Sym;
+  }
+
+  unsigned getReg() const override {
+    assert(isRegNumber() && "Invalid access!");
+    return (unsigned) Imm.Val;
+  }
+
+  unsigned getVSReg() const {
+    assert(isVSRegNumber() && "Invalid access!");
+    return (unsigned) Imm.Val;
+  }
+
+  unsigned getCCReg() const {
+    assert(isCCRegNumber() && "Invalid access!");
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+  }
+
+  unsigned getCRBit() const {
+    assert(isCRBitNumber() && "Invalid access!");
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+  }
+
+  unsigned getCRBitMask() const {
+    assert(isCRBitMask() && "Invalid access!");
+    return 7 - countTrailingZeros<uint64_t>(Imm.Val);
+  }
+
+  bool isToken() const override { return Kind == Token; }
+  bool isImm() const override { return Kind == Immediate || Kind == Expression; }
+  bool isU1Imm() const { return Kind == Immediate && isUInt<1>(getImm()); }
+  bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
+  bool isU3Imm() const { return Kind == Immediate && isUInt<3>(getImm()); }
+  bool isU4Imm() const { return Kind == Immediate && isUInt<4>(getImm()); }
+  bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
+  bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
+  bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); }
+  bool isU6ImmX2() const { return Kind == Immediate &&
+                                  isUInt<6>(getImm()) &&
+                                  (getImm() & 1) == 0; }
+  bool isU7ImmX4() const { return Kind == Immediate &&
+                                  isUInt<7>(getImm()) &&
+                                  (getImm() & 3) == 0; }
+  bool isU8ImmX8() const { return Kind == Immediate &&
+                                  isUInt<8>(getImm()) &&
+                                  (getImm() & 7) == 0; }
+
+  bool isU10Imm() const { return Kind == Immediate && isUInt<10>(getImm()); }
+  bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); }
+  bool isU16Imm() const {
+    switch (Kind) {
+    case Expression:
+      return true;
+    case Immediate:
+    case ContextImmediate:
+      return isUInt<16>(getImmU16Context());
+    default:
+      return false;
+    }
+  }
+  bool isS16Imm() const {
+    switch (Kind) {
+    case Expression:
+      return true;
+    case Immediate:
+    case ContextImmediate:
+      return isInt<16>(getImmS16Context());
+    default:
+      return false;
+    }
+  }
+  bool isS16ImmX4() const { return Kind == Expression ||
+                                   (Kind == Immediate && isInt<16>(getImm()) &&
+                                    (getImm() & 3) == 0); }
+  bool isS17Imm() const {
+    switch (Kind) {
+    case Expression:
+      return true;
+    case Immediate:
+    case ContextImmediate:
+      return isInt<17>(getImmS16Context());
+    default:
+      return false;
+    }
+  }
+  bool isTLSReg() const { return Kind == TLSRegister; }
+  bool isDirectBr() const {
+    if (Kind == Expression)
+      return true;
+    if (Kind != Immediate)
+      return false;
+    // Operand must be 64-bit aligned, signed 27-bit immediate.
+    if ((getImm() & 3) != 0)
+      return false;
+    if (isInt<26>(getImm()))
+      return true;
+    if (!IsPPC64) {
+      // In 32-bit mode, large 32-bit quantities wrap around.
+      if (isUInt<32>(getImm()) && isInt<26>(static_cast<int32_t>(getImm())))
+        return true;
+    }
+    return false;
+  }
+  bool isCondBr() const { return Kind == Expression ||
+                                 (Kind == Immediate && isInt<16>(getImm()) &&
+                                  (getImm() & 3) == 0); }
+  bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
+  bool isVSRegNumber() const { return Kind == Immediate && isUInt<6>(getImm()); }
+  bool isCCRegNumber() const { return (Kind == Expression
+                                       && isUInt<3>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<3>(getImm())); }
+  bool isCRBitNumber() const { return (Kind == Expression
+                                       && isUInt<5>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<5>(getImm())); }
+  bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
+                                    isPowerOf2_32(getImm()); }
+  bool isMem() const override { return false; }
+  bool isReg() const override { return false; }
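The isDirectBr() predicate above encodes the direct-branch displacement rules: the target must be a word-aligned, signed 26-bit byte offset, and in 32-bit mode a large unsigned literal is still accepted when it wraps to an in-range negative displacement. A small self-contained sketch of that range check (illustrative, not part of the patch):

// Illustrative sketch of the isDirectBr() range check above.
#include <cassert>
#include <cstdint>
static bool isInt26(int64_t x) { return x >= -(1LL << 25) && x < (1LL << 25); }
int main() {
  assert(isInt26(0x01FFFFFC));           // largest aligned forward offset
  assert(!isInt26(0xFE000000LL));        // rejected as a 64-bit quantity...
  assert(isInt26((int32_t)0xFE000000u)); // ...accepted once wrapped in 32-bit mode
  return 0;
}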
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    llvm_unreachable("addRegOperands");
+  }
+
+  void addRegGPRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(RRegs[getReg()]));
+  }
+
+  void addRegGPRCNoR0Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(RRegsNoR0[getReg()]));
+  }
+
+  void addRegG8RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(XRegs[getReg()]));
+  }
+
+  void addRegG8RCNoX0Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(XRegsNoX0[getReg()]));
+  }
+
+  void addRegGxRCOperands(MCInst &Inst, unsigned N) const {
+    if (isPPC64())
+      addRegG8RCOperands(Inst, N);
+    else
+      addRegGPRCOperands(Inst, N);
+  }
+
+  void addRegGxRCNoR0Operands(MCInst &Inst, unsigned N) const {
+    if (isPPC64())
+      addRegG8RCNoX0Operands(Inst, N);
+    else
+      addRegGPRCNoR0Operands(Inst, N);
+  }
+
+  void addRegF4RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(FRegs[getReg()]));
+  }
+
+  void addRegF8RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(FRegs[getReg()]));
+  }
+
+  void addRegVRRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VRegs[getReg()]));
+  }
+
+  void addRegVSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VSRegs[getVSReg()]));
+  }
+
+  void addRegVSFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VSFRegs[getVSReg()]));
+  }
+
+  void addRegVSSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()]));
+  }
+
+  void addRegQFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+  }
+
+  void addRegQSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+  }
+
+  void addRegQBRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+  }
+
+  void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(CRBITRegs[getCRBit()]));
+  }
+
+  void addRegCRRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(CRRegs[getCCReg()]));
+  }
+
+  void addCRBitMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(CRRegs[getCRBitMask()]));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate)
+      Inst.addOperand(MCOperand::createImm(getImm()));
+    else
+      Inst.addOperand(MCOperand::createExpr(getExpr()));
+  }
+
+  void addS16ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    switch (Kind) {
+    case Immediate:
+      Inst.addOperand(MCOperand::createImm(getImm()));
+      break;
+    case ContextImmediate:
+      Inst.addOperand(MCOperand::createImm(getImmS16Context()));
+      break;
+    default:
+      Inst.addOperand(MCOperand::createExpr(getExpr()));
+      break;
+    }
+  }
+
+  void addU16ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    switch (Kind) {
+    case Immediate:
+      Inst.addOperand(MCOperand::createImm(getImm()));
+      break;
+    case ContextImmediate:
+      Inst.addOperand(MCOperand::createImm(getImmU16Context()));
+      break;
+    default:
+      Inst.addOperand(MCOperand::createExpr(getExpr()));
+      break;
+    }
+  }
+
+  void addBranchTargetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate)
+      Inst.addOperand(MCOperand::createImm(getImm() / 4));
+    else
+      Inst.addOperand(MCOperand::createExpr(getExpr()));
+  }
+
+  void addTLSRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createExpr(getTLSReg()));
+  }
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S,
+                                                 bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) {
+    // Allocate extra memory for the string and copy it.
+    // FIXME: This is incorrect, Operands are owned by unique_ptr with a default
+    // deleter which will destroy them by simply using "delete", not correctly
+    // calling operator delete on this extra memory after calling the dtor
+    // explicitly.
+    void *Mem = ::operator new(sizeof(PPCOperand) + Str.size());
+    std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token));
+    Op->Tok.Data = reinterpret_cast<const char *>(Op.get() + 1);
+    Op->Tok.Length = Str.size();
+    std::memcpy(const_cast<char *>(Op->Tok.Data), Str.data(), Str.size());
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E,
+                                               bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S,
+                                                SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Expression);
+    Op->Expr.Val = Val;
+    Op->Expr.CRVal = EvaluateCRExpr(Val);
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(TLSRegister);
+    Op->TLSReg.Sym = Sym;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateContextImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(ContextImmediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) {
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val))
+      return CreateImm(CE->getValue(), S, E, IsPPC64);
+
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS)
+        return CreateTLSReg(SRE, S, E, IsPPC64);
+
+    if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
+      int64_t Res;
+      if (TE->evaluateAsConstant(Res))
+        return CreateContextImm(Res, S, E, IsPPC64);
+    }
+
+    return CreateExpr(Val, S, E, IsPPC64);
+  }
+};
+
+} // end anonymous namespace.
+
+void PPCOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case Immediate:
+  case ContextImmediate:
+    OS << getImm();
+    break;
+  case Expression:
+    OS << *getExpr();
+    break;
+  case TLSRegister:
+    OS << *getTLSReg();
+    break;
+  }
+}
+
+static void
+addNegOperand(MCInst &Inst, MCOperand &Op, MCContext &Ctx) {
+  if (Op.isImm()) {
+    Inst.addOperand(MCOperand::createImm(-Op.getImm()));
+    return;
+  }
+  const MCExpr *Expr = Op.getExpr();
+  if (const MCUnaryExpr *UnExpr = dyn_cast<MCUnaryExpr>(Expr)) {
+    if (UnExpr->getOpcode() == MCUnaryExpr::Minus) {
+      Inst.addOperand(MCOperand::createExpr(UnExpr->getSubExpr()));
+      return;
+    }
+  } else if (const MCBinaryExpr *BinExpr = dyn_cast<MCBinaryExpr>(Expr)) {
+    if (BinExpr->getOpcode() == MCBinaryExpr::Sub) {
+      const MCExpr *NE = MCBinaryExpr::createSub(BinExpr->getRHS(),
+                                                 BinExpr->getLHS(), Ctx);
+      Inst.addOperand(MCOperand::createExpr(NE));
+      return;
+    }
+  }
+  Inst.addOperand(MCOperand::createExpr(MCUnaryExpr::createMinus(Expr, Ctx)));
+}
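addNegOperand() is what lets the subi/subis/subic aliases further down reuse the add opcodes: negating a constant immediate is exact, and -(A - B) is rewritten as (B - A) rather than wrapping the expression in a unary minus. So "subi r3, r3, 16" assembles exactly like "addi r3, r3, -16". The identities it relies on, as an illustrative sketch (not part of the patch):

// Illustrative sketch: the identities addNegOperand() relies on.
#include <cassert>
int main() {
  long A = 7, B = 19, Imm = 16;
  assert(-(A - B) == (B - A)); // a Sub node is flipped, not negated
  assert(3 - Imm == 3 + -Imm); // subi r,r,16 == addi r,r,-16
  return 0;
}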
+
+void PPCAsmParser::ProcessInstruction(MCInst &Inst,
+                                      const OperandVector &Operands) {
+  int Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case PPC::DCBTx:
+  case PPC::DCBTT:
+  case PPC::DCBTSTx:
+  case PPC::DCBTSTT: {
+    MCInst TmpInst;
+    TmpInst.setOpcode((Opcode == PPC::DCBTx || Opcode == PPC::DCBTT) ?
+                      PPC::DCBT : PPC::DCBTST);
+    TmpInst.addOperand(MCOperand::createImm(
+        (Opcode == PPC::DCBTx || Opcode == PPC::DCBTSTx) ? 0 : 16));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::DCBTCT:
+  case PPC::DCBTDS: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::DCBT);
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::DCBTSTCT:
+  case PPC::DCBTSTDS: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::DCBTST);
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::LAx: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::LA);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBI: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDI);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIS: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDIS);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIC: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDIC);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBICo: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLWI:
+  case PPC::EXTLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRWI:
+  case PPC::EXTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B + N));
+    TmpInst.addOperand(MCOperand::createImm(32 - N));
+    TmpInst.addOperand(MCOperand::createImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSLWI:
+  case PPC::INSLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSLWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - B));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRWI:
+  case PPC::INSRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - (B + N)));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRWI:
+  case PPC::ROTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::ROTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - N));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLWI:
+  case PPC::SLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SRWI:
+  case PPC::SRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - N));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRRWI:
+  case PPC::CLRRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLWI:
+  case PPC::CLRLSLWIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(B - N));
+    TmpInst.addOperand(MCOperand::createImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLDI:
+  case PPC::EXTLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRDI:
+  case PPC::EXTRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B + N));
+    TmpInst.addOperand(MCOperand::createImm(64 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRDI:
+  case PPC::INSRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRDI? PPC::RLDIMI : PPC::RLDIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(64 - (B + N)));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRDI:
+  case PPC::ROTRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::ROTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(64 - N));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLDI:
+  case PPC::SLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(63 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SRDI:
+  case PPC::SRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(64 - N));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRRDI:
+  case PPC::CLRRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(63 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLDI:
+  case PPC::CLRLSLDIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLDI? PPC::RLDIC : PPC::RLDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(B - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::RLWINMbm:
+  case PPC::RLWINMobm: {
+    unsigned MB, ME;
+    int64_t BM = Inst.getOperand(3).getImm();
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode == PPC::RLWINMbm ? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(MCOperand::createImm(MB));
+    TmpInst.addOperand(MCOperand::createImm(ME));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::RLWIMIbm:
+  case PPC::RLWIMIobm: {
+    unsigned MB, ME;
+    int64_t BM = Inst.getOperand(3).getImm();
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode == PPC::RLWIMIbm ? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0)); // The tied operand.
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(MCOperand::createImm(MB));
+    TmpInst.addOperand(MCOperand::createImm(ME));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::RLWNMbm:
+  case PPC::RLWNMobm: {
+    unsigned MB, ME;
+    int64_t BM = Inst.getOperand(3).getImm();
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode == PPC::RLWNMbm ? PPC::RLWNM : PPC::RLWNMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(MCOperand::createImm(MB));
+    TmpInst.addOperand(MCOperand::createImm(ME));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::MFTB: {
+    if (getSTI().getFeatureBits()[PPC::FeatureMFTB]) {
+      assert(Inst.getNumOperands() == 2 && "Expecting two operands");
+      Inst.setOpcode(PPC::MFSPR);
+    }
+    break;
+  }
+  }
+}
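ProcessInstruction() lowers the extended mnemonics to rotate-and-mask instructions; for instance slwi RA,RS,N becomes rlwinm RA,RS,N,0,31-N and srwi RA,RS,N becomes rlwinm RA,RS,32-N,N,31. A self-contained sketch of why the rotate-plus-mask form reproduces both shifts (illustrative, not part of the patch; MB/ME use the PowerPC big-endian bit numbering, bit 0 = MSB):

// Illustrative sketch: rlwinm semantics and the slwi/srwi mappings above.
#include <cassert>
#include <cstdint>
static uint32_t rotl32(uint32_t v, unsigned n) {
  n &= 31;
  return n ? (v << n) | (v >> (32 - n)) : v;
}
static uint32_t mask32(unsigned mb, unsigned me) { // bits mb..me, 0 = MSB
  uint32_t m = 0;
  for (unsigned i = mb; i <= me; ++i)
    m |= 0x80000000u >> i;
  return m;
}
int main() {
  uint32_t s = 0xDEADBEEF;
  unsigned n = 5;
  assert((rotl32(s, n) & mask32(0, 31 - n)) == (s << n));  // slwi
  assert((rotl32(s, 32 - n) & mask32(n, 31)) == (s >> n)); // srwi
  return 0;
}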
+
+bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Inst;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+  case Match_Success:
+    // Post-process instructions (typically extended mnemonics)
+    ProcessInstruction(Inst, Operands);
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, getSTI());
+    return false;
+  case Match_MissingFeature:
+    return Error(IDLoc, "instruction use requires an option to be enabled");
+  case Match_MnemonicFail:
+    return Error(IDLoc, "unrecognized instruction mnemonic");
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((PPCOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool PPCAsmParser::
+MatchRegisterName(const AsmToken &Tok, unsigned &RegNo, int64_t &IntVal) {
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef Name = Tok.getString();
+
+    if (Name.equals_lower("lr")) {
+      RegNo = isPPC64()? PPC::LR8 : PPC::LR;
+      IntVal = 8;
+      return false;
+    } else if (Name.equals_lower("ctr")) {
+      RegNo = isPPC64()? PPC::CTR8 : PPC::CTR;
+      IntVal = 9;
+      return false;
+    } else if (Name.equals_lower("vrsave")) {
+      RegNo = PPC::VRSAVE;
+      IntVal = 256;
+      return false;
+    } else if (Name.startswith_lower("r") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("f") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = FRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("vs") &&
+               !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) {
+      RegNo = VSRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("v") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = VRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("q") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = QFRegs[IntVal];
+      return false;
+    } else if (Name.startswith_lower("cr") &&
+               !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
+      RegNo = CRRegs[IntVal];
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool PPCAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  RegNo = 0;
+  int64_t IntVal;
+
+  if (!MatchRegisterName(Tok, RegNo, IntVal)) {
+    Parser.Lex(); // Eat identifier token.
+    return false;
+  }
+
+  return Error(StartLoc, "invalid register name");
+}
+
+/// Extract \code @l/@ha \endcode modifier from expression. Recursively scan
+/// the expression and check for VK_PPC_LO/HI/HA
+/// symbol variants. If all symbols with modifier use the same
+/// variant, return the corresponding PPCMCExpr::VariantKind,
+/// and a modified expression using the default symbol variant.
+/// Otherwise, return NULL.
+const MCExpr *PPCAsmParser::
+ExtractModifierFromExpr(const MCExpr *E,
+                        PPCMCExpr::VariantKind &Variant) {
+  MCContext &Context = getParser().getContext();
+  Variant = PPCMCExpr::VK_PPC_None;
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return nullptr;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_PPC_LO:
+      Variant = PPCMCExpr::VK_PPC_LO;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HI:
+      Variant = PPCMCExpr::VK_PPC_HI;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HA:
+      Variant = PPCMCExpr::VK_PPC_HA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHER:
+      Variant = PPCMCExpr::VK_PPC_HIGHER;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHERA:
+      Variant = PPCMCExpr::VK_PPC_HIGHERA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHEST:
+      Variant = PPCMCExpr::VK_PPC_HIGHEST;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+      Variant = PPCMCExpr::VK_PPC_HIGHESTA;
+      break;
+    default:
+      return nullptr;
+    }
+
+    return MCSymbolRefExpr::create(&SRE->getSymbol(), Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant);
+    if (!Sub)
+      return nullptr;
+    return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    PPCMCExpr::VariantKind LHSVariant, RHSVariant;
+    const MCExpr *LHS = ExtractModifierFromExpr(BE->getLHS(), LHSVariant);
+    const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant);
+
+    if (!LHS && !RHS)
+      return nullptr;
+
+    if (!LHS) LHS = BE->getLHS();
+    if (!RHS) RHS = BE->getRHS();
+
+    if (LHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = RHSVariant;
+    else if (RHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = LHSVariant;
+    else if (LHSVariant == RHSVariant)
+      Variant = LHSVariant;
+    else
+      return nullptr;
+
+    return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// Find all VK_TLSGD/VK_TLSLD symbol references in expression and replace
+/// them by VK_PPC_TLSGD/VK_PPC_TLSLD. This is necessary to avoid having
+/// _GLOBAL_OFFSET_TABLE_ created via ELFObjectWriter::RelocNeedsGOT.
+/// FIXME: This is a hack.
+const MCExpr *PPCAsmParser::
+FixupVariantKind(const MCExpr *E) {
+  MCContext &Context = getParser().getContext();
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return E;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_TLSGD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSGD;
+      break;
+    case MCSymbolRefExpr::VK_TLSLD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSLD;
+      break;
+    default:
+      return E;
+    }
+    return MCSymbolRefExpr::create(&SRE->getSymbol(), Variant, Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = FixupVariantKind(UE->getSubExpr());
+    if (Sub == UE->getSubExpr())
+      return E;
+    return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    const MCExpr *LHS = FixupVariantKind(BE->getLHS());
+    const MCExpr *RHS = FixupVariantKind(BE->getRHS());
+    if (LHS == BE->getLHS() && RHS == BE->getRHS())
+      return E;
+    return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// ParseExpression. This differs from the default "parseExpression" in that
+/// it handles modifiers.
+bool PPCAsmParser::
+ParseExpression(const MCExpr *&EVal) {
+
+  if (isDarwin())
+    return ParseDarwinExpression(EVal);
+
+  // (ELF Platforms)
+  // Handle \code @l/@ha \endcode
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  EVal = FixupVariantKind(EVal);
+
+  PPCMCExpr::VariantKind Variant;
+  const MCExpr *E = ExtractModifierFromExpr(EVal, Variant);
+  if (E)
+    EVal = PPCMCExpr::create(Variant, E, false, getParser().getContext());
+
+  return false;
+}
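ExtractModifierFromExpr() and ParseExpression() exist so that operands like "sym@ha" and "sym@l" survive parsing as PPCMCExpr nodes. The @ha half is "high-adjusted": because the @l half is sign-extended by addi, @ha rounds the high half up whenever the low half is negative, so a lis/addi pair reconstructs the full 32-bit value. A sketch of that identity (illustrative, not part of the patch):

// Illustrative sketch: why @ha is "high-adjusted" rather than plain high.
#include <cassert>
#include <cstdint>
static uint16_t ha16(uint32_t x) { return (uint16_t)((x + 0x8000) >> 16); }
static int16_t lo16(uint32_t x) { return (int16_t)x; }
int main() {
  uint32_t addr = 0x12348765; // low half is negative as a signed 16-bit value
  uint32_t rebuilt = ((uint32_t)ha16(addr) << 16) + (uint32_t)(int32_t)lo16(addr);
  assert(rebuilt == addr);    // lis rT,addr@ha ; addi rT,rT,addr@l
  return 0;
}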
+
+/// ParseDarwinExpression. (MachO Platforms)
+/// This differs from the default "parseExpression" in that it handles detection
+/// of the \code hi16(), ha16() and lo16() \endcode modifiers. At present,
+/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO
+/// syntax form so it is done here. TODO: Determine if there is merit in arranging
+/// for this to be done at a higher level.
+bool PPCAsmParser::
+ParseDarwinExpression(const MCExpr *&EVal) {
+  MCAsmParser &Parser = getParser();
+  PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None;
+  switch (getLexer().getKind()) {
+  default:
+    break;
+  case AsmToken::Identifier:
+    // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus
+    // something starting with any other char should be part of the
+    // asm syntax. If handwritten asm includes an identifier like lo16,
+    // then all bets are off - but no-one would do that, right?
+    StringRef poss = Parser.getTok().getString();
+    if (poss.equals_lower("lo16")) {
+      Variant = PPCMCExpr::VK_PPC_LO;
+    } else if (poss.equals_lower("hi16")) {
+      Variant = PPCMCExpr::VK_PPC_HI;
+    } else if (poss.equals_lower("ha16")) {
+      Variant = PPCMCExpr::VK_PPC_HA;
+    }
+    if (Variant != PPCMCExpr::VK_PPC_None) {
+      Parser.Lex(); // Eat the xx16
+      if (getLexer().isNot(AsmToken::LParen))
+        return Error(Parser.getTok().getLoc(), "expected '('");
+      Parser.Lex(); // Eat the '('
+    }
+    break;
+  }
+
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  if (Variant != PPCMCExpr::VK_PPC_None) {
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "expected ')'");
+    Parser.Lex(); // Eat the ')'
+    EVal = PPCMCExpr::create(Variant, EVal, false, getParser().getContext());
+  }
+  return false;
+}
+
+/// ParseOperand
+/// This handles registers in the form 'NN', '%rNN' for ELF platforms and
+/// rNN for MachO.
+bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *EVal;
+
+  // Attempt to parse the next token as an immediate
+  switch (getLexer().getKind()) {
+  // Special handling for register names. These are interpreted
+  // as immediates corresponding to the register number.
+  case AsmToken::Percent:
+    Parser.Lex(); // Eat the '%'.
+    unsigned RegNo;
+    int64_t IntVal;
+    if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+      Parser.Lex(); // Eat the identifier token.
+      Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+      return false;
+    }
+    return Error(S, "invalid register name");
+
+  case AsmToken::Identifier:
+    // Note that non-register-name identifiers from the compiler will begin
+    // with '_', 'L'/'l' or '"'. Of course, handwritten asm could include
+    // identifiers like r31foo - so we fall through in the event that parsing
+    // a register name fails.
+    if (isDarwin()) {
+      unsigned RegNo;
+      int64_t IntVal;
+      if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+        Parser.Lex(); // Eat the identifier token.
+        Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+        return false;
+      }
+    }
+    // Fall-through to process non-register-name identifiers as expression.
+  // All other expressions
+  case AsmToken::LParen:
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot:
+  case AsmToken::Dollar:
+  case AsmToken::Exclaim:
+  case AsmToken::Tilde:
+    if (!ParseExpression(EVal))
+      break;
+    /* fall through */
+  default:
+    return Error(S, "unknown operand");
+  }
+
+  // Push the parsed operand into the list of operands
+  Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64()));
+
+  // Check whether this is a TLS call expression
+  bool TLSCall = false;
+  if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(EVal))
+    TLSCall = Ref->getSymbol().getName() == "__tls_get_addr";
+
+  if (TLSCall && getLexer().is(AsmToken::LParen)) {
+    const MCExpr *TLSSym;
+
+    Parser.Lex(); // Eat the '('.
+    S = Parser.getTok().getLoc();
+    if (ParseExpression(TLSSym))
+      return Error(S, "invalid TLS call expression");
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "missing ')'");
+    E = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the ')'.
+
+    Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64()));
+  }
+
+  // Otherwise, check for D-form memory operands
+  if (!TLSCall && getLexer().is(AsmToken::LParen)) {
+    Parser.Lex(); // Eat the '('.
+    S = Parser.getTok().getLoc();
+
+    int64_t IntVal;
+    switch (getLexer().getKind()) {
+    case AsmToken::Percent:
+      Parser.Lex(); // Eat the '%'.
+      unsigned RegNo;
+      if (MatchRegisterName(Parser.getTok(), RegNo, IntVal))
+        return Error(S, "invalid register name");
+      Parser.Lex(); // Eat the identifier token.
+      break;
+
+    case AsmToken::Integer:
+      if (!isDarwin()) {
+        if (getParser().parseAbsoluteExpression(IntVal) ||
+            IntVal < 0 || IntVal > 31)
+          return Error(S, "invalid register number");
+      } else {
+        return Error(S, "unexpected integer value");
+      }
+      break;
+
+    case AsmToken::Identifier:
+      if (isDarwin()) {
+        unsigned RegNo;
+        if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+          Parser.Lex(); // Eat the identifier token.
+          break;
+        }
+      }
+      // Fall-through..
+
+    default:
+      return Error(S, "invalid memory operand");
+    }
+
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "missing ')'");
+    E = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the ')'.
+
+    Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+  }
+
+  return false;
+}
+
+/// Parse an instruction mnemonic followed by its operands.
+bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  // The first operand is the token for the instruction name.
+  // If the next character is a '+' or '-', we need to add it to the
+  // instruction name, to match what TableGen is doing.
+  std::string NewOpcode;
+  if (getLexer().is(AsmToken::Plus)) {
+    getLexer().Lex();
+    NewOpcode = Name;
+    NewOpcode += '+';
+    Name = NewOpcode;
+  }
+  if (getLexer().is(AsmToken::Minus)) {
+    getLexer().Lex();
+    NewOpcode = Name;
+    NewOpcode += '-';
+    Name = NewOpcode;
+  }
+  // If the instruction ends in a '.', we need to create a separate
+  // token for it, to match what TableGen is doing.
+  size_t Dot = Name.find('.');
+  StringRef Mnemonic = Name.slice(0, Dot);
+  if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+    Operands.push_back(
+        PPCOperand::CreateTokenWithStringCopy(Mnemonic, NameLoc, isPPC64()));
+  else
+    Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64()));
+  if (Dot != StringRef::npos) {
+    SMLoc DotLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Dot);
+    StringRef DotStr = Name.slice(Dot, StringRef::npos);
+    if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+      Operands.push_back(
+          PPCOperand::CreateTokenWithStringCopy(DotStr, DotLoc, isPPC64()));
+    else
+      Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64()));
+  }
+
+  // If there are no more operands then finish
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return false;
+
+  // Parse the first operand
+  if (ParseOperand(Operands))
+    return true;
+
+  while (getLexer().isNot(AsmToken::EndOfStatement) &&
+         getLexer().is(AsmToken::Comma)) {
+    // Consume the comma token
+    getLexer().Lex();
+
+    // Parse the next operand
+    if (ParseOperand(Operands))
+      return true;
+  }
+
+  // We'll now deal with an unfortunate special case: the syntax for the dcbt
+  // and dcbtst instructions differs for server vs. embedded cores.
+  // The syntax for dcbt is:
+  //    dcbt ra, rb, th [server]
+  //    dcbt th, ra, rb [embedded]
+  // where th can be omitted when it is 0. dcbtst is the same. We take the
+  // server form to be the default, so swap the operands if we're parsing for
+  // an embedded core (they'll be swapped again upon printing).
+  if (getSTI().getFeatureBits()[PPC::FeatureBookE] &&
+      Operands.size() == 4 &&
+      (Name == "dcbt" || Name == "dcbtst")) {
+    std::swap(Operands[1], Operands[3]);
+    std::swap(Operands[2], Operands[1]);
+  }
+
+  return false;
+}
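The swap above canonicalizes embedded-form operands into server order before matching: parsed as [mnemonic, TH, RA, RB], the two swaps yield [mnemonic, RA, RB, TH]. A minimal sketch of the reordering (illustrative, not part of the patch):

// Illustrative sketch of the operand reordering for BookE dcbt/dcbtst.
#include <cassert>
#include <string>
#include <utility>
#include <vector>
int main() {
  std::vector<std::string> Ops = {"dcbt", "th", "ra", "rb"}; // embedded order
  std::swap(Ops[1], Ops[3]); // {"dcbt", "rb", "ra", "th"}
  std::swap(Ops[2], Ops[1]); // {"dcbt", "ra", "rb", "th"} -- server order
  assert((Ops == std::vector<std::string>{"dcbt", "ra", "rb", "th"}));
  return 0;
}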
+
+/// ParseDirective parses the PPC specific directives
+bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (!isDarwin()) {
+    if (IDVal == ".word")
+      return ParseDirectiveWord(2, DirectiveID.getLoc());
+    if (IDVal == ".llong")
+      return ParseDirectiveWord(8, DirectiveID.getLoc());
+    if (IDVal == ".tc")
+      return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc());
+    if (IDVal == ".machine")
+      return ParseDirectiveMachine(DirectiveID.getLoc());
+    if (IDVal == ".abiversion")
+      return ParseDirectiveAbiVersion(DirectiveID.getLoc());
+    if (IDVal == ".localentry")
+      return ParseDirectiveLocalEntry(DirectiveID.getLoc());
+  } else {
+    if (IDVal == ".machine")
+      return ParseDarwinDirectiveMachine(DirectiveID.getLoc());
+  }
+  return true;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      SMLoc ExprLoc = getLexer().getLoc();
+      if (getParser().parseExpression(Value))
+        return false;
+
+      if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
+        assert(Size <= 8 && "Invalid size");
+        uint64_t IntValue = MCE->getValue();
+        if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+          return Error(ExprLoc, "literal value out of range for directive");
+        getStreamer().EmitIntValue(IntValue, Size);
+      } else {
+        getStreamer().EmitValue(Value, Size, ExprLoc);
+      }
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// ParseDirectiveTC
+///  ::= .tc [ symbol (, expression)* ]
+bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  // Skip TC symbol, which is only used with XCOFF.
+  while (getLexer().isNot(AsmToken::EndOfStatement)
+         && getLexer().isNot(AsmToken::Comma))
+    Parser.Lex();
+  if (getLexer().isNot(AsmToken::Comma)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  // Align to word size.
+  getParser().getStreamer().EmitValueToAlignment(Size);
+
+  // Emit expressions.
+  return ParseDirectiveWord(Size, L);
+}
+
+/// ParseDirectiveMachine (ELF platforms)
+///  ::= .machine [ cpu | "push" | "pop" ]
+bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::Identifier) &&
+      getLexer().isNot(AsmToken::String)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+  Parser.Lex();
+
+  // FIXME: Right now, the parser always allows any available
+  // instruction, so the .machine directive is not useful.
+  // Implement ".machine any" (by doing nothing) for the benefit
+  // of existing assembler code. Likewise, we can then implement
+  // ".machine push" and ".machine pop" as no-op.
+  if (CPU != "any" && CPU != "push" && CPU != "pop") {
+    Error(L, "unrecognized machine type");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitMachine(CPU);
+
+  return false;
+}
+
+/// ParseDarwinDirectiveMachine (Mach-o platforms)
+///  ::= .machine cpu-identifier
+bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::Identifier) &&
+      getLexer().isNot(AsmToken::String)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+  Parser.Lex();
+
+  // FIXME: this is only the 'default' set of cpu variants.
+  // However we don't act on this information at present, this is simply
+  // allowing parsing to proceed with minimal sanity checking.
+  if (CPU != "ppc7400" && CPU != "ppc" && CPU != "ppc64") {
+    Error(L, "unrecognized cpu type");
+    return false;
+  }
+
+  if (isPPC64() && (CPU == "ppc7400" || CPU == "ppc")) {
+    Error(L, "wrong cpu type specified for 64bit");
+    return false;
+  }
+  if (!isPPC64() && CPU == "ppc64") {
+    Error(L, "wrong cpu type specified for 32bit");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveAbiVersion
+///  ::= .abiversion constant-expression
+bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) {
+  int64_t AbiVersion;
+  if (getParser().parseAbsoluteExpression(AbiVersion)){
+    Error(L, "expected constant expression");
+    return false;
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitAbiVersion(AbiVersion);
+
+  return false;
+}
+
+/// ParseDirectiveLocalEntry
+///  ::= .localentry symbol, expression
+bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name)) {
+    Error(L, "expected identifier in directive");
+    return false;
+  }
+  MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name));
+
+  if (getLexer().isNot(AsmToken::Comma)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Lex();
+
+  const MCExpr *Expr;
+  if (getParser().parseExpression(Expr)) {
+    Error(L, "expected expression");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitLocalEntry(Sym, Expr);
+
+  return false;
+}
+
+
+
+/// Force static initialization.
+extern "C" void LLVMInitializePowerPCAsmParser() {
+  RegisterMCAsmParser<PPCAsmParser> A(ThePPC32Target);
+  RegisterMCAsmParser<PPCAsmParser> B(ThePPC64Target);
+  RegisterMCAsmParser<PPCAsmParser> C(ThePPC64LETarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "PPCGenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + // If the kind is a token for a literal immediate, check if our asm + // operand matches. This is for InstAliases which have a fixed-value + // immediate in the syntax. + int64_t ImmVal; + switch (Kind) { + case MCK_0: ImmVal = 0; break; + case MCK_1: ImmVal = 1; break; + case MCK_2: ImmVal = 2; break; + case MCK_3: ImmVal = 3; break; + case MCK_4: ImmVal = 4; break; + case MCK_5: ImmVal = 5; break; + case MCK_6: ImmVal = 6; break; + case MCK_7: ImmVal = 7; break; + default: return Match_InvalidOperand; + } + + PPCOperand &Op = static_cast<PPCOperand &>(AsmOp); + if (Op.isImm() && Op.getImm() == ImmVal) + return Match_Success; + + return Match_InvalidOperand; +} + +const MCExpr * +PPCAsmParser::applyModifierToExpr(const MCExpr *E, + MCSymbolRefExpr::VariantKind Variant, + MCContext &Ctx) { + switch (Variant) { + case MCSymbolRefExpr::VK_PPC_LO: + return PPCMCExpr::create(PPCMCExpr::VK_PPC_LO, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HI: + return PPCMCExpr::create(PPCMCExpr::VK_PPC_HI, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HA: + return PPCMCExpr::create(PPCMCExpr::VK_PPC_HA, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHER: + return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHERA: + return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHEST: + return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHESTA: + return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx); + default: + return nullptr; + } +} diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp new file mode 100644 index 0000000..1fc84fb --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -0,0 +1,408 @@ +//===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-disassembler" + +typedef MCDisassembler::DecodeStatus DecodeStatus; + +namespace { +class PPCDisassembler : public MCDisassembler { + bool IsLittleEndian; + +public: + PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + bool IsLittleEndian) + : MCDisassembler(STI, Ctx), IsLittleEndian(IsLittleEndian) {} + + DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, + raw_ostream &CStream) const override; +}; +} // end anonymous namespace + +static MCDisassembler *createPPCDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/false); +} + +static MCDisassembler *createPPCLEDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true); +} + +extern "C" void LLVMInitializePowerPCDisassembler() { + // Register the disassembler for each target. + TargetRegistry::RegisterMCDisassembler(ThePPC32Target, + createPPCDisassembler); + TargetRegistry::RegisterMCDisassembler(ThePPC64Target, + createPPCDisassembler); + TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget, + createPPCLEDisassembler); +} + +// FIXME: These can be generated by TableGen from the existing register +// encoding values! + +static const unsigned CRRegs[] = { + PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3, + PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7 +}; + +static const unsigned CRBITRegs[] = { + PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN, + PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN, + PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN, + PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN, + PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN, + PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN, + PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN, + PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN +}; + +static const unsigned FRegs[] = { + PPC::F0, PPC::F1, PPC::F2, PPC::F3, + PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, + PPC::F12, PPC::F13, PPC::F14, PPC::F15, + PPC::F16, PPC::F17, PPC::F18, PPC::F19, + PPC::F20, PPC::F21, PPC::F22, PPC::F23, + PPC::F24, PPC::F25, PPC::F26, PPC::F27, + PPC::F28, PPC::F29, PPC::F30, PPC::F31 +}; + +static const unsigned VRegs[] = { + PPC::V0, PPC::V1, PPC::V2, PPC::V3, + PPC::V4, PPC::V5, PPC::V6, PPC::V7, + PPC::V8, PPC::V9, PPC::V10, PPC::V11, + PPC::V12, PPC::V13, PPC::V14, PPC::V15, + PPC::V16, PPC::V17, PPC::V18, PPC::V19, + PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, + PPC::V28, PPC::V29, PPC::V30, PPC::V31 +}; + +static const unsigned VSRegs[] = { + PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3, + PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7, + PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11, + PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15, + PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19, + PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23, + PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27, + PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31, + + PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3, + PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, + 
PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11, + PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15, + PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19, + PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23, + PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27, + PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31 +}; + +static const unsigned VSFRegs[] = { + PPC::F0, PPC::F1, PPC::F2, PPC::F3, + PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, + PPC::F12, PPC::F13, PPC::F14, PPC::F15, + PPC::F16, PPC::F17, PPC::F18, PPC::F19, + PPC::F20, PPC::F21, PPC::F22, PPC::F23, + PPC::F24, PPC::F25, PPC::F26, PPC::F27, + PPC::F28, PPC::F29, PPC::F30, PPC::F31, + + PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3, + PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7, + PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11, + PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15, + PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19, + PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23, + PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27, + PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31 +}; + +static const unsigned VSSRegs[] = { + PPC::F0, PPC::F1, PPC::F2, PPC::F3, + PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, + PPC::F12, PPC::F13, PPC::F14, PPC::F15, + PPC::F16, PPC::F17, PPC::F18, PPC::F19, + PPC::F20, PPC::F21, PPC::F22, PPC::F23, + PPC::F24, PPC::F25, PPC::F26, PPC::F27, + PPC::F28, PPC::F29, PPC::F30, PPC::F31, + + PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3, + PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7, + PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11, + PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15, + PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19, + PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23, + PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27, + PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31 +}; + +static const unsigned GPRegs[] = { + PPC::R0, PPC::R1, PPC::R2, PPC::R3, + PPC::R4, PPC::R5, PPC::R6, PPC::R7, + PPC::R8, PPC::R9, PPC::R10, PPC::R11, + PPC::R12, PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + PPC::R28, PPC::R29, PPC::R30, PPC::R31 +}; + +static const unsigned GP0Regs[] = { + PPC::ZERO, PPC::R1, PPC::R2, PPC::R3, + PPC::R4, PPC::R5, PPC::R6, PPC::R7, + PPC::R8, PPC::R9, PPC::R10, PPC::R11, + PPC::R12, PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + PPC::R28, PPC::R29, PPC::R30, PPC::R31 +}; + +static const unsigned G8Regs[] = { + PPC::X0, PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; + +static const unsigned QFRegs[] = { + PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3, + PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, + PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, + PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15, + PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19, + PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23, + PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27, + PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31 +}; + +template <std::size_t N> +static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo, + const unsigned (&Regs)[N]) { + assert(RegNo < N && "Invalid register number"); + Inst.addOperand(MCOperand::createReg(Regs[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus 
DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, CRRegs); +} + +static DecodeStatus DecodeCRRC0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, CRRegs); +} + +static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, CRBITRegs); +} + +static DecodeStatus DecodeF4RCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, FRegs); +} + +static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, FRegs); +} + +static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VRegs); +} + +static DecodeStatus DecodeVSRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VSRegs); +} + +static DecodeStatus DecodeVSFRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VSFRegs); +} + +static DecodeStatus DecodeVSSRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VSSRegs); +} + +static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, GPRegs); +} + +static DecodeStatus DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, GP0Regs); +} + +static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, G8Regs); +} + +#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass +#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass + +static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, QFRegs); +} + +#define DecodeQSRCRegisterClass DecodeQFRCRegisterClass +#define DecodeQBRCRegisterClass DecodeQFRCRegisterClass + +template<unsigned N> +static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + assert(isUInt<N>(Imm) && "Invalid immediate"); + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} + +template<unsigned N> +static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + assert(isUInt<N>(Imm) && "Invalid immediate"); + Inst.addOperand(MCOperand::createImm(SignExtend64<N>(Imm))); + return MCDisassembler::Success; +} + +static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + // Decode the memri field (imm, reg), which has the low 16-bits as the + // displacement and the next 5 bits as the register #. 
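+  // For example, a (hypothetical) encoded value Imm = (3 << 16) | 0xFFFC
+  // decodes as base register 3 with a displacement of -4, since 0xFFFC is
+  // sign-extended below as a 16-bit value.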
+
+  uint64_t Base = Imm >> 16;
+  uint64_t Disp = Imm & 0xFFFF;
+
+  assert(Base < 32 && "Invalid base register");
+
+  switch (Inst.getOpcode()) {
+  default: break;
+  case PPC::LBZU:
+  case PPC::LHAU:
+  case PPC::LHZU:
+  case PPC::LWZU:
+  case PPC::LFSU:
+  case PPC::LFDU:
+    // Add the tied output operand.
+    Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+    break;
+  case PPC::STBU:
+  case PPC::STHU:
+  case PPC::STWU:
+  case PPC::STFSU:
+  case PPC::STFDU:
+    Inst.insert(Inst.begin(), MCOperand::createReg(GP0Regs[Base]));
+    break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(SignExtend64<16>(Disp)));
+  Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm,
+                                         int64_t Address, const void *Decoder) {
+  // Decode the memrix field (imm, reg), which has the low 14 bits as the
+  // displacement and the next 5 bits as the register #.
+
+  uint64_t Base = Imm >> 14;
+  uint64_t Disp = Imm & 0x3FFF;
+
+  assert(Base < 32 && "Invalid base register");
+
+  if (Inst.getOpcode() == PPC::LDU)
+    // Add the tied output operand.
+    Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+  else if (Inst.getOpcode() == PPC::STDU)
+    Inst.insert(Inst.begin(), MCOperand::createReg(GP0Regs[Base]));
+
+  Inst.addOperand(MCOperand::createImm(SignExtend64<16>(Disp << 2)));
+  Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address, const void *Decoder) {
+  // The cr bit encoding is 0x80 >> cr_reg_num.
+
+  unsigned Zeros = countTrailingZeros(Imm);
+  assert(Zeros < 8 && "Invalid CR bit value");
+
+  Inst.addOperand(MCOperand::createReg(CRRegs[7 - Zeros]));
+  return MCDisassembler::Success;
+}
+
+#include "PPCGenDisassemblerTables.inc"
+
+DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address, raw_ostream &OS,
+                                             raw_ostream &CS) const {
+  // Get the four bytes of the instruction.
+  Size = 4;
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Read the instruction in the proper endianness.
+  uint32_t Inst = IsLittleEndian ? support::endian::read32le(Bytes.data())
+                                 : support::endian::read32be(Bytes.data());
+
+  if (STI.getFeatureBits()[PPC::FeatureQPX]) {
+    DecodeStatus result =
+      decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
+    if (result != MCDisassembler::Fail)
+      return result;
+  }
+
+  return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp new file mode 100644 index 0000000..5e1d227 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -0,0 +1,450 @@ +//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PPC MCInst to a .s file.
+// +//===----------------------------------------------------------------------===// + +#include "PPCInstPrinter.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOpcodes.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// FIXME: Once the integrated assembler supports full register names, tie this +// to the verbose-asm setting. +static cl::opt<bool> +FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false), + cl::desc("Use full register names when printing assembly")); + +#define PRINT_ALIAS_INSTR +#include "PPCGenAsmWriter.inc" + +void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + const char *RegName = getRegisterName(RegNo); + if (RegName[0] == 'q' /* QPX */) { + // The system toolchain on the BG/Q does not understand QPX register names + // in .cfi_* directives, so print the name of the floating-point + // subregister instead. + std::string RN(RegName); + + RN[0] = 'f'; + OS << RN; + + return; + } + + OS << RegName; +} + +void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + // Check for slwi/srwi mnemonics. + if (MI->getOpcode() == PPC::RLWINM) { + unsigned char SH = MI->getOperand(2).getImm(); + unsigned char MB = MI->getOperand(3).getImm(); + unsigned char ME = MI->getOperand(4).getImm(); + bool useSubstituteMnemonic = false; + if (SH <= 31 && MB == 0 && ME == (31-SH)) { + O << "\tslwi "; useSubstituteMnemonic = true; + } + if (SH <= 31 && MB == (32-SH) && ME == 31) { + O << "\tsrwi "; useSubstituteMnemonic = true; + SH = 32-SH; + } + if (useSubstituteMnemonic) { + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + O << ", " << (unsigned int)SH; + + printAnnotation(O, Annot); + return; + } + } + + if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) && + MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + O << "\tmr "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + printAnnotation(O, Annot); + return; + } + + if (MI->getOpcode() == PPC::RLDICR) { + unsigned char SH = MI->getOperand(2).getImm(); + unsigned char ME = MI->getOperand(3).getImm(); + // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH + if (63-SH == ME) { + O << "\tsldi "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + O << ", " << (unsigned int)SH; + printAnnotation(O, Annot); + return; + } + } + + // dcbt[st] is printed manually here because: + // 1. The assembly syntax is different between embedded and server targets + // 2. We must print the short mnemonics for TH == 0 because the + // embedded/server syntax default will not be stable across assemblers + // The syntax for dcbt is: + // dcbt ra, rb, th [server] + // dcbt th, ra, rb [embedded] + // where th can be omitted when it is 0. dcbtst is the same. 
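+  // For example, with TH == 0 both variants print "dcbt ra, rb"; with a
+  // (hypothetical) TH == 8, the server form prints "dcbt ra, rb, 8" while
+  // the embedded (Book E) form prints "dcbt 8, ra, rb".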
+ if (MI->getOpcode() == PPC::DCBT || MI->getOpcode() == PPC::DCBTST) { + unsigned char TH = MI->getOperand(0).getImm(); + O << "\tdcbt"; + if (MI->getOpcode() == PPC::DCBTST) + O << "st"; + if (TH == 16) + O << "t"; + O << " "; + + bool IsBookE = STI.getFeatureBits()[PPC::FeatureBookE]; + if (IsBookE && TH != 0 && TH != 16) + O << (unsigned int) TH << ", "; + + printOperand(MI, 1, O); + O << ", "; + printOperand(MI, 2, O); + + if (!IsBookE && TH != 0 && TH != 16) + O << ", " << (unsigned int) TH; + + printAnnotation(O, Annot); + return; + } + + // For fast-isel, a COPY_TO_REGCLASS may survive this long. This is + // used when converting a 32-bit float to a 64-bit float as part of + // conversion to an integer (see PPCFastISel.cpp:SelectFPToI()), + // as otherwise we have problems with incorrect register classes + // in machine instruction verification. For now, just avoid trying + // to print it as such an instruction has no effect (a 32-bit float + // in a register is already in 64-bit form, just with lower + // precision). FIXME: Is there a better solution? + if (MI->getOpcode() == TargetOpcode::COPY_TO_REGCLASS) + return; + + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); + printAnnotation(O, Annot); +} + + +void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, + const char *Modifier) { + unsigned Code = MI->getOperand(OpNo).getImm(); + + if (StringRef(Modifier) == "cc") { + switch ((PPC::Predicate)Code) { + case PPC::PRED_LT_MINUS: + case PPC::PRED_LT_PLUS: + case PPC::PRED_LT: + O << "lt"; + return; + case PPC::PRED_LE_MINUS: + case PPC::PRED_LE_PLUS: + case PPC::PRED_LE: + O << "le"; + return; + case PPC::PRED_EQ_MINUS: + case PPC::PRED_EQ_PLUS: + case PPC::PRED_EQ: + O << "eq"; + return; + case PPC::PRED_GE_MINUS: + case PPC::PRED_GE_PLUS: + case PPC::PRED_GE: + O << "ge"; + return; + case PPC::PRED_GT_MINUS: + case PPC::PRED_GT_PLUS: + case PPC::PRED_GT: + O << "gt"; + return; + case PPC::PRED_NE_MINUS: + case PPC::PRED_NE_PLUS: + case PPC::PRED_NE: + O << "ne"; + return; + case PPC::PRED_UN_MINUS: + case PPC::PRED_UN_PLUS: + case PPC::PRED_UN: + O << "un"; + return; + case PPC::PRED_NU_MINUS: + case PPC::PRED_NU_PLUS: + case PPC::PRED_NU: + O << "nu"; + return; + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Invalid predicate code"); + } + + if (StringRef(Modifier) == "pm") { + switch ((PPC::Predicate)Code) { + case PPC::PRED_LT: + case PPC::PRED_LE: + case PPC::PRED_EQ: + case PPC::PRED_GE: + case PPC::PRED_GT: + case PPC::PRED_NE: + case PPC::PRED_UN: + case PPC::PRED_NU: + return; + case PPC::PRED_LT_MINUS: + case PPC::PRED_LE_MINUS: + case PPC::PRED_EQ_MINUS: + case PPC::PRED_GE_MINUS: + case PPC::PRED_GT_MINUS: + case PPC::PRED_NE_MINUS: + case PPC::PRED_UN_MINUS: + case PPC::PRED_NU_MINUS: + O << "-"; + return; + case PPC::PRED_LT_PLUS: + case PPC::PRED_LE_PLUS: + case PPC::PRED_EQ_PLUS: + case PPC::PRED_GE_PLUS: + case PPC::PRED_GT_PLUS: + case PPC::PRED_NE_PLUS: + case PPC::PRED_UN_PLUS: + case PPC::PRED_NU_PLUS: + O << "+"; + return; + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Invalid predicate code"); + } + + assert(StringRef(Modifier) == "reg" && + "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!"); + printOperand(MI, OpNo+1, O); +} + +void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + 
unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 1 && "Invalid u1imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 3 && "Invalid u2imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 8 && "Invalid u3imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 15 && "Invalid u4imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Value = MI->getOperand(OpNo).getImm(); + Value = SignExtend32<5>(Value); + O << (int)Value; +} + +void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 31 && "Invalid u5imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 63 && "Invalid u6imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned short Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 1023 && "Invalid u10imm argument!"); + O << (unsigned short)Value; +} + +void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned short Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 4095 && "Invalid u12imm argument!"); + O << (unsigned short)Value; +} + +void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) + O << (short)MI->getOperand(OpNo).getImm(); + else + printOperand(MI, OpNo, O); +} + +void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) + O << (unsigned short)MI->getOperand(OpNo).getImm(); + else + printOperand(MI, OpNo, O); +} + +void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (!MI->getOperand(OpNo).isImm()) + return printOperand(MI, OpNo, O); + + // Branches can take an immediate operand. This is used by the branch + // selection pass to print .+8, an eight byte displacement from the PC. 
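+  // For example, an immediate operand of 2 prints as ".+8", since
+  // printAbsBranchOperand below scales the encoded word offset by 4.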
+  O << ".+";
+  printAbsBranchOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  O << SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
+}
+
+
+void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  unsigned CCReg = MI->getOperand(OpNo).getReg();
+  unsigned RegNo;
+  switch (CCReg) {
+  default: llvm_unreachable("Unknown CR register");
+  case PPC::CR0: RegNo = 0; break;
+  case PPC::CR1: RegNo = 1; break;
+  case PPC::CR2: RegNo = 2; break;
+  case PPC::CR3: RegNo = 3; break;
+  case PPC::CR4: RegNo = 4; break;
+  case PPC::CR5: RegNo = 5; break;
+  case PPC::CR6: RegNo = 6; break;
+  case PPC::CR7: RegNo = 7; break;
+  }
+  O << (0x80 >> RegNo);
+}
+
+void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printS16ImmOperand(MI, OpNo, O);
+  O << '(';
+  if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo+1, O);
+  O << ')';
+}
+
+void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  // When used as the base register, r0 reads constant zero rather than
+  // the value contained in the register. For this reason, the Darwin
+  // assembler requires that we print r0 as 0 (no r) when used as the base.
+  if (MI->getOperand(OpNo).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo, O);
+  O << ", ";
+  printOperand(MI, OpNo+1, O);
+}
+
+void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
+  // come at the _end_ of the expression.
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*Op.getExpr());
+  O << refExp.getSymbol().getName();
+  O << '(';
+  printOperand(MI, OpNo+1, O);
+  O << ')';
+  if (refExp.getKind() != MCSymbolRefExpr::VK_None)
+    O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind());
+}
+
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left. Used for the Linux asm
+/// syntax.
+static const char *stripRegisterPrefix(const char *RegName) {
+  if (FullRegNames)
+    return RegName;
+
+  switch (RegName[0]) {
+  case 'r':
+  case 'f':
+  case 'q': // for QPX
+  case 'v':
+    if (RegName[1] == 's')
+      return RegName + 2;
+    return RegName + 1;
+  case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+
+  return RegName;
+}
+
+void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    const char *RegName = getRegisterName(Op.getReg());
+    // The Linux and AIX assemblers do not accept register prefixes.
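+    // For example, "r3" is printed as "3" and "cr2" as "2" here, while the
+    // Darwin syntax keeps the full names.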
+    if (!isDarwinSyntax())
+      RegName = stripRegisterPrefix(RegName);
+
+    O << RegName;
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  Op.getExpr()->print(O, &MAI);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h new file mode 100644 index 0000000..53eb727 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -0,0 +1,71 @@ +//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+#define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class PPCInstPrinter : public MCInstPrinter {
+  bool IsDarwin;
+public:
+  PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI, bool isDarwin)
+    : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {}
+
+  bool isDarwinSyntax() const {
+    return IsDarwin;
+  }
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx,
+                               raw_ostream &OS);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                             raw_ostream &O, const char *Modifier = nullptr);
+
+  void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git
a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp new file mode 100644 index 0000000..b6dd595 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -0,0 +1,240 @@ +//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + case PPC::fixup_ppc_nofixup: + return Value; + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_brcond14abs: + return Value & 0xfffc; + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + return Value & 0x3fffffc; + case PPC::fixup_ppc_half16: + return Value & 0xffff; + case PPC::fixup_ppc_half16ds: + return Value & 0xfffc; + } +} + +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_1: + return 1; + case FK_Data_2: + case PPC::fixup_ppc_half16: + case PPC::fixup_ppc_half16ds: + return 2; + case FK_Data_4: + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_brcond14abs: + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + return 4; + case FK_Data_8: + return 8; + case PPC::fixup_ppc_nofixup: + return 0; + } +} + +namespace { + +class PPCAsmBackend : public MCAsmBackend { + const Target &TheTarget; + bool IsLittleEndian; +public: + PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T), + IsLittleEndian(isLittle) {} + + unsigned getNumFixupKinds() const override { + return PPC::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_ppc_br24", 6, 24, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_brcond14", 16, 14, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_br24abs", 6, 24, 0 }, + { "fixup_ppc_brcond14abs", 16, 14, 0 }, + { "fixup_ppc_half16", 0, 16, 0 }, + { "fixup_ppc_half16ds", 0, 14, 0 }, + { "fixup_ppc_nofixup", 0, 0, 0 } + }; + const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_ppc_br24", 2, 24, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_brcond14", 2, 14, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_br24abs", 2, 24, 0 }, + { "fixup_ppc_brcond14abs", 2, 14, 0 }, + { "fixup_ppc_half16", 0, 16, 0 }, + { "fixup_ppc_half16ds", 2, 14, 0 }, + { "fixup_ppc_nofixup", 0, 0, 0 } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + 
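+    // Target-specific kinds are numbered from FirstTargetFixupKind, so
+    // rebase the kind before indexing the endianness-specific tables above.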
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override { + Value = adjustFixupValue(Fixup.getKind(), Value); + if (!Value) return; // Doesn't change encoding. + + unsigned Offset = Fixup.getOffset(); + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + + // For each byte of the fragment that the fixup touches, mask in the bits + // from the fixup value. The Value has been "split up" into the appropriate + // bitfields above. + for (unsigned i = 0; i != NumBytes; ++i) { + unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i); + Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff); + } + } + + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override { + switch ((PPC::Fixups)Fixup.getKind()) { + default: break; + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + // If the target symbol has a local entry point we must not attempt + // to resolve the fixup directly. Emit a relocation and leave + // resolution of the final target address to the linker. + if (const MCSymbolRefExpr *A = Target.getSymA()) { + if (const auto *S = dyn_cast<MCSymbolELF>(&A->getSymbol())) { + // The "other" values are stored in the last 6 bits of the second + // byte. The traditional defines for STO values assume the full byte + // and thus the shift to pack it. + unsigned Other = S->getOther() << 2; + if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) + IsResolved = false; + } + } + break; + } + } + + bool mayNeedRelaxation(const MCInst &Inst) const override { + // FIXME. + return false; + } + + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + // FIXME. + llvm_unreachable("relaxInstruction() unimplemented"); + } + + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { + // FIXME. + llvm_unreachable("relaxInstruction() unimplemented"); + } + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { + uint64_t NumNops = Count / 4; + for (uint64_t i = 0; i != NumNops; ++i) + OW->write32(0x60000000); + + OW->WriteZeros(Count % 4); + + return true; + } + + unsigned getPointerSize() const { + StringRef Name = TheTarget.getName(); + if (Name == "ppc64" || Name == "ppc64le") return 8; + assert(Name == "ppc32" && "Unknown target name!"); + return 4; + } + + bool isLittleEndian() const { + return IsLittleEndian; + } +}; +} // end anonymous namespace + + +// FIXME: This should be in a separate file. +namespace { + class DarwinPPCAsmBackend : public PPCAsmBackend { + public: + DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + bool is64 = getPointerSize() == 8; + return createPPCMachObjectWriter( + OS, + /*Is64Bit=*/is64, + (is64 ? 
MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC), + MachO::CPU_SUBTYPE_POWERPC_ALL); + } + }; + + class ELFPPCAsmBackend : public PPCAsmBackend { + uint8_t OSABI; + public: + ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) : + PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + bool is64 = getPointerSize() == 8; + return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI); + } + }; + +} // end anonymous namespace + +MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + if (TT.isOSDarwin()) + return new DarwinPPCAsmBackend(T); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); + bool IsLittleEndian = TT.getArch() == Triple::ppc64le; + return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp new file mode 100644 index 0000000..dd99495 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -0,0 +1,425 @@ +//===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "MCTargetDesc/PPCMCExpr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { + class PPCELFObjectWriter : public MCELFObjectTargetWriter { + public: + PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI); + + protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + + bool needsRelocateWithSymbol(const MCSymbol &Sym, + unsigned Type) const override; + }; +} + +PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) + : MCELFObjectTargetWriter(Is64Bit, OSABI, + Is64Bit ? 
ELF::EM_PPC64 : ELF::EM_PPC, + /*HasRelocationAddend*/ true) {} + +static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target, + const MCFixup &Fixup) { + const MCExpr *Expr = Fixup.getValue(); + + if (Expr->getKind() != MCExpr::Target) + return Target.getAccessVariant(); + + switch (cast<PPCMCExpr>(Expr)->getKind()) { + case PPCMCExpr::VK_PPC_None: + return MCSymbolRefExpr::VK_None; + case PPCMCExpr::VK_PPC_LO: + return MCSymbolRefExpr::VK_PPC_LO; + case PPCMCExpr::VK_PPC_HI: + return MCSymbolRefExpr::VK_PPC_HI; + case PPCMCExpr::VK_PPC_HA: + return MCSymbolRefExpr::VK_PPC_HA; + case PPCMCExpr::VK_PPC_HIGHERA: + return MCSymbolRefExpr::VK_PPC_HIGHERA; + case PPCMCExpr::VK_PPC_HIGHER: + return MCSymbolRefExpr::VK_PPC_HIGHER; + case PPCMCExpr::VK_PPC_HIGHEST: + return MCSymbolRefExpr::VK_PPC_HIGHEST; + case PPCMCExpr::VK_PPC_HIGHESTA: + return MCSymbolRefExpr::VK_PPC_HIGHESTA; + } + llvm_unreachable("unknown PPCMCExpr kind"); +} + +unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup); + + // determine the type of the relocation + unsigned Type; + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + default: + llvm_unreachable("Unimplemented"); + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC_REL24; + break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_PPC_PLTREL24; + break; + case MCSymbolRefExpr::VK_PPC_LOCAL: + Type = ELF::R_PPC_LOCAL24PC; + break; + } + break; + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_brcond14abs: + Type = ELF::R_PPC_REL14; + break; + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC_REL16; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = ELF::R_PPC_REL16_LO; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = ELF::R_PPC_REL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_HA: + Type = ELF::R_PPC_REL16_HA; + break; + } + break; + case PPC::fixup_ppc_half16ds: + Target.print(errs()); + errs() << '\n'; + report_fatal_error("Invalid PC-relative half16ds relocation"); + case FK_Data_4: + case FK_PCRel_4: + Type = ELF::R_PPC_REL32; + break; + case FK_Data_8: + case FK_PCRel_8: + Type = ELF::R_PPC64_REL64; + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + case PPC::fixup_ppc_br24abs: + Type = ELF::R_PPC_ADDR24; + break; + case PPC::fixup_ppc_brcond14abs: + Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_ + break; + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC_ADDR16; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = ELF::R_PPC_ADDR16_LO; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = ELF::R_PPC_ADDR16_HI; + break; + case MCSymbolRefExpr::VK_PPC_HA: + Type = ELF::R_PPC_ADDR16_HA; + break; + case MCSymbolRefExpr::VK_PPC_HIGHER: + Type = ELF::R_PPC64_ADDR16_HIGHER; + break; + case MCSymbolRefExpr::VK_PPC_HIGHERA: + Type = ELF::R_PPC64_ADDR16_HIGHERA; + break; + case MCSymbolRefExpr::VK_PPC_HIGHEST: + Type = ELF::R_PPC64_ADDR16_HIGHEST; + break; + case MCSymbolRefExpr::VK_PPC_HIGHESTA: + Type = ELF::R_PPC64_ADDR16_HIGHESTA; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_PPC_GOT16; + 
break; + case MCSymbolRefExpr::VK_PPC_GOT_LO: + Type = ELF::R_PPC_GOT16_LO; + break; + case MCSymbolRefExpr::VK_PPC_GOT_HI: + Type = ELF::R_PPC_GOT16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_HA: + Type = ELF::R_PPC_GOT16_HA; + break; + case MCSymbolRefExpr::VK_PPC_TOC: + Type = ELF::R_PPC64_TOC16; + break; + case MCSymbolRefExpr::VK_PPC_TOC_LO: + Type = ELF::R_PPC64_TOC16_LO; + break; + case MCSymbolRefExpr::VK_PPC_TOC_HI: + Type = ELF::R_PPC64_TOC16_HI; + break; + case MCSymbolRefExpr::VK_PPC_TOC_HA: + Type = ELF::R_PPC64_TOC16_HA; + break; + case MCSymbolRefExpr::VK_PPC_TPREL: + Type = ELF::R_PPC_TPREL16; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_LO: + Type = ELF::R_PPC_TPREL16_LO; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HI: + Type = ELF::R_PPC_TPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HA: + Type = ELF::R_PPC_TPREL16_HA; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER: + Type = ELF::R_PPC64_TPREL16_HIGHER; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA: + Type = ELF::R_PPC64_TPREL16_HIGHERA; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST: + Type = ELF::R_PPC64_TPREL16_HIGHEST; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA: + Type = ELF::R_PPC64_TPREL16_HIGHESTA; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL: + Type = ELF::R_PPC64_DTPREL16; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_LO: + Type = ELF::R_PPC64_DTPREL16_LO; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HI: + Type = ELF::R_PPC64_DTPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HA: + Type = ELF::R_PPC64_DTPREL16_HA; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER: + Type = ELF::R_PPC64_DTPREL16_HIGHER; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA: + Type = ELF::R_PPC64_DTPREL16_HIGHERA; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST: + Type = ELF::R_PPC64_DTPREL16_HIGHEST; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHESTA: + Type = ELF::R_PPC64_DTPREL16_HIGHESTA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD: + if (is64Bit()) + Type = ELF::R_PPC64_GOT_TLSGD16; + else + Type = ELF::R_PPC_GOT_TLSGD16; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO: + Type = ELF::R_PPC64_GOT_TLSGD16_LO; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HI: + Type = ELF::R_PPC64_GOT_TLSGD16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA: + Type = ELF::R_PPC64_GOT_TLSGD16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD: + if (is64Bit()) + Type = ELF::R_PPC64_GOT_TLSLD16; + else + Type = ELF::R_PPC_GOT_TLSLD16; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO: + Type = ELF::R_PPC64_GOT_TLSLD16_LO; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HI: + Type = ELF::R_PPC64_GOT_TLSLD16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA: + Type = ELF::R_PPC64_GOT_TLSLD16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL: + /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS. */ + Type = ELF::R_PPC64_GOT_TPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO: + /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS. */ + Type = ELF::R_PPC64_GOT_TPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HI: + Type = ELF::R_PPC64_GOT_TPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL: + /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS. 
*/ + Type = ELF::R_PPC64_GOT_DTPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO: + /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS. */ + Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA: + Type = ELF::R_PPC64_GOT_TPREL16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HI: + Type = ELF::R_PPC64_GOT_DTPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HA: + Type = ELF::R_PPC64_GOT_DTPREL16_HA; + break; + } + break; + case PPC::fixup_ppc_half16ds: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC64_ADDR16_DS; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = ELF::R_PPC64_ADDR16_LO_DS; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_PPC64_GOT16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_LO: + Type = ELF::R_PPC64_GOT16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_TOC: + Type = ELF::R_PPC64_TOC16_DS; + break; + case MCSymbolRefExpr::VK_PPC_TOC_LO: + Type = ELF::R_PPC64_TOC16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_TPREL: + Type = ELF::R_PPC64_TPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_LO: + Type = ELF::R_PPC64_TPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL: + Type = ELF::R_PPC64_DTPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_LO: + Type = ELF::R_PPC64_DTPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL: + Type = ELF::R_PPC64_GOT_TPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO: + Type = ELF::R_PPC64_GOT_TPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL: + Type = ELF::R_PPC64_GOT_DTPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO: + Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS; + break; + } + break; + case PPC::fixup_ppc_nofixup: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_PPC_TLSGD: + if (is64Bit()) + Type = ELF::R_PPC64_TLSGD; + else + Type = ELF::R_PPC_TLSGD; + break; + case MCSymbolRefExpr::VK_PPC_TLSLD: + if (is64Bit()) + Type = ELF::R_PPC64_TLSLD; + else + Type = ELF::R_PPC_TLSLD; + break; + case MCSymbolRefExpr::VK_PPC_TLS: + if (is64Bit()) + Type = ELF::R_PPC64_TLS; + else + Type = ELF::R_PPC_TLS; + break; + } + break; + case FK_Data_8: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_PPC_TOCBASE: + Type = ELF::R_PPC64_TOC; + break; + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC64_ADDR64; + break; + case MCSymbolRefExpr::VK_PPC_DTPMOD: + Type = ELF::R_PPC64_DTPMOD64; + break; + case MCSymbolRefExpr::VK_PPC_TPREL: + Type = ELF::R_PPC64_TPREL64; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL: + Type = ELF::R_PPC64_DTPREL64; + break; + } + break; + case FK_Data_4: + Type = ELF::R_PPC_ADDR32; + break; + case FK_Data_2: + Type = ELF::R_PPC_ADDR16; + break; + } + } + return Type; +} + +bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, + unsigned Type) const { + switch (Type) { + default: + return false; + + case ELF::R_PPC_REL24: + // If the target symbol has a local entry point, we must keep the + // target symbol to preserve that information for the linker. + // The "other" values are stored in the last 6 bits of the second byte. + // The traditional defines for STO values assume the full byte and thus + // the shift to pack it. 
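+    // For example, a (hypothetical) st_other byte of 0x60 encodes a local
+    // entry point; getOther() returns it shifted down, so the << 2 below
+    // restores the full-byte value that STO_PPC64_LOCAL_MASK expects.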
+ unsigned Other = cast<MCSymbolELF>(Sym).getOther() << 2; + return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0; + } +} + +MCObjectWriter *llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + bool IsLittleEndian, + uint8_t OSABI) { + MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h new file mode 100644 index 0000000..ae43e59d --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -0,0 +1,56 @@ +//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H +#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +#undef PPC + +namespace llvm { +namespace PPC { +enum Fixups { + // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b' + // and 'bl'. + fixup_ppc_br24 = FirstTargetFixupKind, + + /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional + /// branches. + fixup_ppc_brcond14, + + /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches + /// like 'ba' and 'bla'. + fixup_ppc_br24abs, + + /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional + /// branches. + fixup_ppc_brcond14abs, + + /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo) + /// or ha16(_foo) for instrs like 'li' or 'addis'. + fixup_ppc_half16, + + /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with + /// implied 2 zero bits for instrs like 'std'. + fixup_ppc_half16ds, + + /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call + /// to __tls_get_addr for the TLS general and local dynamic models, + /// or inserts the thread-pointer register number. + fixup_ppc_nofixup, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp new file mode 100644 index 0000000..d8fab5b --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -0,0 +1,83 @@ +//===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the MCAsmInfoDarwin properties. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void PPCMCAsmInfoDarwin::anchor() { }
+
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = false;
+
+  CommentString = ";";
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  if (!is64Bit)
+    Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode.
+
+  AssemblerDialect = 1;            // New-Style mnemonics.
+  SupportsDebugInformation = true; // Debug information.
+
+  // The installed assembler for OSX < 10.6 lacks some directives.
+  // FIXME: this should really be a check on the assembler characteristics
+  // rather than OS version.
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+    HasWeakDefCanBeHiddenDirective = false;
+
+  UseIntegratedAssembler = true;
+}
+
+void PPCELFMCAsmInfo::anchor() { }
+
+PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
+  // FIXME: This is not always needed. For example, it is not needed in the
+  // v2 abi.
+  NeedsLocalForSize = true;
+
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = T.getArch() == Triple::ppc64le;
+
+  // .comm alignment is specified in bytes, while .align takes a power of 2.
+  AlignmentIsInBytes = false;
+
+  CommentString = "#";
+
+  // Uses '.section' before '.bss' directive.
+  UsesELFSectionDirectiveForBSS = true;
+
+  // Debug information.
+  SupportsDebugInformation = true;
+
+  DollarIsPC = true;
+
+  // Set up DWARF directives.
+  MinInstAlignment = 4;
+
+  // Exception handling.
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr;
+  AssemblerDialect = 1; // New-Style mnemonics.
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+
+  UseIntegratedAssembler = true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h new file mode 100644 index 0000000..e252ac9 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -0,0 +1,39 @@ +//===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PPC MCAsmInfo classes.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H +#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H + +#include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" + +namespace llvm { +class Triple; + +class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &); +}; + +class PPCELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &); +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp new file mode 100644 index 0000000..b729156 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -0,0 +1,360 @@ +//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPCMCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOpcodes.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); + +namespace { +class PPCMCCodeEmitter : public MCCodeEmitter { + PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete; + void operator=(const PPCMCCodeEmitter &) = delete; + + const MCInstrInfo &MCII; + const MCContext &CTX; + bool IsLittleEndian; + +public: + PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), CTX(ctx), + IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {} + + ~PPCMCCodeEmitter() override {} + + unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo, + 
SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override { + // For fast-isel, a float COPY_TO_REGCLASS can survive this long. + // It's just a nop to keep the register classes happy, so don't + // generate anything. + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + if (Opcode == TargetOpcode::COPY_TO_REGCLASS) + return; + + uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); + + // Output the constant in big/little endian byte order. + unsigned Size = Desc.getSize(); + switch (Size) { + case 4: + if (IsLittleEndian) { + support::endian::Writer<support::little>(OS).write<uint32_t>(Bits); + } else { + support::endian::Writer<support::big>(OS).write<uint32_t>(Bits); + } + break; + case 8: + // If we emit a pair of instructions, the first one is + // always in the top 32 bits, even on little-endian. + if (IsLittleEndian) { + uint64_t Swapped = (Bits << 32) | (Bits >> 32); + support::endian::Writer<support::little>(OS).write<uint64_t>(Swapped); + } else { + support::endian::Writer<support::big>(OS).write<uint64_t>(Bits); + } + break; + default: + llvm_unreachable ("Invalid instruction size"); + } + + ++MCNumEmitted; // Keep track of the # of mi's emitted. + } + +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new PPCMCCodeEmitter(MCII, Ctx); +} + +unsigned PPCMCCodeEmitter:: +getDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. + Fixups.push_back(MCFixup::create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_br24)); + return 0; +} + +unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. 
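+  // (Illustrative note on the pattern used throughout this file: each fixup
+  // records a byte offset into the instruction, the unresolved MCExpr, and a
+  // target-specific kind; the asm backend later patches the field in place,
+  // or the object writer lowers it to a relocation such as R_PPC_REL14.
+  // Offset 0 covers the whole word here; the half16 forms below instead use
+  // offset 2 on big-endian so the fixup points at the immediate halfword.)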
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_brcond14)); + return 0; +} + +unsigned PPCMCCodeEmitter:: +getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. + Fixups.push_back(MCFixup::create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_br24abs)); + return 0; +} + +unsigned PPCMCCodeEmitter:: +getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. + Fixups.push_back(MCFixup::create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_brcond14abs)); + return 0; +} + +unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the immediate field. + Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16)); + return 0; +} + +unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Encode (imm, reg) as a memri, which has the low 16-bits as the + // displacement and the next 5 bits as the register #. + assert(MI.getOperand(OpNo+1).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 16; + + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) + return (getMachineOpValue(MI, MO, Fixups, STI) & 0xFFFF) | RegBits; + + // Add a fixup for the displacement field. + Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16)); + return RegBits; +} + + +unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Encode (imm, reg) as a memrix, which has the low 14-bits as the + // displacement and the next 5 bits as the register #. + assert(MI.getOperand(OpNo+1).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 14; + + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) + return ((getMachineOpValue(MI, MO, Fixups, STI) >> 2) & 0x3FFF) | RegBits; + + // Add a fixup for the displacement field. + Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16ds)); + return RegBits; +} + + +unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) + const { + // Encode (imm, reg) as a spe8dis, which has the low 5-bits of (imm / 8) + // as the displacement and the next 5 bits as the register #. 
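+  // (Illustrative note, inferred from this file alone: the ten significant
+  // bits built below -- register number in bits 9..5, scaled displacement in
+  // bits 4..0 -- are passed through reverseBits(...) >> 22, i.e. mirrored,
+  // which suggests the TableGen-declared field for these SPE memory forms
+  // uses the opposite bit order; the .td side is not shown here.)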
+ assert(MI.getOperand(OpNo+1).isReg()); + uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5; + + const MCOperand &MO = MI.getOperand(OpNo); + assert(MO.isImm()); + uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 3; + return reverseBits(Imm | RegBits) >> 22; +} + + +unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) + const { + // Encode (imm, reg) as a spe4dis, which has the low 5-bits of (imm / 4) + // as the displacement and the next 5 bits as the register #. + assert(MI.getOperand(OpNo+1).isReg()); + uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5; + + const MCOperand &MO = MI.getOperand(OpNo); + assert(MO.isImm()); + uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 2; + return reverseBits(Imm | RegBits) >> 22; +} + + +unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) + const { + // Encode (imm, reg) as a spe2dis, which has the low 5-bits of (imm / 2) + // as the displacement and the next 5 bits as the register #. + assert(MI.getOperand(OpNo+1).isReg()); + uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5; + + const MCOperand &MO = MI.getOperand(OpNo); + assert(MO.isImm()); + uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 1; + return reverseBits(Imm | RegBits) >> 22; +} + + +unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the TLS register, which simply provides a relocation + // hint to the linker that this statement is part of a relocation sequence. + // Return the thread-pointer register's encoding. + Fixups.push_back(MCFixup::create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_nofixup)); + const Triple &TT = STI.getTargetTriple(); + bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le; + return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2); +} + +unsigned PPCMCCodeEmitter::getTLSCallEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // For special TLS calls, we need two fixups; one for the branch target + // (__tls_get_addr), which we create via getDirectBrEncoding as usual, + // and one for the TLSGD or TLSLD symbol, which is emitted here. 
+ const MCOperand &MO = MI.getOperand(OpNo+1); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_nofixup)); + return getDirectBrEncoding(MI, OpNo, Fixups, STI); +} + +unsigned PPCMCCodeEmitter:: +get_crbitm_encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 || + MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) && + (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7)); + return 0x80 >> CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); +} + + +unsigned PPCMCCodeEmitter:: +getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) { + // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand. + // The GPR operand should come through here though. + assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 && + MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) || + MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7); + return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + } + + assert(MO.isImm() && + "Relocation required in an instruction that we cannot encode!"); + return MO.getImm(); +} + + +#include "PPCGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp new file mode 100644 index 0000000..6b97d4c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -0,0 +1,150 @@ +//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCFixupKinds.h" +#include "PPCMCExpr.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectStreamer.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppcmcexpr" + +const PPCMCExpr* +PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin); +} + +void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { + if (isDarwinSyntax()) { + switch (Kind) { + default: llvm_unreachable("Invalid kind!"); + case VK_PPC_LO: OS << "lo16"; break; + case VK_PPC_HI: OS << "hi16"; break; + case VK_PPC_HA: OS << "ha16"; break; + } + + OS << '('; + getSubExpr()->print(OS, MAI); + OS << ')'; + } else { + getSubExpr()->print(OS, MAI); + + switch (Kind) { + default: llvm_unreachable("Invalid kind!"); + case VK_PPC_LO: OS << "@l"; break; + case VK_PPC_HI: OS << "@h"; break; + case VK_PPC_HA: OS << "@ha"; break; + case VK_PPC_HIGHER: OS << "@higher"; break; + case VK_PPC_HIGHERA: OS << "@highera"; break; + case VK_PPC_HIGHEST: OS << "@highest"; break; + case VK_PPC_HIGHESTA: OS << "@highesta"; break; + } + } +} + +bool +PPCMCExpr::evaluateAsConstant(int64_t &Res) const { + MCValue Value; + + if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr)) + return false; + + if (!Value.isAbsolute()) + return false; + + Res = evaluateAsInt64(Value.getConstant()); + return true; +} + +int64_t +PPCMCExpr::evaluateAsInt64(int64_t Value) const { + switch (Kind) { + case VK_PPC_LO: + return Value & 0xffff; + case VK_PPC_HI: + return (Value >> 16) & 0xffff; + case VK_PPC_HA: + return ((Value + 0x8000) >> 16) & 0xffff; + case VK_PPC_HIGHER: + return (Value >> 32) & 0xffff; + case VK_PPC_HIGHERA: + return ((Value + 0x8000) >> 32) & 0xffff; + case VK_PPC_HIGHEST: + return (Value >> 48) & 0xffff; + case VK_PPC_HIGHESTA: + return ((Value + 0x8000) >> 48) & 0xffff; + case VK_PPC_None: + break; + } + llvm_unreachable("Invalid kind!"); +} + +bool +PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { + MCValue Value; + + if (!getSubExpr()->evaluateAsRelocatable(Value, Layout, Fixup)) + return false; + + if (Value.isAbsolute()) { + int64_t Result = evaluateAsInt64(Value.getConstant()); + if ((Fixup == nullptr || (unsigned)Fixup->getKind() != PPC::fixup_ppc_half16) && + (Result >= 0x8000)) + return false; + Res = MCValue::get(Result); + } else { + if (!Layout) + return false; + + MCContext &Context = Layout->getAssembler().getContext(); + const MCSymbolRefExpr *Sym = Value.getSymA(); + MCSymbolRefExpr::VariantKind Modifier = Sym->getKind(); + if (Modifier != MCSymbolRefExpr::VK_None) + return false; + switch (Kind) { + default: + llvm_unreachable("Invalid kind!"); + case VK_PPC_LO: + Modifier = MCSymbolRefExpr::VK_PPC_LO; + break; + case VK_PPC_HI: + Modifier = MCSymbolRefExpr::VK_PPC_HI; + break; + case VK_PPC_HA: + Modifier = MCSymbolRefExpr::VK_PPC_HA; + break; + case VK_PPC_HIGHERA: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA; + break; + case VK_PPC_HIGHER: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHER; + break; + case VK_PPC_HIGHEST: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHEST; + break; + case VK_PPC_HIGHESTA: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHESTA; + break; + } + Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context); + Res = MCValue::get(Sym, Value.getSymB(), 
Value.getConstant()); + } + + return true; +} + +void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h new file mode 100644 index 0000000..d42a111 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -0,0 +1,100 @@ +//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H +#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H + +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" + +namespace llvm { + +class PPCMCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_PPC_None, + VK_PPC_LO, + VK_PPC_HI, + VK_PPC_HA, + VK_PPC_HIGHER, + VK_PPC_HIGHERA, + VK_PPC_HIGHEST, + VK_PPC_HIGHESTA + }; + +private: + const VariantKind Kind; + const MCExpr *Expr; + bool IsDarwin; + + int64_t evaluateAsInt64(int64_t Value) const; + + explicit PPCMCExpr(VariantKind Kind, const MCExpr *Expr, bool IsDarwin) + : Kind(Kind), Expr(Expr), IsDarwin(IsDarwin) {} + +public: + /// @name Construction + /// @{ + + static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr, + bool isDarwin, MCContext &Ctx); + + static const PPCMCExpr *createLo(const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return create(VK_PPC_LO, Expr, isDarwin, Ctx); + } + + static const PPCMCExpr *createHi(const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return create(VK_PPC_HI, Expr, isDarwin, Ctx); + } + + static const PPCMCExpr *createHa(const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return create(VK_PPC_HA, Expr, isDarwin, Ctx); + } + + /// @} + /// @name Accessors + /// @{ + + /// getKind - Get the kind of this expression. + VariantKind getKind() const { return Kind; } + + /// getSubExpr - Get the child of this expression. + const MCExpr *getSubExpr() const { return Expr; } + + /// isDarwinSyntax - True if expression is to be printed using Darwin syntax. + bool isDarwinSyntax() const { return IsDarwin; } + + + /// @} + + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); + } + + // There are no TLS PPCMCExprs at the moment.
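+  // (Illustrative note: none of the VariantKinds above are TLS modifiers --
+  // @tprel@ha and friends travel as MCSymbolRefExpr variants instead --
+  // hence the empty hook below.)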
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} + + bool evaluateAsConstant(int64_t &Res) const; + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp new file mode 100644 index 0000000..30f232a --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -0,0 +1,275 @@ +//===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides PowerPC specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "PPCMCTargetDesc.h" +#include "InstPrinter/PPCInstPrinter.h" +#include "PPCMCAsmInfo.h" +#include "PPCTargetStreamer.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "PPCGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "PPCGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "PPCGenRegisterInfo.inc" + +// Pin the vtable to this file. +PPCTargetStreamer::~PPCTargetStreamer() {} +PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + +static MCInstrInfo *createPPCMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitPPCMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) { + bool isPPC64 = + (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); + unsigned Flavour = isPPC64 ? 0 : 1; + unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR; + + MCRegisterInfo *X = new MCRegisterInfo(); + InitPPCMCRegisterInfo(X, RA, Flavour, Flavour); + return X; +} + +static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { + return createPPCMCSubtargetInfoImpl(TT, CPU, FS); +} + +static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, + const Triple &TheTriple) { + bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 || + TheTriple.getArch() == Triple::ppc64le); + + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin()) + MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple); + else + MAI = new PPCELFMCAsmInfo(isPPC64, TheTriple); + + // Initial state of the frame pointer is R1. + unsigned Reg = isPPC64 ? 
PPC::X1 : PPC::R1; + MCCFIInstruction Inst = + MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0); + MAI->addInitialFrameState(Inst); + + return MAI; +} + +static MCCodeGenInfo *createPPCMCCodeGenInfo(const Triple &TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + + if (RM == Reloc::Default) { + if (TT.isOSDarwin()) + RM = Reloc::DynamicNoPIC; + else + RM = Reloc::Static; + } + if (CM == CodeModel::Default) { + if (!TT.isOSDarwin() && + (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) + CM = CodeModel::Medium; + } + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +namespace { +class PPCTargetAsmStreamer : public PPCTargetStreamer { + formatted_raw_ostream &OS; + +public: + PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) + : PPCTargetStreamer(S), OS(OS) {} + void emitTCEntry(const MCSymbol &S) override { + OS << "\t.tc "; + OS << S.getName(); + OS << "[TC],"; + OS << S.getName(); + OS << '\n'; + } + void emitMachine(StringRef CPU) override { + OS << "\t.machine " << CPU << '\n'; + } + void emitAbiVersion(int AbiVersion) override { + OS << "\t.abiversion " << AbiVersion << '\n'; + } + void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { + const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); + + OS << "\t.localentry\t"; + S->print(OS, MAI); + OS << ", "; + LocalOffset->print(OS, MAI); + OS << '\n'; + } +}; + +class PPCTargetELFStreamer : public PPCTargetStreamer { +public: + PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} + MCELFStreamer &getStreamer() { + return static_cast<MCELFStreamer &>(Streamer); + } + void emitTCEntry(const MCSymbol &S) override { + // Creates a R_PPC64_TOC relocation + Streamer.EmitValueToAlignment(8); + Streamer.EmitSymbolValue(&S, 8); + } + void emitMachine(StringRef CPU) override { + // FIXME: Is there anything to do in here or does this directive only + // limit the parser? + } + void emitAbiVersion(int AbiVersion) override { + MCAssembler &MCA = getStreamer().getAssembler(); + unsigned Flags = MCA.getELFHeaderEFlags(); + Flags &= ~ELF::EF_PPC64_ABI; + Flags |= (AbiVersion & ELF::EF_PPC64_ABI); + MCA.setELFHeaderEFlags(Flags); + } + void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { + MCAssembler &MCA = getStreamer().getAssembler(); + + int64_t Res; + if (!LocalOffset->evaluateAsAbsolute(Res, MCA)) + report_fatal_error(".localentry expression must be absolute."); + + unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res); + if (Res != ELF::decodePPC64LocalEntryOffset(Encoded)) + report_fatal_error(".localentry expression cannot be encoded."); + + unsigned Other = S->getOther(); + Other &= ~ELF::STO_PPC64_LOCAL_MASK; + Other |= Encoded; + S->setOther(Other); + + // For GAS compatibility, unless we already saw a .abiversion directive, + // set e_flags to indicate ELFv2 ABI. + unsigned Flags = MCA.getELFHeaderEFlags(); + if ((Flags & ELF::EF_PPC64_ABI) == 0) + MCA.setELFHeaderEFlags(Flags | 2); + } + void emitAssignment(MCSymbol *S, const MCExpr *Value) override { + auto *Symbol = cast<MCSymbolELF>(S); + // When encoding an assignment to set symbol A to symbol B, also copy + // the st_other bits encoding the local entry point offset. 
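+  // (Illustrative note: STO_PPC64_LOCAL_MASK is 0xe0, so the ELFv2 local
+  // entry offset is kept as a 3-bit power-of-two code in the top bits of
+  // st_other; emitLocalEntry above round-trips the value through
+  // encode/decodePPC64LocalEntryOffset to verify it is representable.)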
+ if (Value->getKind() != MCExpr::SymbolRef) + return; + const auto &RhsSym = cast<MCSymbolELF>( + static_cast<const MCSymbolRefExpr *>(Value)->getSymbol()); + unsigned Other = Symbol->getOther(); + Other &= ~ELF::STO_PPC64_LOCAL_MASK; + Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK; + Symbol->setOther(Other); + } +}; + +class PPCTargetMachOStreamer : public PPCTargetStreamer { +public: + PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} + void emitTCEntry(const MCSymbol &S) override { + llvm_unreachable("Unknown pseudo-op: .tc"); + } + void emitMachine(StringRef CPU) override { + // FIXME: We should update the CPUType, CPUSubType in the Object file if + // the new values are different from the defaults. + } + void emitAbiVersion(int AbiVersion) override { + llvm_unreachable("Unknown pseudo-op: .abiversion"); + } + void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { + llvm_unreachable("Unknown pseudo-op: .localentry"); + } +}; +} + +static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new PPCTargetAsmStreamer(S, OS); +} + +static MCTargetStreamer * +createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + const Triple &TT = STI.getTargetTriple(); + if (TT.isOSBinFormatELF()) + return new PPCTargetELFStreamer(S); + return new PPCTargetMachOStreamer(S); +} + +static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + return new PPCInstPrinter(MAI, MII, MRI, T.isOSDarwin()); +} + +extern "C" void LLVMInitializePowerPCTargetMC() { + for (Target *T : {&ThePPC32Target, &ThePPC64Target, &ThePPC64LETarget}) { + // Register the MC asm info. + RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(*T, createPPCMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createPPCMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createPPCMCSubtargetInfo); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(*T, createPPCMCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(*T, createPPCAsmBackend); + + // Register the object target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createObjectTargetStreamer); + + // Register the asm target streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createPPCMCInstPrinter); + } +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h new file mode 100644 index 0000000..77fe458 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -0,0 +1,104 @@ +//===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides PowerPC specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H +#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/MathExtras.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; +class Triple; +class StringRef; +class raw_pwrite_stream; +class raw_ostream; + +extern Target ThePPC32Target; +extern Target ThePPC64Target; +extern Target ThePPC64LETarget; + +MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); + +/// Construct an PPC ELF object writer. +MCObjectWriter *createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + bool IsLittleEndian, uint8_t OSABI); +/// Construct a PPC Mach-O object writer. +MCObjectWriter *createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); + +/// Returns true iff Val consists of one contiguous run of 1s with any number of +/// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so +/// 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is not, +/// since all 1s are not contiguous. +static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { + if (!Val) + return false; + + if (isShiftedMask_32(Val)) { + // look for the first non-zero bit + MB = countLeadingZeros(Val); + // look for the first zero bit after the run of ones + ME = countLeadingZeros((Val - 1) ^ Val); + return true; + } else { + Val = ~Val; // invert mask + if (isShiftedMask_32(Val)) { + // effectively look for the first zero bit + ME = countLeadingZeros(Val) - 1; + // effectively look for the first one bit after the run of zeros + MB = countLeadingZeros((Val - 1) ^ Val) + 1; + return true; + } + } + // no run present + return false; +} + +} // End llvm namespace + +// Generated files will use "namespace PPC". To avoid symbol clash, +// undefine PPC here. PPC may be predefined on some hosts. +#undef PPC + +// Defines symbolic names for PowerPC registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "PPCGenRegisterInfo.inc" + +// Defines symbolic names for the PowerPC instructions. +// +#define GET_INSTRINFO_ENUM +#include "PPCGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "PPCGenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp new file mode 100644 index 0000000..b54a0e1 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -0,0 +1,383 @@ +//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MachO.h" + +using namespace llvm; + +namespace { +class PPCMachObjectWriter : public MCMachObjectTargetWriter { + bool recordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + unsigned Log2Size, uint64_t &FixedValue); + + void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue); + +public: + PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {} + + void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override { + if (Writer->is64Bit()) { + report_fatal_error("Relocation emission for MachO/PPC64 unimplemented."); + } else + RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } +}; +} + +/// computes the log2 of the size of the relocation, +/// used for relocation_info::r_length. +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + report_fatal_error("log2size(FixupKind): Unhandled fixup kind!"); + case FK_PCRel_1: + case FK_Data_1: + return 0; + case FK_PCRel_2: + case FK_Data_2: + return 1; + case FK_PCRel_4: + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_half16: + case PPC::fixup_ppc_br24: + case FK_Data_4: + return 2; + case FK_PCRel_8: + case FK_Data_8: + return 3; + } + return 0; +} + +/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum. +/// Outline based on PPCELFObjectWriter::GetRelocType(). +static unsigned getRelocType(const MCValue &Target, + const MCFixupKind FixupKind, // from + // Fixup.getKind() + const bool IsPCRel) { + const MCSymbolRefExpr::VariantKind Modifier = + Target.isAbsolute() ? 
MCSymbolRefExpr::VK_None + : Target.getSymA()->getKind(); + // determine the type of the relocation + unsigned Type = MachO::GENERIC_RELOC_VANILLA; + if (IsPCRel) { // relative to PC + switch ((unsigned)FixupKind) { + default: + report_fatal_error("Unimplemented fixup kind (relative)"); + case PPC::fixup_ppc_br24: + Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24 + break; + case PPC::fixup_ppc_brcond14: + Type = MachO::PPC_RELOC_BR14; + break; + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: + llvm_unreachable("Unsupported modifier for half16 fixup"); + case MCSymbolRefExpr::VK_PPC_HA: + Type = MachO::PPC_RELOC_HA16; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = MachO::PPC_RELOC_LO16; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = MachO::PPC_RELOC_HI16; + break; + } + break; + } + } else { + switch ((unsigned)FixupKind) { + default: + report_fatal_error("Unimplemented fixup kind (absolute)!"); + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: + llvm_unreachable("Unsupported modifier for half16 fixup"); + case MCSymbolRefExpr::VK_PPC_HA: + Type = MachO::PPC_RELOC_HA16_SECTDIFF; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = MachO::PPC_RELOC_LO16_SECTDIFF; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = MachO::PPC_RELOC_HI16_SECTDIFF; + break; + } + break; + case FK_Data_4: + break; + case FK_Data_2: + break; + } + } + return Type; +} + +static void makeRelocationInfo(MachO::any_relocation_info &MRE, + const uint32_t FixupOffset, const uint32_t Index, + const unsigned IsPCRel, const unsigned Log2Size, + const unsigned IsExtern, const unsigned Type) { + MRE.r_word0 = FixupOffset; + // The bitfield offsets that work (as determined by trial-and-error) + // are different than what is documented in the mach-o manuals. + // This appears to be an endianness issue; reversing the order of the + // documented bitfields in <llvm/Support/MachO.h> fixes this (but + // breaks x86/ARM assembly). + MRE.r_word1 = ((Index << 8) | // was << 0 + (IsPCRel << 7) | // was << 24 + (Log2Size << 5) | // was << 25 + (IsExtern << 4) | // was << 27 + (Type << 0)); // was << 28 +} + +static void +makeScatteredRelocationInfo(MachO::any_relocation_info &MRE, + const uint32_t Addr, const unsigned Type, + const unsigned Log2Size, const unsigned IsPCRel, + const uint32_t Value2) { + // For notes on bitfield positions and endianness, see: + // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry + MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) | + (IsPCRel << 30) | MachO::R_SCATTERED); + MRE.r_word1 = Value2; +} + +/// Compute fixup offset (address). +static uint32_t getFixupOffset(const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup) { + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); + // On Mach-O, ppc_fixup_half16 relocations must refer to the + // start of the instruction, not the second halfword, as ELF does + if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16) + FixupOffset &= ~uint32_t(3); + return FixupOffset; +} + +/// \return false if falling back to using non-scattered relocation, +/// otherwise true for normal scattered relocation. 
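+/// (Illustrative note: a scattered entry identifies its target by address --
+/// r_value, passed as Value/Value2 below -- rather than by symbol index,
+/// which is what makes the A - B SECTDIFF forms expressible at all; the cost
+/// is that r_address shrinks to the 24-bit field checked below.)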
+/// based on X86MachObjectWriter::recordScatteredRelocation +/// and ARMMachObjectWriter::recordScatteredRelocation +bool PPCMachObjectWriter::recordScatteredRelocation( + MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + unsigned Log2Size, uint64_t &FixedValue) { + // caller already computes these, can we just pass and reuse? + const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup); + const MCFixupKind FK = Fixup.getKind(); + const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK); + const unsigned Type = getRelocType(Target, FK, IsPCRel); + + // Is this a local or SECTDIFF relocation entry? + // SECTDIFF relocation entries have symbol subtractions, + // and require two entries, the first for the add-symbol value, + // the second for the subtract-symbol value. + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + + if (!A->getFragment()) + report_fatal_error("symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + + uint32_t Value = Writer->getSymbolAddress(*A, Layout); + uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); + FixedValue += SecAddr; + uint32_t Value2 = 0; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + const MCSymbol *SB = &B->getSymbol(); + + if (!SB->getFragment()) + report_fatal_error("symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + + // FIXME: is Type correct? see include/llvm/Support/MachO.h + Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); + } + // FIXME: does FixedValue get used?? + + // Relocations are written out in reverse order, so the PAIR comes first. + if (Type == MachO::PPC_RELOC_SECTDIFF || + Type == MachO::PPC_RELOC_HI16_SECTDIFF || + Type == MachO::PPC_RELOC_LO16_SECTDIFF || + Type == MachO::PPC_RELOC_HA16_SECTDIFF || + Type == MachO::PPC_RELOC_LO14_SECTDIFF || + Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) { + // X86 had this piece, but ARM does not + // If the offset is too large to fit in a scattered relocation, + // we're hosed. It's an unfortunate limitation of the MachO format. + if (FixupOffset > 0xffffff) { + char Buffer[32]; + format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); + Asm.getContext().reportError(Fixup.getLoc(), + Twine("Section too large, can't encode " + "r_address (") + + Buffer + ") into 24 bits of scattered " + "relocation entry."); + return false; + } + + // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()? + // see PPCMCExpr::evaluateAsRelocatableImpl() + uint32_t other_half = 0; + switch (Type) { + case MachO::PPC_RELOC_LO16_SECTDIFF: + other_half = (FixedValue >> 16) & 0xffff; + // applyFixupOffset no longer extracts the high part because it now assumes + // this was already done. + // It looks like this is not true for the FixedValue needed with Mach-O + // relocs. + // So we need to adjust FixedValue again here. + FixedValue &= 0xffff; + break; + case MachO::PPC_RELOC_HA16_SECTDIFF: + other_half = FixedValue & 0xffff; + FixedValue = + ((FixedValue >> 16) + ((FixedValue & 0x8000) ?
1 : 0)) & 0xffff; + break; + case MachO::PPC_RELOC_HI16_SECTDIFF: + other_half = FixedValue & 0xffff; + FixedValue = (FixedValue >> 16) & 0xffff; + break; + default: + llvm_unreachable("Invalid PPC scattered relocation type."); + break; + } + + MachO::any_relocation_info MRE; + makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR, + Log2Size, IsPCRel, Value2); + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + } else { + // If the offset is more than 24-bits, it won't fit in a scattered + // relocation offset field, so we fall back to using a non-scattered + // relocation. This is a bit risky, as if the offset reaches out of + // the block and the linker is doing scattered loading on this + // symbol, things can go badly. + // + // Required for 'as' compatibility. + if (FixupOffset > 0xffffff) + return false; + } + MachO::any_relocation_info MRE; + makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value); + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + return true; +} + +// see PPCELFObjectWriter for a general outline of cases +void PPCMachObjectWriter::RecordPPCRelocation( + MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + const MCFixupKind FK = Fixup.getKind(); // unsigned + const unsigned Log2Size = getFixupKindLog2Size(FK); + const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK); + const unsigned RelocType = getRelocType(Target, FK, IsPCRel); + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. + if (Target.getSymB() && + // Q: are branch targets ever scattered? + RelocType != MachO::PPC_RELOC_BR24 && + RelocType != MachO::PPC_RELOC_BR14) { + recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + Log2Size, FixedValue); + return; + } + + // this doesn't seem right for RIT_PPC_BR24 + // Get the symbol data, if any. + const MCSymbol *A = nullptr; + if (Target.getSymA()) + A = &Target.getSymA()->getSymbol(); + + // See <reloc.h>. + const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup); + unsigned Index = 0; + unsigned Type = RelocType; + + const MCSymbol *RelSymbol = nullptr; + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + // + // FIXME: Currently, these are never generated (see code below). I cannot + // find a case where they are actually emitted. + report_fatal_error("FIXME: relocations to absolute targets " + "not yet implemented"); + // the above line stolen from ARM, not sure + } else { + // Resolve constant variables. + if (A->isVariable()) { + int64_t Res; + if (A->getVariableValue()->evaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (Writer->doesSymbolRequireExternRelocation(*A)) { + RelSymbol = A; + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!A->isUndefined()) + FixedValue -= Layout.getSymbolOffset(*A); + } else { + // The index is the section ordinal (1-based). 
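+      // (Illustrative note: with r_extern left clear, Mach-O reuses the
+      // r_symbolnum field for this 1-based section ordinal, and the addend
+      // is then biased by the section's address in the object, which is what
+      // the surrounding getSectionAddress arithmetic maintains.)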
+ const MCSection &Sec = A->getSection(); + Index = Sec.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&Sec); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + } + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, false, Type); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter( + new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/false); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp new file mode 100644 index 0000000..c2987b6 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp @@ -0,0 +1,86 @@ +//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PowerPC branch predicates. +// +//===----------------------------------------------------------------------===// + +#include "PPCPredicates.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +using namespace llvm; + +PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) { + switch (Opcode) { + case PPC::PRED_EQ: return PPC::PRED_NE; + case PPC::PRED_NE: return PPC::PRED_EQ; + case PPC::PRED_LT: return PPC::PRED_GE; + case PPC::PRED_GE: return PPC::PRED_LT; + case PPC::PRED_GT: return PPC::PRED_LE; + case PPC::PRED_LE: return PPC::PRED_GT; + case PPC::PRED_NU: return PPC::PRED_UN; + case PPC::PRED_UN: return PPC::PRED_NU; + case PPC::PRED_EQ_MINUS: return PPC::PRED_NE_PLUS; + case PPC::PRED_NE_MINUS: return PPC::PRED_EQ_PLUS; + case PPC::PRED_LT_MINUS: return PPC::PRED_GE_PLUS; + case PPC::PRED_GE_MINUS: return PPC::PRED_LT_PLUS; + case PPC::PRED_GT_MINUS: return PPC::PRED_LE_PLUS; + case PPC::PRED_LE_MINUS: return PPC::PRED_GT_PLUS; + case PPC::PRED_NU_MINUS: return PPC::PRED_UN_PLUS; + case PPC::PRED_UN_MINUS: return PPC::PRED_NU_PLUS; + case PPC::PRED_EQ_PLUS: return PPC::PRED_NE_MINUS; + case PPC::PRED_NE_PLUS: return PPC::PRED_EQ_MINUS; + case PPC::PRED_LT_PLUS: return PPC::PRED_GE_MINUS; + case PPC::PRED_GE_PLUS: return PPC::PRED_LT_MINUS; + case PPC::PRED_GT_PLUS: return PPC::PRED_LE_MINUS; + case PPC::PRED_LE_PLUS: return PPC::PRED_GT_MINUS; + case PPC::PRED_NU_PLUS: return PPC::PRED_UN_MINUS; + case PPC::PRED_UN_PLUS: return PPC::PRED_NU_MINUS; + + // Simple predicates for single condition-register bits. 
+ case PPC::PRED_BIT_SET: return PPC::PRED_BIT_UNSET; + case PPC::PRED_BIT_UNSET: return PPC::PRED_BIT_SET; + } + llvm_unreachable("Unknown PPC branch opcode!"); +} + +PPC::Predicate PPC::getSwappedPredicate(PPC::Predicate Opcode) { + switch (Opcode) { + case PPC::PRED_EQ: return PPC::PRED_EQ; + case PPC::PRED_NE: return PPC::PRED_NE; + case PPC::PRED_LT: return PPC::PRED_GT; + case PPC::PRED_GE: return PPC::PRED_LE; + case PPC::PRED_GT: return PPC::PRED_LT; + case PPC::PRED_LE: return PPC::PRED_GE; + case PPC::PRED_NU: return PPC::PRED_NU; + case PPC::PRED_UN: return PPC::PRED_UN; + case PPC::PRED_EQ_MINUS: return PPC::PRED_EQ_MINUS; + case PPC::PRED_NE_MINUS: return PPC::PRED_NE_MINUS; + case PPC::PRED_LT_MINUS: return PPC::PRED_GT_MINUS; + case PPC::PRED_GE_MINUS: return PPC::PRED_LE_MINUS; + case PPC::PRED_GT_MINUS: return PPC::PRED_LT_MINUS; + case PPC::PRED_LE_MINUS: return PPC::PRED_GE_MINUS; + case PPC::PRED_NU_MINUS: return PPC::PRED_NU_MINUS; + case PPC::PRED_UN_MINUS: return PPC::PRED_UN_MINUS; + case PPC::PRED_EQ_PLUS: return PPC::PRED_EQ_PLUS; + case PPC::PRED_NE_PLUS: return PPC::PRED_NE_PLUS; + case PPC::PRED_LT_PLUS: return PPC::PRED_GT_PLUS; + case PPC::PRED_GE_PLUS: return PPC::PRED_LE_PLUS; + case PPC::PRED_GT_PLUS: return PPC::PRED_LT_PLUS; + case PPC::PRED_LE_PLUS: return PPC::PRED_GE_PLUS; + case PPC::PRED_NU_PLUS: return PPC::PRED_NU_PLUS; + case PPC::PRED_UN_PLUS: return PPC::PRED_UN_PLUS; + + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Unknown PPC branch opcode!"); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h new file mode 100644 index 0000000..acea600 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -0,0 +1,76 @@ +//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PowerPC branch predicates. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H +#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +// Generated files will use "namespace PPC". To avoid symbol clash, +// undefine PPC here. PPC may be predefined on some hosts. +#undef PPC + +namespace llvm { +namespace PPC { + /// Predicate - These are "(BI << 5) | BO" for various predicates. 
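+  /// (Illustrative decoding of the constants below: BI picks the CR bit to
+  ///  test (4*CRn + {LT,GT,EQ,UN}) and BO gives the branch semantics --
+  ///  12 = branch if the bit is set, 4 = branch if clear; the _MINUS/_PLUS
+  ///  variants add the static hint bits (+2 predicted not taken, +3 taken).
+  ///  E.g. PRED_EQ == (2 << 5) | 12: test CR0[EQ], branch when set.)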
+ enum Predicate { + PRED_LT = (0 << 5) | 12, + PRED_LE = (1 << 5) | 4, + PRED_EQ = (2 << 5) | 12, + PRED_GE = (0 << 5) | 4, + PRED_GT = (1 << 5) | 12, + PRED_NE = (2 << 5) | 4, + PRED_UN = (3 << 5) | 12, + PRED_NU = (3 << 5) | 4, + PRED_LT_MINUS = (0 << 5) | 14, + PRED_LE_MINUS = (1 << 5) | 6, + PRED_EQ_MINUS = (2 << 5) | 14, + PRED_GE_MINUS = (0 << 5) | 6, + PRED_GT_MINUS = (1 << 5) | 14, + PRED_NE_MINUS = (2 << 5) | 6, + PRED_UN_MINUS = (3 << 5) | 14, + PRED_NU_MINUS = (3 << 5) | 6, + PRED_LT_PLUS = (0 << 5) | 15, + PRED_LE_PLUS = (1 << 5) | 7, + PRED_EQ_PLUS = (2 << 5) | 15, + PRED_GE_PLUS = (0 << 5) | 7, + PRED_GT_PLUS = (1 << 5) | 15, + PRED_NE_PLUS = (2 << 5) | 7, + PRED_UN_PLUS = (3 << 5) | 15, + PRED_NU_PLUS = (3 << 5) | 7, + + // When dealing with individual condition-register bits, we have simple set + // and unset predicates. + PRED_BIT_SET = 1024, + PRED_BIT_UNSET = 1025 + }; + + // Bit for branch taken (plus) or not-taken (minus) hint + enum BranchHintBit { + BR_NO_HINT = 0x0, + BR_NONTAKEN_HINT = 0x2, + BR_TAKEN_HINT = 0x3, + BR_HINT_MASK = 0X3 + }; + + /// Invert the specified predicate. != -> ==, < -> >=. + Predicate InvertPredicate(Predicate Opcode); + + /// Assume the condition register is set by MI(a,b), return the predicate if + /// we modify the instructions such that condition register is set by MI(b,a). + Predicate getSwappedPredicate(Predicate Opcode); +} +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h new file mode 100644 index 0000000..a259ed3 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -0,0 +1,106 @@ +//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// PowerPC back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPC_H +#define LLVM_LIB_TARGET_POWERPC_PPC_H + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include <string> + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +namespace llvm { + class PPCTargetMachine; + class PassRegistry; + class FunctionPass; + class ImmutablePass; + class MachineInstr; + class AsmPrinter; + class MCInst; + + FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM); +#ifndef NDEBUG + FunctionPass *createPPCCTRLoopsVerify(); +#endif + FunctionPass *createPPCLoopDataPrefetchPass(); + FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM); + FunctionPass *createPPCTOCRegDepsPass(); + FunctionPass *createPPCEarlyReturnPass(); + FunctionPass *createPPCVSXCopyPass(); + FunctionPass *createPPCVSXFMAMutatePass(); + FunctionPass *createPPCVSXSwapRemovalPass(); + FunctionPass *createPPCMIPeepholePass(); + FunctionPass *createPPCBranchSelectionPass(); + FunctionPass *createPPCISelDag(PPCTargetMachine &TM); + FunctionPass *createPPCTLSDynamicCallPass(); + FunctionPass *createPPCBoolRetToIntPass(); + void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + AsmPrinter &AP, bool isDarwin); + + void initializePPCVSXFMAMutatePass(PassRegistry&); + void initializePPCBoolRetToIntPass(PassRegistry&); + extern char &PPCVSXFMAMutateID; + + namespace PPCII { + + /// Target Operand Flag enum. 
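+  /// (Illustrative note: the low four bits are independent flags, while
+  ///  MO_ACCESS_MASK (0xf0) carves bits 4-7 out as a single enumerated
+  ///  access kind, so an access kind is matched by comparing, e.g.
+  ///  (Flags & MO_ACCESS_MASK) == MO_TOC_LO, rather than by bit testing.)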
+ enum TOF { + //===------------------------------------------------------------------===// + // PPC Specific MachineOperand flags. + MO_NO_FLAG, + + /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$stub" or "FOO@plt" symbol. This is + /// used for calls and jumps to external functions on Tiger and earlier, and + /// for PIC calls on Linux and ELF systems. + MO_PLT_OR_STUB = 1, + + /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to + /// the function's picbase, e.g. lo16(symbol-picbase). + MO_PIC_FLAG = 2, + + /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to + /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase). + MO_NLP_FLAG = 4, + + /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a + /// symbol with hidden visibility. This causes a different kind of + /// non-lazy-pointer to be generated. + MO_NLP_HIDDEN_FLAG = 8, + + /// The next are not flags but distinct values. + MO_ACCESS_MASK = 0xf0, + + /// MO_LO, MO_HA - lo16(symbol) and ha16(symbol) + MO_LO = 1 << 4, + MO_HA = 2 << 4, + + MO_TPREL_LO = 4 << 4, + MO_TPREL_HA = 3 << 4, + + /// These values identify relocations on immediates folded + /// into memory operations. + MO_DTPREL_LO = 5 << 4, + MO_TLSLD_LO = 6 << 4, + MO_TOC_LO = 7 << 4, + + // Symbol for VK_PPC_TLS fixup attached to an ADD instruction + MO_TLS = 8 << 4 + }; + } // end namespace PPCII + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td new file mode 100644 index 0000000..b03be12 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPC.td @@ -0,0 +1,424 @@ +//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the top level entry point for the PowerPC target. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing. +// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// PowerPC Subtarget features. 
+// + +//===----------------------------------------------------------------------===// +// CPU Directives // +//===----------------------------------------------------------------------===// + +def Directive440 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_440", "">; +def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">; +def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">; +def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">; +def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">; +def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; +def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; +def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; +def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">; +def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective", + "PPC::DIR_E500mc", "">; +def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective", + "PPC::DIR_E5500", "">; +def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">; +def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">; +def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">; +def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">; +def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; +def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">; +def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; +def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; + +def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", + "Enable 64-bit instructions">; +def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software emulation for floating point">; +def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", + "Enable 64-bit registers usage for ppc32 [beta]">; +def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true", + "Use condition-register bits individually">; +def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true", + "Enable Altivec instructions">; +def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true", + "Enable SPE instructions">; +def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true", + "Enable the MFOCRF instruction">; +def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true", + "Enable the fsqrt instruction">; +def FeatureFCPSGN : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true", + "Enable the fcpsgn instruction">; +def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true", + "Enable the fre instruction">; +def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true", + "Enable the fres instruction">; +def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true", + "Enable the frsqrte instruction">; +def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true", + "Enable the frsqrtes instruction">; +def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true", + "Assume higher precision reciprocal estimates">; +def FeatureSTFIWX : 
SubtargetFeature<"stfiwx","HasSTFIWX", "true", + "Enable the stfiwx instruction">; +def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true", + "Enable the lfiwax instruction">; +def FeatureFPRND : SubtargetFeature<"fprnd", "HasFPRND", "true", + "Enable the fri[mnpz] instructions">; +def FeatureFPCVT : SubtargetFeature<"fpcvt", "HasFPCVT", "true", + "Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions">; +def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true", + "Enable the isel instruction">; +def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "true", + "Enable the popcnt[dw] instructions">; +def FeatureBPERMD : SubtargetFeature<"bpermd", "HasBPERMD", "true", + "Enable the bpermd instruction">; +def FeatureExtDiv : SubtargetFeature<"extdiv", "HasExtDiv", "true", + "Enable extended divide instructions">; +def FeatureLDBRX : SubtargetFeature<"ldbrx","HasLDBRX", "true", + "Enable the ldbrx instruction">; +def FeatureCMPB : SubtargetFeature<"cmpb", "HasCMPB", "true", + "Enable the cmpb instruction">; +def FeatureICBT : SubtargetFeature<"icbt","HasICBT", "true", + "Enable icbt instruction">; +def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true", + "Enable Book E instructions", + [FeatureICBT]>; +def FeatureMSYNC : SubtargetFeature<"msync", "HasOnlyMSYNC", "true", + "Has only the msync instruction instead of sync", + [FeatureBookE]>; +def FeatureE500 : SubtargetFeature<"e500", "IsE500", "true", + "Enable E500/E500mc instructions">; +def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true", + "Enable PPC 4xx instructions">; +def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true", + "Enable PPC 6xx instructions">; +def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", + "Enable QPX instructions">; +def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", + "Enable VSX instructions", + [FeatureAltivec]>; +def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true", + "Enable POWER8 Altivec instructions", + [FeatureAltivec]>; +def FeatureP8Crypto : SubtargetFeature<"crypto", "HasP8Crypto", "true", + "Enable POWER8 Crypto instructions", + [FeatureP8Altivec]>; +def FeatureP8Vector : SubtargetFeature<"power8-vector", "HasP8Vector", "true", + "Enable POWER8 vector instructions", + [FeatureVSX, FeatureP8Altivec]>; +def FeatureDirectMove : + SubtargetFeature<"direct-move", "HasDirectMove", "true", + "Enable Power8 direct move instructions", + [FeatureVSX]>; +def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics", + "HasPartwordAtomics", "true", + "Enable l[bh]arx and st[bh]cx.">; +def FeatureInvariantFunctionDescriptors : + SubtargetFeature<"invariant-function-descriptors", + "HasInvariantFunctionDescriptors", "true", + "Assume function descriptors are invariant">; +def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", + "Enable Hardware Transactional Memory instructions">; +def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true", + "Implement mftb using the mfspr instruction">; +def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true", + "Target supports add/load integer fusion.">; +def FeatureFloat128 : + SubtargetFeature<"float128", "HasFloat128", "true", + "Enable the __float128 data type for IEEE-754R Binary128.", + [FeatureVSX]>; + +def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true", + "Treat vector data stream cache control instructions as deprecated">; + +/* Since new processors generally contain a superset of features of those that + came before them, the 
idea is to make implementations of new processors
+ less error-prone and easier to read.
+ Namely:
+ list<SubtargetFeature> Power8FeatureList = ...
+ list<SubtargetFeature> FutureProcessorSpecificFeatureList =
+ [ features that Power8 does not support ]
+ list<SubtargetFeature> FutureProcessorFeatureList =
+ !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList)
+
+ This makes it explicit and obvious what is new in FutureProcessor vs.
+ Power8, and provides a single point of definition if the feature set is
+ used elsewhere.
+*/
+def ProcessorFeatures {
+ list<SubtargetFeature> Power7FeatureList =
+ [DirectivePwr7, FeatureAltivec, FeatureVSX,
+ FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+ FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+ FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+ FeatureFPRND, FeatureFPCVT, FeatureISEL,
+ FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX,
+ Feature64Bit /*, Feature64BitRegs */,
+ FeatureBPERMD, FeatureExtDiv,
+ FeatureMFTB, DeprecatedDST];
+ list<SubtargetFeature> Power8SpecificFeatures =
+ [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto,
+ FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic,
+ FeatureFusion];
+ list<SubtargetFeature> Power8FeatureList =
+ !listconcat(Power7FeatureList, Power8SpecificFeatures);
+}
+
+// Note: Future features to add when support is extended to more
+// recent ISA levels:
+//
+// DFP p6, p6x, p7 decimal floating-point instructions
+// POPCNTB p5 through p7 popcntb and related instructions
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+// RecFormRel - Filter class used to relate non-record-form instructions with
+// their record-form variants.
+class RecFormRel;
+
+// AltVSXFMARel - Filter class used to relate the primary addend-killing VSX
+// FMA instruction forms with their corresponding factor-killing forms.
+class AltVSXFMARel {
+ bit IsVSXFMAAlt = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Relation Map Definitions.
+//===----------------------------------------------------------------------===//
+
+def getRecordFormOpcode : InstrMapping {
+ let FilterClass = "RecFormRel";
+ // Instructions with the same BaseName and Interpretation64Bit values
+ // form a row.
+ let RowFields = ["BaseName", "Interpretation64Bit"];
+ // Instructions with the same RC value form a column.
+ let ColFields = ["RC"];
+ // The key column holds the non-record-form instructions.
+ let KeyCol = ["0"];
+ // Value columns are RC=1
+ let ValueCols = [["1"]];
+}
+
+def getNonRecordFormOpcode : InstrMapping {
+ let FilterClass = "RecFormRel";
+ // Instructions with the same BaseName and Interpretation64Bit values
+ // form a row.
+ let RowFields = ["BaseName", "Interpretation64Bit"];
+ // Instructions with the same RC value form a column.
+ let ColFields = ["RC"];
+ // The key column holds the record-form instructions.
+ let KeyCol = ["1"];
+ // Value columns are RC=0
+ let ValueCols = [["0"]];
+}
+
+def getAltVSXFMAOpcode : InstrMapping {
+ let FilterClass = "AltVSXFMARel";
+ // Instructions with the same BaseName form a row.
+ let RowFields = ["BaseName"];
+ // Instructions with the same IsVSXFMAAlt value form a column.
+ let ColFields = ["IsVSXFMAAlt"];
+ // The key column holds the (default) addend-killing instructions.
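+ // For example (a pairing assumed from the VSX instruction definitions
+ // elsewhere, not from this file): the addend-killing XSMADDADP would map
+ // to its factor-killing variant XSMADDMDP through this table.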
+ let KeyCol = ["0"]; + // Value columns IsVSXFMAAlt=1 + let ValueCols = [["1"]]; +} + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "PPCRegisterInfo.td" +include "PPCSchedule.td" +include "PPCInstrInfo.td" + +//===----------------------------------------------------------------------===// +// PowerPC processors supported. +// + +def : Processor<"generic", G3Itineraries, [Directive32, FeatureMFTB]>; +def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL, + FeatureFRES, FeatureFRSQRTE, + FeatureICBT, FeatureBookE, + FeatureMSYNC, FeatureMFTB]>; +def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL, + FeatureFRES, FeatureFRSQRTE, + FeatureICBT, FeatureBookE, + FeatureMSYNC, FeatureMFTB]>; +def : Processor<"601", G3Itineraries, [Directive601]>; +def : Processor<"602", G3Itineraries, [Directive602, + FeatureMFTB]>; +def : Processor<"603", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"603e", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"603ev", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"604", G3Itineraries, [Directive604, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"604e", G3Itineraries, [Directive604, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"620", G3Itineraries, [Directive620, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"750", G4Itineraries, [Directive750, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"g3", G3Itineraries, [Directive750, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; +def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; + +def : ProcessorModel<"970", G5Model, + [Directive970, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, + FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */, + FeatureMFTB]>; +def : ProcessorModel<"g5", G5Model, + [Directive970, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + FeatureFRES, FeatureFRSQRTE, + Feature64Bit /*, Feature64BitRegs */, + FeatureMFTB, DeprecatedDST]>; +def : ProcessorModel<"e500mc", PPCE500mcModel, + [DirectiveE500mc, + FeatureSTFIWX, FeatureICBT, FeatureBookE, + FeatureISEL, FeatureMFTB]>; +def : ProcessorModel<"e5500", PPCE5500Model, + [DirectiveE5500, FeatureMFOCRF, Feature64Bit, + FeatureSTFIWX, FeatureICBT, FeatureBookE, + FeatureISEL, FeatureMFTB]>; +def : ProcessorModel<"a2", PPCA2Model, + [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, + FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit + /*, Feature64BitRegs */, FeatureMFTB]>; +def : ProcessorModel<"a2q", PPCA2Model, + [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, + FeatureFCPSGN, 
FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit + /*, Feature64BitRegs */, FeatureQPX, FeatureMFTB]>; +def : ProcessorModel<"pwr3", G5Model, + [DirectivePwr3, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, + FeatureSTFIWX, Feature64Bit]>; +def : ProcessorModel<"pwr4", G5Model, + [DirectivePwr4, FeatureAltivec, FeatureMFOCRF, + FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, + FeatureSTFIWX, Feature64Bit, FeatureMFTB]>; +def : ProcessorModel<"pwr5", G5Model, + [DirectivePwr5, FeatureAltivec, FeatureMFOCRF, + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, + FeatureSTFIWX, Feature64Bit, + FeatureMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr5x", G5Model, + [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, + FeatureSTFIWX, FeatureFPRND, Feature64Bit, + FeatureMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr6", G5Model, + [DirectivePwr6, FeatureAltivec, + FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, + FeatureFPRND, Feature64Bit /*, Feature64BitRegs */, + FeatureMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr6x", G5Model, + [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, + FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, + FeatureFPRND, Feature64Bit, + FeatureMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>; +def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>; +def : Processor<"ppc", G3Itineraries, [Directive32, FeatureMFTB]>; +def : ProcessorModel<"ppc64", G5Model, + [Directive64, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureFRES, + FeatureFRSQRTE, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */, + FeatureMFTB]>; +def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.Power8FeatureList>; + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "PPCCallingConv.td" + +def PPCInstrInfo : InstrInfo { + let isLittleEndianEncoding = 1; + + // FIXME: Unset this when no longer needed! + let decodePositionallyEncodedOperands = 1; + + let noNamedPositionallyEncodedOperands = 1; +} + +def PPCAsmParser : AsmParser { + let ShouldEmitMatchRegisterName = 0; +} + +def PPCAsmParserVariant : AsmParserVariant { + int Variant = 0; + + // We do not use hard coded registers in asm strings. However, some + // InstAlias definitions use immediate literals. Set RegisterPrefix + // so that those are not misinterpreted as registers. + string RegisterPrefix = "%"; + string BreakCharacters = "."; +} + +def PPC : Target { + // Information about the instructions. 
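+ // These lets point at the PPCInstrInfo, PPCAsmParser, and
+ // PPCAsmParserVariant records defined above.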
+ let InstructionSet = PPCInstrInfo; + + let AssemblyParsers = [PPCAsmParser]; + let AssemblyParserVariants = [PPCAsmParserVariant]; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp new file mode 100644 index 0000000..9a63c14 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -0,0 +1,1565 @@ +//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to PowerPC assembly language. This printer is +// the output mechanism used by `llc'. +// +// Documentation at http://developer.apple.com/documentation/DeveloperTools/ +// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "InstPrinter/PPCInstPrinter.h" +#include "MCTargetDesc/PPCMCExpr.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "PPCTargetStreamer.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "asmprinter" + +namespace { +class PPCAsmPrinter : public AsmPrinter { +protected: + MapVector<MCSymbol *, MCSymbol *> TOC; + const PPCSubtarget *Subtarget; + StackMaps SM; + +public: + explicit PPCAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} + + const char *getPassName() const override { + return "PowerPC Assembly Printer"; + } + + MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); + + void EmitInstruction(const MachineInstr *MI) override; + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) 
override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + + void EmitEndOfAsmFile(Module &M) override; + + void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI); + void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); + void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); + bool runOnMachineFunction(MachineFunction &MF) override { + Subtarget = &MF.getSubtarget<PPCSubtarget>(); + return AsmPrinter::runOnMachineFunction(MF); + } + }; + + /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux + class PPCLinuxAsmPrinter : public PPCAsmPrinter { + public: + explicit PPCLinuxAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : PPCAsmPrinter(TM, std::move(Streamer)) {} + + const char *getPassName() const override { + return "Linux PPC Assembly Printer"; + } + + bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; + + void EmitFunctionEntryLabel() override; + + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + }; + + /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac + /// OS X + class PPCDarwinAsmPrinter : public PPCAsmPrinter { + public: + explicit PPCDarwinAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : PPCAsmPrinter(TM, std::move(Streamer)) {} + + const char *getPassName() const override { + return "Darwin PPC Assembly Printer"; + } + + bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; + + void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs); + }; +} // end of anonymous namespace + +/// stripRegisterPrefix - This method strips the character prefix from a +/// register name so that only the number is left. Used by for linux asm. +static const char *stripRegisterPrefix(const char *RegName) { + switch (RegName[0]) { + case 'r': + case 'f': + case 'q': // for QPX + case 'v': + if (RegName[1] == 's') + return RegName + 2; + return RegName + 1; + case 'c': if (RegName[1] == 'r') return RegName + 2; + } + + return RegName; +} + +void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const DataLayout &DL = getDataLayout(); + const MachineOperand &MO = MI->getOperand(OpNo); + + switch (MO.getType()) { + case MachineOperand::MO_Register: { + const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg()); + // Linux assembler (Others?) does not take register mnemonics. + // FIXME - What about special registers used in mfspr/mtspr? + if (!Subtarget->isDarwin()) + RegName = stripRegisterPrefix(RegName); + O << RegName; + return; + } + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return; + + case MachineOperand::MO_MachineBasicBlock: + MO.getMBB()->getSymbol()->print(O, MAI); + return; + case MachineOperand::MO_ConstantPoolIndex: + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' + << MO.getIndex(); + return; + case MachineOperand::MO_BlockAddress: + GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); + return; + case MachineOperand::MO_GlobalAddress: { + // Computing the address of a global symbol, not calling it. 
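+ // For example (sketch; the actual symbol prefix comes from
+ // getSymbolWithGlobalValueBase below), a non-static reference to an
+ // external global FOO prints as the indirection L_FOO$non_lazy_ptr.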
+ const GlobalValue *GV = MO.getGlobal(); + MCSymbol *SymToPrint; + + // External or weakly linked global variables need non-lazily-resolved stubs + if (TM.getRelocationModel() != Reloc::Static && + !GV->isStrongDefinitionForLinker()) { + if (!GV->hasHiddenVisibility()) { + SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry( + SymToPrint); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); + } else if (GV->isDeclaration() || GV->hasCommonLinkage() || + GV->hasAvailableExternallyLinkage()) { + SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry( + SymToPrint); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); + } else { + SymToPrint = getSymbol(GV); + } + } else { + SymToPrint = getSymbol(GV); + } + + SymToPrint->print(O, MAI); + + printOffset(MO.getOffset(), O); + return; + } + + default: + O << "<unknown operand type: " << (unsigned)MO.getType() << ">"; + return; + } +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'c': // Don't print "$" before a global var name or constant. + break; // PPC never has a prefix. + case 'L': // Write second word of DImode reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNo).isReg() || + OpNo+1 == MI->getNumOperands() || + !MI->getOperand(OpNo+1).isReg()) + return true; + ++OpNo; // Return the high-part. + break; + case 'I': + // Write 'i' if an integer constant, otherwise nothing. Used to print + // addi vs add, etc. + if (MI->getOperand(OpNo).isImm()) + O << "i"; + return false; + } + } + + printOperand(MI, OpNo, O); + return false; +} + +// At the moment, all inline asm memory operands are a single register. +// In any case, the output of this routine should always be just one +// assembler operand. + +bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'y': // A memory reference for an X-form instruction + { + const char *RegName = "r0"; + if (!Subtarget->isDarwin()) + RegName = stripRegisterPrefix(RegName); + O << RegName << ", "; + printOperand(MI, OpNo, O); + return false; + } + case 'U': // Print 'u' for update form. + case 'X': // Print 'x' for indexed form. + { + // FIXME: Currently for PowerPC memory operands are always loaded + // into a register, so we never get an update or indexed form. + // This is bad even for offset forms, since even if we know we + // have a value in -16(r1), we will generate a load into r<n> + // and then load from 0(r<n>). 
Until that issue is fixed,
+ // tolerate 'U' and 'X' but don't output anything.
+ assert(MI->getOperand(OpNo).isReg());
+ return false;
+ }
+ }
+ }
+
+ assert(MI->getOperand(OpNo).isReg());
+ O << "0(";
+ printOperand(MI, OpNo, O);
+ O << ")";
+ return false;
+}
+
+/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
+/// exists for it. If not, create one. Then return a symbol that references
+/// the TOC entry.
+MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
+ MCSymbol *&TOCEntry = TOC[Sym];
+ if (!TOCEntry)
+ TOCEntry = createTempSymbol("C");
+ return TOCEntry;
+}
+
+void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ SM.serializeToStackMapSection();
+}
+
+void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
+ unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+ SM.recordStackMap(MI);
+ assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+ // Scan ahead to trim the shadow.
+ const MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::const_iterator MII(MI);
+ ++MII;
+ while (NumNOPBytes > 0) {
+ if (MII == MBB.end() || MII->isCall() ||
+ MII->getOpcode() == PPC::DBG_VALUE ||
+ MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MII->getOpcode() == TargetOpcode::STACKMAP)
+ break;
+ ++MII;
+ NumNOPBytes -= 4;
+ }
+
+ // Emit nops.
+ for (unsigned i = 0; i < NumNOPBytes; i += 4)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
+ SM.recordPatchPoint(MI);
+ PatchPointOpers Opers(&MI);
+
+ unsigned EncodedBytes = 0;
+ const MachineOperand &CalleeMO =
+ Opers.getMetaOper(PatchPointOpers::TargetPos);
+
+ if (CalleeMO.isImm()) {
+ int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+ if (CallTarget) {
+ assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+ "High 16 bits of call target should be zero.");
+ unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+ EncodedBytes = 0;
+ // Materialize the jump address:
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 32) & 0xFFFF));
+ ++EncodedBytes;
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(32).addImm(16));
+ ++EncodedBytes;
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 16) & 0xFFFF));
+ ++EncodedBytes;
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(CallTarget & 0xFFFF));
+ ++EncodedBytes;
+
+ // Save the current TOC pointer before the remote call.
+ int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
+ .addReg(PPC::X2)
+ .addImm(TOCSaveOffset)
+ .addReg(PPC::X1));
+ ++EncodedBytes;
+
+ // If we're on ELFv1, then we need to load the actual function pointer
+ // from the function descriptor.
+ if (!Subtarget->isELFv2ABI()) {
+ // Load the new TOC pointer and the function address, but not r11
+ // (needing this is rare, and loading it here would prevent passing it
+ // via a 'nest' parameter.)
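+ // An ELFv1 function descriptor is three doublewords: the entry-point
+ // address at offset 0, the TOC base at offset 8, and an environment
+ // pointer at offset 16; hence the loads from offsets 8 and 0 below.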
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(8) + .addReg(ScratchReg)); + ++EncodedBytes; + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) + .addReg(ScratchReg) + .addImm(0) + .addReg(ScratchReg)); + ++EncodedBytes; + } + + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8) + .addReg(ScratchReg)); + ++EncodedBytes; + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8)); + ++EncodedBytes; + + // Restore the TOC pointer after the call. + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); + ++EncodedBytes; + } + } else if (CalleeMO.isGlobal()) { + const GlobalValue *GValue = CalleeMO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext); + + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP) + .addExpr(SymVar)); + EncodedBytes += 2; + } + + // Each instruction is 4 bytes. + EncodedBytes *= 4; + + // Emit padding. + unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + assert((NumBytes - EncodedBytes) % 4 == 0 && + "Invalid number of NOP bytes requested!"); + for (unsigned i = EncodedBytes; i < NumBytes; i += 4) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); +} + +/// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a +/// call to __tls_get_addr to the current output stream. +void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, + MCSymbolRefExpr::VariantKind VK) { + StringRef Name = "__tls_get_addr"; + MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name); + MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; + + assert(MI->getOperand(0).isReg() && + ((Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::X3) || + (!Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::R3)) && + "GETtls[ld]ADDR[32] must define GPR3"); + assert(MI->getOperand(1).isReg() && + ((Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::X3) || + (!Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::R3)) && + "GETtls[ld]ADDR[32] must read GPR3"); + + if (!Subtarget->isPPC64() && !Subtarget->isDarwin() && + TM.getRelocationModel() == Reloc::PIC_) + Kind = MCSymbolRefExpr::VK_PLT; + const MCSymbolRefExpr *TlsRef = + MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext); + EmitToStreamer(*OutStreamer, + MCInstBuilder(Subtarget->isPPC64() ? + PPC::BL8_NOP_TLS : PPC::BL_TLS) + .addExpr(TlsRef) + .addExpr(SymVar)); +} + +/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to +/// the current output stream. +/// +void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { + MCInst TmpInst; + bool isPPC64 = Subtarget->isPPC64(); + bool isDarwin = TM.getTargetTriple().isOSDarwin(); + const Module *M = MF->getFunction()->getParent(); + PICLevel::Level PL = M->getPICLevel(); + + // Lower multi-instruction pseudo operations. 
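+ // Each pseudo handled below emits its expansion directly and returns;
+ // any opcode not matched falls through to the generic lowering at the
+ // bottom of this function.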
+ switch (MI->getOpcode()) { + default: break; + case TargetOpcode::DBG_VALUE: + llvm_unreachable("Should be handled target independently"); + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(SM, *MI); + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(SM, *MI); + + case PPC::MoveGOTtoLR: { + // Transform %LR = MoveGOTtoLR + // Into this: bl _GLOBAL_OFFSET_TABLE_@local-4 + // _GLOBAL_OFFSET_TABLE_@local-4 (instruction preceding + // _GLOBAL_OFFSET_TABLE_) has exactly one instruction: + // blrl + // This will return the pointer to _GLOBAL_OFFSET_TABLE_@local + MCSymbol *GOTSymbol = + OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + const MCExpr *OffsExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol, + MCSymbolRefExpr::VK_PPC_LOCAL, + OutContext), + MCConstantExpr::create(4, OutContext), + OutContext); + + // Emit the 'bl'. + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL).addExpr(OffsExpr)); + return; + } + case PPC::MovePCtoLR: + case PPC::MovePCtoLR8: { + // Transform %LR = MovePCtoLR + // Into this, where the label is the PIC base: + // bl L1$pb + // L1$pb: + MCSymbol *PICBase = MF->getPICBaseSymbol(); + + // Emit the 'bl'. + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL) + // FIXME: We would like an efficient form for this, so we + // don't have to do a lot of extra uniquing. + .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + + // Emit the label. + OutStreamer->EmitLabel(PICBase); + return; + } + case PPC::UpdateGBR: { + // Transform %Rd = UpdateGBR(%Rt, %Ri) + // Into: lwz %Rt, .L0$poff - .L0$pb(%Ri) + // add %Rd, %Rt, %Ri + // Get the offset from the GOT Base Register to the GOT + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + MCSymbol *PICOffset = + MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol(); + TmpInst.setOpcode(PPC::LWZ); + const MCExpr *Exp = + MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext); + const MCExpr *PB = + MCSymbolRefExpr::create(MF->getPICBaseSymbol(), + MCSymbolRefExpr::VK_None, + OutContext); + const MCOperand TR = TmpInst.getOperand(1); + const MCOperand PICR = TmpInst.getOperand(0); + + // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri) + TmpInst.getOperand(1) = + MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext)); + TmpInst.getOperand(0) = TR; + TmpInst.getOperand(2) = PICR; + EmitToStreamer(*OutStreamer, TmpInst); + + TmpInst.setOpcode(PPC::ADD4); + TmpInst.getOperand(0) = PICR; + TmpInst.getOperand(1) = TR; + TmpInst.getOperand(2) = PICR; + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::LWZtoc: { + // Transform %R3 = LWZtoc <ga:@min1>, %R2 + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + + // Change the opcode to LWZ, and the global address operand to be a + // reference to the GOT entry we will synthesize later. 
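+ // Roughly (assembly sketch): small-model PIC yields "lwz 3, sym@got(30)",
+ // while the large model loads through a synthesized entry, e.g.
+ // "lwz 3, .LC0-.LTOC(30)".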
+ TmpInst.setOpcode(PPC::LWZ); + const MachineOperand &MO = MI->getOperand(1); + + // Map symbol -> label of TOC entry + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = GetJTISymbol(MO.getIndex()); + else if (MO.isBlockAddress()) + MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + + if (PL == PICLevel::Small) { + const MCExpr *Exp = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT, + OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + } else { + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None, + OutContext); + const MCExpr *PB = + MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")), + OutContext); + Exp = MCBinaryExpr::createSub(Exp, PB, OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + } + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::LDtocJTI: + case PPC::LDtocCPT: + case PPC::LDtocBA: + case PPC::LDtoc: { + // Transform %X3 = LDtoc <ga:@min1>, %X2 + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + + // Change the opcode to LD, and the global address operand to be a + // reference to the TOC entry we will synthesize later. + TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); + + // Map symbol -> label of TOC entry + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = GetJTISymbol(MO.getIndex()); + else if (MO.isBlockAddress()) + MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC, + OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + + case PPC::ADDIStocHA: { + // Transform %Xd = ADDIStocHA %X2, <ga:@sym> + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + + // Change the opcode to ADDIS8. If the global address is external, has + // common linkage, is a non-local function address, or is a jump table + // address, then generate a TOC entry and reference that. Otherwise + // reference the symbol directly. 
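+ // Together with a matching ADDItocL or LDtocL this forms the usual
+ // medium-code-model pair, e.g. (sketch):
+ //   addis 3, 2, sym@toc@ha
+ //   addi  3, 3, sym@toc@l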
+ TmpInst.setOpcode(PPC::ADDIS8); + const MachineOperand &MO = MI->getOperand(2); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || + MO.isBlockAddress()) && + "Invalid operand for ADDIStocHA!"); + MCSymbol *MOSymbol = nullptr; + bool GlobalToc = false; + + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG); + } else if (MO.isCPI()) { + MOSymbol = GetCPISymbol(MO.getIndex()); + } else if (MO.isJTI()) { + MOSymbol = GetJTISymbol(MO.getIndex()); + } else if (MO.isBlockAddress()) { + MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + } + + if (GlobalToc || MO.isJTI() || MO.isBlockAddress() || + TM.getCodeModel() == CodeModel::Large) + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA, + OutContext); + TmpInst.getOperand(2) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::LDtocL: { + // Transform %Xd = LDtocL <ga:@sym>, %Xs + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + + // Change the opcode to LD. If the global address is external, has + // common linkage, or is a jump table address, then reference the + // associated TOC entry. Otherwise reference the symbol directly. + TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || + MO.isBlockAddress()) && + "Invalid operand for LDtocL!"); + MCSymbol *MOSymbol = nullptr; + + if (MO.isJTI()) + MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex())); + else if (MO.isBlockAddress()) { + MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } + else if (MO.isCPI()) { + MOSymbol = GetCPISymbol(MO.getIndex()); + if (TM.getCodeModel() == CodeModel::Large) + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } + else if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert((GVFlags & PPCII::MO_NLP_FLAG) && + "LDtocL used on symbol that could be accessed directly is " + "invalid. Must match ADDIStocHA.")); + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } + + const MCExpr *Exp = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, + OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::ADDItocL: { + // Transform %Xd = ADDItocL %Xs, <ga:@sym> + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + + // Change the opcode to ADDI8. If the global address is external, then + // generate a TOC entry and reference that. Otherwise reference the + // symbol directly. 
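+ // This supplies the low 16 bits of the offset begun by ADDIStocHA's
+ // @toc@ha, completing a full 32-bit TOC-relative address.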
+ TmpInst.setOpcode(PPC::ADDI8); + const MachineOperand &MO = MI->getOperand(2); + assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); + MCSymbol *MOSymbol = nullptr; + + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert ( + !(GVFlags & PPCII::MO_NLP_FLAG) && + "Interposable definitions must use indirect access.")); + MOSymbol = getSymbol(GV); + } else if (MO.isCPI()) { + MOSymbol = GetCPISymbol(MO.getIndex()); + } + + const MCExpr *Exp = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, + OutContext); + TmpInst.getOperand(2) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::ADDISgotTprelHA: { + // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha + assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTprel = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA, + OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTprel)); + return; + } + case PPC::LDgotTprelL: + case PPC::LDgotTprelL32: { + // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + + // Change the opcode to LD. + TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ); + const MachineOperand &MO = MI->getOperand(1); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *Exp = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO, + OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + + case PPC::PPC32PICGOT: { + MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + MCSymbol *GOTRef = OutContext.createTempSymbol(); + MCSymbol *NextInstr = OutContext.createTempSymbol(); + + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL) + // FIXME: We would like an efficient form for this, so we don't have to do + // a lot of extra uniquing. 
+ .addExpr(MCSymbolRefExpr::create(NextInstr, OutContext))); + const MCExpr *OffsExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol, OutContext), + MCSymbolRefExpr::create(GOTRef, OutContext), + OutContext); + OutStreamer->EmitLabel(GOTRef); + OutStreamer->EmitValue(OffsExpr, 4); + OutStreamer->EmitLabel(NextInstr); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR) + .addReg(MI->getOperand(0).getReg())); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LWZ) + .addReg(MI->getOperand(1).getReg()) + .addImm(0) + .addReg(MI->getOperand(0).getReg())); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD4) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(MI->getOperand(0).getReg())); + return; + } + case PPC::PPC32GOT: { + MCSymbol *GOTSymbol = + OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + const MCExpr *SymGotTlsL = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext); + const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI) + .addReg(MI->getOperand(0).getReg()) + .addExpr(SymGotTlsL)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()) + .addExpr(SymGotTlsHA)); + return; + } + case PPC::ADDIStlsgdHA: { + // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha + assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsGD = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA, + OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsGD)); + return; + } + case PPC::ADDItlsgdL: + // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym> + // Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l + case PPC::ADDItlsgdL32: { + // Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym> + // Into: %Rd = ADDI %Rs, sym@got@tlsgd + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create( + MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO + : MCSymbolRefExpr::VK_PPC_GOT_TLSGD, + OutContext); + EmitToStreamer(*OutStreamer, + MCInstBuilder(Subtarget->isPPC64() ? 
PPC::ADDI8 : PPC::ADDI) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsGD)); + return; + } + case PPC::GETtlsADDR: + // Transform: %X3 = GETtlsADDR %X3, <ga:@sym> + // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd) + case PPC::GETtlsADDR32: { + // Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym> + // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT + EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD); + return; + } + case PPC::ADDIStlsldHA: { + // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha + assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsLD = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA, + OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsLD)); + return; + } + case PPC::ADDItlsldL: + // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym> + // Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l + case PPC::ADDItlsldL32: { + // Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym> + // Into: %Rd = ADDI %Rs, sym@got@tlsld + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create( + MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO + : MCSymbolRefExpr::VK_PPC_GOT_TLSLD, + OutContext); + EmitToStreamer(*OutStreamer, + MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsLD)); + return; + } + case PPC::GETtlsldADDR: + // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym> + // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld) + case PPC::GETtlsldADDR32: { + // Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym> + // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT + EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD); + return; + } + case PPC::ADDISdtprelHA: + // Transform: %Xd = ADDISdtprelHA %Xs, <ga:@sym> + // Into: %Xd = ADDIS8 %Xs, sym@dtprel@ha + case PPC::ADDISdtprelHA32: { + // Transform: %Rd = ADDISdtprelHA32 %Rs, <ga:@sym> + // Into: %Rd = ADDIS %Rs, sym@dtprel@ha + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymDtprel = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA, + OutContext); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymDtprel)); + return; + } + case PPC::ADDIdtprelL: + // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym> + // Into: %Xd = ADDI8 %Xs, sym@dtprel@l + case PPC::ADDIdtprelL32: { + // Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym> + // Into: %Rd = ADDI %Rs, sym@dtprel@l + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymDtprel = + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO, + OutContext); + EmitToStreamer(*OutStreamer, + MCInstBuilder(Subtarget->isPPC64() ? 
PPC::ADDI8 : PPC::ADDI) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymDtprel)); + return; + } + case PPC::MFOCRF: + case PPC::MFOCRF8: + if (!Subtarget->hasMFOCRF()) { + // Transform: %R3 = MFOCRF %CR7 + // Into: %R3 = MFCR ;; cr7 + unsigned NewOpcode = + MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8; + OutStreamer->AddComment(PPCInstPrinter:: + getRegisterName(MI->getOperand(1).getReg())); + EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode) + .addReg(MI->getOperand(0).getReg())); + return; + } + break; + case PPC::MTOCRF: + case PPC::MTOCRF8: + if (!Subtarget->hasMFOCRF()) { + // Transform: %CR7 = MTOCRF %R3 + // Into: MTCRF mask, %R3 ;; cr7 + unsigned NewOpcode = + MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8; + unsigned Mask = 0x80 >> OutContext.getRegisterInfo() + ->getEncodingValue(MI->getOperand(0).getReg()); + OutStreamer->AddComment(PPCInstPrinter:: + getRegisterName(MI->getOperand(0).getReg())); + EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode) + .addImm(Mask) + .addReg(MI->getOperand(1).getReg())); + return; + } + break; + case PPC::LD: + case PPC::STD: + case PPC::LWA_32: + case PPC::LWA: { + // Verify alignment is legal, so we don't create relocations + // that can't be supported. + // FIXME: This test is currently disabled for Darwin. The test + // suite shows a handful of test cases that fail this check for + // Darwin. Those need to be investigated before this sanity test + // can be enabled for those subtargets. + if (!Subtarget->isDarwin()) { + unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; + const MachineOperand &MO = MI->getOperand(OpNum); + if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4) + llvm_unreachable("Global must be word-aligned for LD, STD, LWA!"); + } + // Now process the instruction normally. + break; + } + } + + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + EmitToStreamer(*OutStreamer, TmpInst); +} + +void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) { + PPCTargetStreamer *TS = + static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); + + if (TS) + TS->emitAbiVersion(2); + } + + if (static_cast<const PPCTargetMachine &>(TM).isPPC64() || + TM.getRelocationModel() != Reloc::PIC_) + return AsmPrinter::EmitStartOfAsmFile(M); + + if (M.getPICLevel() == PICLevel::Small) + return AsmPrinter::EmitStartOfAsmFile(M); + + OutStreamer->SwitchSection(OutContext.getELFSection( + ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC)); + + MCSymbol *TOCSym = OutContext.getOrCreateSymbol(Twine(".LTOC")); + MCSymbol *CurrentPos = OutContext.createTempSymbol(); + + OutStreamer->EmitLabel(CurrentPos); + + // The GOT pointer points to the middle of the GOT, in order to reference the + // entire 64kB range. 0x8000 is the midpoint. + const MCExpr *tocExpr = + MCBinaryExpr::createAdd(MCSymbolRefExpr::create(CurrentPos, OutContext), + MCConstantExpr::create(0x8000, OutContext), + OutContext); + + OutStreamer->EmitAssignment(TOCSym, tocExpr); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); +} + +void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { + // linux/ppc32 - Normal entry label. 
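+ // For ppc32 PIC with a PIC base, a word holding .LTOC minus the picbase
+ // is emitted just ahead of the entry label; ELFv1 instead emits an .opd
+ // procedure descriptor (both handled below).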
+ if (!Subtarget->isPPC64() && + (TM.getRelocationModel() != Reloc::PIC_ || + MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small)) + return AsmPrinter::EmitFunctionEntryLabel(); + + if (!Subtarget->isPPC64()) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); + if (PPCFI->usesPICBase()) { + MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol(); + MCSymbol *PICBase = MF->getPICBaseSymbol(); + OutStreamer->EmitLabel(RelocSymbol); + + const MCExpr *OffsExpr = + MCBinaryExpr::createSub( + MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")), + OutContext), + MCSymbolRefExpr::create(PICBase, OutContext), + OutContext); + OutStreamer->EmitValue(OffsExpr, 4); + OutStreamer->EmitLabel(CurrentFnSym); + return; + } else + return AsmPrinter::EmitFunctionEntryLabel(); + } + + // ELFv2 ABI - Normal entry label. + if (Subtarget->isELFv2ABI()) + return AsmPrinter::EmitFunctionEntryLabel(); + + // Emit an official procedure descriptor. + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + MCSectionELF *Section = OutStreamer->getContext().getELFSection( + ".opd", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + OutStreamer->SwitchSection(Section); + OutStreamer->EmitLabel(CurrentFnSym); + OutStreamer->EmitValueToAlignment(8); + MCSymbol *Symbol1 = CurrentFnSymForSize; + // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function + // entry point. + OutStreamer->EmitValue(MCSymbolRefExpr::create(Symbol1, OutContext), + 8 /*size*/); + MCSymbol *Symbol2 = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + // Generates a R_PPC64_TOC relocation for TOC base insertion. + OutStreamer->EmitValue( + MCSymbolRefExpr::create(Symbol2, MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext), + 8/*size*/); + // Emit a null environment pointer. + OutStreamer->EmitIntValue(0, 8 /* size */); + OutStreamer->SwitchSection(Current.first, Current.second); +} + +bool PPCLinuxAsmPrinter::doFinalization(Module &M) { + const DataLayout &DL = getDataLayout(); + + bool isPPC64 = DL.getPointerSizeInBits() == 64; + + PPCTargetStreamer &TS = + static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer()); + + if (!TOC.empty()) { + MCSectionELF *Section; + + if (isPPC64) + Section = OutStreamer->getContext().getELFSection( + ".toc", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + else + Section = OutStreamer->getContext().getELFSection( + ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + OutStreamer->SwitchSection(Section); + + for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(), + E = TOC.end(); I != E; ++I) { + OutStreamer->EmitLabel(I->second); + MCSymbol *S = I->first; + if (isPPC64) + TS.emitTCEntry(*S); + else + OutStreamer->EmitSymbolValue(S, 4); + } + } + + return AsmPrinter::doFinalization(M); +} + +/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2. +void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { + // In the ELFv2 ABI, in functions that use the TOC register, we need to + // provide two entry points. The ABI guarantees that when calling the + // local entry point, r2 is set up by the caller to contain the TOC base + // for this function, and when calling the global entry point, r12 is set + // up by the caller to hold the address of the global entry point. 
We + // thus emit a prefix sequence along the following lines: + // + // func: + // # global entry point + // addis r2,r12,(.TOC.-func)@ha + // addi r2,r2,(.TOC.-func)@l + // .localentry func, .-func + // # local entry point, followed by function body + // + // This ensures we have r2 set up correctly while executing the function + // body, no matter which entry point is called. + if (Subtarget->isELFv2ABI() + // Only do all that if the function uses r2 in the first place. + && !MF->getRegInfo().use_empty(PPC::X2)) { + + MCSymbol *GlobalEntryLabel = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(GlobalEntryLabel); + const MCSymbolRefExpr *GlobalEntryLabelExp = + MCSymbolRefExpr::create(GlobalEntryLabel, OutContext); + + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + GlobalEntryLabelExp, OutContext); + + const MCExpr *TOCDeltaHi = + PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::X2) + .addReg(PPC::X12) + .addExpr(TOCDeltaHi)); + + const MCExpr *TOCDeltaLo = + PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addExpr(TOCDeltaLo)); + + MCSymbol *LocalEntryLabel = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(LocalEntryLabel); + const MCSymbolRefExpr *LocalEntryLabelExp = + MCSymbolRefExpr::create(LocalEntryLabel, OutContext); + const MCExpr *LocalOffsetExp = + MCBinaryExpr::createSub(LocalEntryLabelExp, + GlobalEntryLabelExp, OutContext); + + PPCTargetStreamer *TS = + static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); + + if (TS) + TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp); + } +} + +/// EmitFunctionBodyEnd - Print the traceback table before the .size +/// directive. +/// +void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() { + // Only the 64-bit target requires a traceback table. For now, + // we only emit the word of zeroes that GDB requires to find + // the end of the function, and zeroes for the eight-byte + // mandatory fields. + // FIXME: We should fill in the eight-byte mandatory fields as described in + // the PPC64 ELF ABI (this is a low-priority item because GDB does not + // currently make use of these fields). + if (Subtarget->isPPC64()) { + OutStreamer->EmitIntValue(0, 4/*size*/); + OutStreamer->EmitIntValue(0, 8/*size*/); + } +} + +void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { + static const char *const CPUDirectives[] = { + "", + "ppc", + "ppc440", + "ppc601", + "ppc602", + "ppc603", + "ppc7400", + "ppc750", + "ppc970", + "ppcA2", + "ppce500mc", + "ppce5500", + "power3", + "power4", + "power5", + "power5x", + "power6", + "power6x", + "power7", + "ppc64", + "ppc64le" + }; + + // Get the numerically largest directive. + // FIXME: How should we merge darwin directives? + unsigned Directive = PPC::DIR_NONE; + for (const Function &F : M) { + const PPCSubtarget &STI = TM.getSubtarget<PPCSubtarget>(F); + unsigned FDir = STI.getDarwinDirective(); + Directive = Directive > FDir ? 
Directive : FDir;
+    if (STI.hasMFOCRF() && Directive < PPC::DIR_970)
+      Directive = PPC::DIR_970;
+    if (STI.hasAltivec() && Directive < PPC::DIR_7400)
+      Directive = PPC::DIR_7400;
+    if (STI.isPPC64() && Directive < PPC::DIR_64)
+      Directive = PPC::DIR_64;
+  }
+
+  assert(Directive <= PPC::DIR_64 && "Directive out of range.");
+
+  assert(Directive < array_lengthof(CPUDirectives) &&
+         "CPUDirectives[] might not be up-to-date!");
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+  TStreamer.emitMachine(CPUDirectives[Directive]);
+
+  // Prime text sections so they are adjacent. This reduces the likelihood
+  // that a large data or debug section causes a branch to exceed the 16M
+  // limit.
+  const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection());
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    OutStreamer->SwitchSection(
+        OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
+                                   MachO::S_SYMBOL_STUBS |
+                                   MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   32, SectionKind::getText()));
+  } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
+    OutStreamer->SwitchSection(
+        OutContext.getMachOSection("__TEXT", "__symbol_stub1",
+                                   MachO::S_SYMBOL_STUBS |
+                                   MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   16, SectionKind::getText()));
+  }
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+}
+
+static MCSymbol *GetLazyPtr(MCSymbol *Sym, MCContext &Ctx) {
+  // Remove the $stub suffix, add $lazy_ptr.
+  StringRef NoStub = Sym->getName().substr(0, Sym->getName().size() - 5);
+  return Ctx.getOrCreateSymbol(NoStub + "$lazy_ptr");
+}
+
+static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
+  // Add a $tmp suffix to $stub, yielding $stub$tmp.
+  return Ctx.getOrCreateSymbol(Sym->getName() + "$tmp");
+}
+
+void PPCDarwinAsmPrinter::
+EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
+  bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
+
+  // Construct a local MCSubtargetInfo and shadow EmitToStreamer here.
+  // This is because the MachineFunction no longer exists at this point
+  // (though it may not yet have been freed), and since we're at the global
+  // level we can use the default-constructed subtarget.
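The directive merge earlier in this file is a running maximum over the per-function CPU directives, with floors imposed by certain features. A hedged sketch of that policy in isolation; the enum values and the mergeDirectives helper are illustrative stand-ins, not LLVM's actual PPC::DIR_* encoding:

#include <algorithm>
#include <cstdio>
#include <vector>

enum Directive { DIR_NONE = 0, DIR_7400 = 1, DIR_970 = 2, DIR_64 = 3 };

struct FuncInfo {
  Directive Dir;
  bool HasMFOCRF, HasAltivec, IsPPC64;
};

// Running maximum over per-function directives, with feature floors.
Directive mergeDirectives(const std::vector<FuncInfo> &Funcs) {
  Directive D = DIR_NONE;
  for (const FuncInfo &F : Funcs) {
    D = std::max(D, F.Dir);
    if (F.HasMFOCRF)
      D = std::max(D, DIR_970);
    if (F.HasAltivec)
      D = std::max(D, DIR_7400);
    if (F.IsPPC64)
      D = std::max(D, DIR_64);
  }
  return D;
}

int main() {
  std::vector<FuncInfo> Funcs = {{DIR_NONE, false, true, false},
                                 {DIR_970, false, false, false}};
  std::printf("merged: %d\n", mergeDirectives(Funcs)); // 2, i.e. DIR_970
  return 0;
}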
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); + auto EmitToStreamer = [&STI] (MCStreamer &S, const MCInst &Inst) { + S.EmitInstruction(Inst, *STI); + }; + + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + + // .lazy_symbol_pointer + MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection(); + + // Output stubs for dynamically-linked functions + if (TM.getRelocationModel() == Reloc::PIC_) { + MCSection *StubSection = OutContext.getMachOSection( + "__TEXT", "__picsymbolstub1", + MachO::S_SYMBOL_STUBS | MachO::S_ATTR_PURE_INSTRUCTIONS, 32, + SectionKind::getText()); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer->SwitchSection(StubSection); + EmitAlignment(4); + + MCSymbol *Stub = Stubs[i].first; + MCSymbol *RawSym = Stubs[i].second.getPointer(); + MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext); + MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext); + + OutStreamer->EmitLabel(Stub); + OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + const MCExpr *Anon = MCSymbolRefExpr::create(AnonSymbol, OutContext); + const MCExpr *LazyPtrExpr = MCSymbolRefExpr::create(LazyPtr, OutContext); + const MCExpr *Sub = + MCBinaryExpr::createSub(LazyPtrExpr, Anon, OutContext); + + // mflr r0 + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R0)); + // bcl 20, 31, AnonSymbol + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCLalways).addExpr(Anon)); + OutStreamer->EmitLabel(AnonSymbol); + // mflr r11 + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11)); + // addis r11, r11, ha16(LazyPtr - AnonSymbol) + const MCExpr *SubHa16 = PPCMCExpr::createHa(Sub, true, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::R11) + .addReg(PPC::R11) + .addExpr(SubHa16)); + // mtlr r0 + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR).addReg(PPC::R0)); + + // ldu r12, lo16(LazyPtr - AnonSymbol)(r11) + // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11) + const MCExpr *SubLo16 = PPCMCExpr::createLo(Sub, true, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(isPPC64 ? 
PPC::LDU : PPC::LWZU) + .addReg(PPC::R12) + .addExpr(SubLo16).addExpr(SubLo16) + .addReg(PPC::R11)); + // mtctr r12 + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12)); + // bctr + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTR)); + + OutStreamer->SwitchSection(LSPSection); + OutStreamer->EmitLabel(LazyPtr); + OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + MCSymbol *DyldStubBindingHelper = + OutContext.getOrCreateSymbol(StringRef("dyld_stub_binding_helper")); + if (isPPC64) { + // .quad dyld_stub_binding_helper + OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 8); + } else { + // .long dyld_stub_binding_helper + OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4); + } + } + OutStreamer->AddBlankLine(); + return; + } + + MCSection *StubSection = OutContext.getMachOSection( + "__TEXT", "__symbol_stub1", + MachO::S_SYMBOL_STUBS | MachO::S_ATTR_PURE_INSTRUCTIONS, 16, + SectionKind::getText()); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + MCSymbol *Stub = Stubs[i].first; + MCSymbol *RawSym = Stubs[i].second.getPointer(); + MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext); + const MCExpr *LazyPtrExpr = MCSymbolRefExpr::create(LazyPtr, OutContext); + + OutStreamer->SwitchSection(StubSection); + EmitAlignment(4); + OutStreamer->EmitLabel(Stub); + OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + // lis r11, ha16(LazyPtr) + const MCExpr *LazyPtrHa16 = + PPCMCExpr::createHa(LazyPtrExpr, true, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LIS) + .addReg(PPC::R11) + .addExpr(LazyPtrHa16)); + + // ldu r12, lo16(LazyPtr)(r11) + // lwzu r12, lo16(LazyPtr)(r11) + const MCExpr *LazyPtrLo16 = + PPCMCExpr::createLo(LazyPtrExpr, true, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU) + .addReg(PPC::R12) + .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16) + .addReg(PPC::R11)); + + // mtctr r12 + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12)); + // bctr + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTR)); + + OutStreamer->SwitchSection(LSPSection); + OutStreamer->EmitLabel(LazyPtr); + OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + MCSymbol *DyldStubBindingHelper = + OutContext.getOrCreateSymbol(StringRef("dyld_stub_binding_helper")); + if (isPPC64) { + // .quad dyld_stub_binding_helper + OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 8); + } else { + // .long dyld_stub_binding_helper + OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4); + } + } + + OutStreamer->AddBlankLine(); +} + +bool PPCDarwinAsmPrinter::doFinalization(Module &M) { + bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; + + // Darwin/PPC always uses mach-o. + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList(); + if (!Stubs.empty()) + EmitFunctionStubs(Stubs); + + if (MAI->doesSupportExceptionHandling() && MMI) { + // Add the (possibly multiple) personalities to the set of global values. + // Only referenced functions get into the Personalities list. 
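The ha16()/lo16() pairs used in the stubs above split a 32-bit value so that an addis of the high half followed by a sign-extending 16-bit displacement reconstructs it exactly; the high half absorbs a +0x8000 adjustment to compensate for the sign extension. A standalone round-trip check in plain C++, independent of PPCMCExpr:

#include <cassert>
#include <cstdint>

// High-adjusted half: biased so that adding the sign-extended low half
// (as addi/lwzu do) lands back on the original value.
static uint32_t ha16(uint32_t V) { return ((V + 0x8000) >> 16) & 0xffff; }
static uint32_t lo16(uint32_t V) { return V & 0xffff; }

int main() {
  for (uint64_t v = 0; v <= 0xffffffff; v += 0x1234) {
    uint32_t V = (uint32_t)v;
    int32_t Lo = (int16_t)lo16(V);            // sign-extended low half
    uint32_t Rebuilt = (ha16(V) << 16) + (uint32_t)Lo;
    assert(Rebuilt == V);                     // addis + addi round-trips
  }
  return 0;
}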
+ for (const Function *Personality : MMI->getPersonalities()) { + if (Personality) { + MCSymbol *NLPSym = + getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMIMacho.getGVStubEntry(NLPSym); + StubSym = + MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true); + } + } + } + + // Output stubs for dynamically-linked functions. + Stubs = MMIMacho.GetGVStubList(); + + // Output macho stubs for external and common global variables. + if (!Stubs.empty()) { + // Switch with ".non_lazy_symbol_pointer" directive. + OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + EmitAlignment(isPPC64 ? 3 : 2); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer->EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; + OutStreamer->EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer->EmitIntValue(0, isPPC64 ? 8 : 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. + OutStreamer->EmitValue(MCSymbolRefExpr::create(MCSym.getPointer(), + OutContext), + isPPC64 ? 8 : 4/*size*/); + } + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + OutStreamer->SwitchSection(getObjFileLowering().getDataSection()); + EmitAlignment(isPPC64 ? 3 : 2); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer->EmitLabel(Stubs[i].first); + // .long _foo + OutStreamer->EmitValue(MCSymbolRefExpr:: + create(Stubs[i].second.getPointer(), + OutContext), + isPPC64 ? 8 : 4/*size*/); + } + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never generates + // code that does this, it is always safe to set. + OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + + return AsmPrinter::doFinalization(M); +} + +/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code +/// for a MachineFunction to the given output stream, in a format that the +/// Darwin assembler can deal with. +/// +static AsmPrinter * +createPPCAsmPrinterPass(TargetMachine &tm, + std::unique_ptr<MCStreamer> &&Streamer) { + if (tm.getTargetTriple().isMacOSX()) + return new PPCDarwinAsmPrinter(tm, std::move(Streamer)); + return new PPCLinuxAsmPrinter(tm, std::move(Streamer)); +} + +// Force static initialization. 
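The non-lazy-pointer loops above emit one of two slot flavors, selected by the StubValueTy flag. A small sketch of just that decision; NonLazyPtr and makeNLP are hypothetical names invented for illustration:

#include <cassert>
#include <cstdint>
#include <string>

struct NonLazyPtr {
  std::string IndirectSym; // the .indirect_symbol operand
  uint64_t Value;          // the .long/.quad payload
};

// External symbols get a zero slot for dyld to bind at load time;
// locally defined symbols are resolved statically to their own address.
static NonLazyPtr makeNLP(const std::string &Sym, bool IsExternal,
                          uint64_t LocalAddr) {
  return IsExternal ? NonLazyPtr{Sym, 0} : NonLazyPtr{Sym, LocalAddr};
}

int main() {
  assert(makeNLP("_foo", true, 0x1000).Value == 0);       // dyld fills it
  assert(makeNLP("_bar", false, 0x2000).Value == 0x2000); // static value
  return 0;
}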
+extern "C" void LLVMInitializePowerPCAsmPrinter() {
+  TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
new file mode 100644
index 0000000..7920240
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -0,0 +1,253 @@
+//===- PPCBoolRetToInt.cpp - Convert bool literals to i32 if they are returned ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements converting i1 values to i32 if they could be more
+// profitably allocated as GPRs rather than CRs. This pass will become totally
+// unnecessary if Register Bank Allocation and Global Instruction Selection
+// ever go upstream.
+//
+// Presently, the pass converts i1 Constants and Arguments to i32 if the
+// transitive closure of their uses includes only PHINodes, CallInsts, and
+// ReturnInsts. The rationale is that arguments are generally passed and
+// returned in GPRs rather than CRs, so casting them to i32 at the LLVM IR
+// level will actually save casts at the Machine Instruction level.
+//
+// It might be useful to expand this pass to add bit-wise operations to the
+// list of safe transitive closure types. Also, we miss some opportunities
+// when LLVM represents logical AND and OR operations with control flow rather
+// than data flow. For example, by lowering the expression: return (A && B && C)
+//
+// as: return A ? true : B && C.
+//
+// There's code in SimplifyCFG that could be used to turn control flow into
+// data flow using SelectInsts. Selects are slow on some architectures
+// (P7/P8), so this probably isn't good in general, but for the special case
+// of i1, the Selects could be further lowered to bit operations that are
+// fast everywhere.
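The transitive closure mentioned above is computed by findAllDefs further down with a standard worklist. Stripped of LLVM's Value/User types, the shape is just this, with integer node ids standing in for Values:

#include <cstdio>
#include <set>
#include <vector>

// Same shape as findAllDefs: seed a worklist, then pull in the operands of
// everything reached until nothing new appears.
static std::set<int> findAllDefs(const std::vector<std::vector<int>> &Operands,
                                 int Seed) {
  std::set<int> Defs{Seed};
  std::vector<int> WorkList{Seed};
  while (!WorkList.empty()) {
    int Curr = WorkList.back();
    WorkList.pop_back();
    for (int Op : Operands[Curr])
      if (Defs.insert(Op).second) // newly discovered: visit its operands too
        WorkList.push_back(Op);
  }
  return Defs;
}

int main() {
  // Value 0 uses 1 and 2; value 2 uses 3.
  std::vector<std::vector<int>> Ops = {{1, 2}, {}, {3}, {}};
  std::printf("%zu values reached\n", findAllDefs(Ops, 0).size()); // 4
  return 0;
}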
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +#define DEBUG_TYPE "bool-ret-to-int" + +STATISTIC(NumBoolRetPromotion, + "Number of times a bool feeding a RetInst was promoted to an int"); +STATISTIC(NumBoolCallPromotion, + "Number of times a bool feeding a CallInst was promoted to an int"); +STATISTIC(NumBoolToIntPromotion, + "Total number of times a bool was promoted to an int"); + +class PPCBoolRetToInt : public FunctionPass { + + static SmallPtrSet<Value *, 8> findAllDefs(Value *V) { + SmallPtrSet<Value *, 8> Defs; + SmallVector<Value *, 8> WorkList; + WorkList.push_back(V); + Defs.insert(V); + while (!WorkList.empty()) { + Value *Curr = WorkList.back(); + WorkList.pop_back(); + if (User *CurrUser = dyn_cast<User>(Curr)) + for (auto &Op : CurrUser->operands()) + if (Defs.insert(Op).second) + WorkList.push_back(Op); + } + return Defs; + } + + // Translate a i1 value to an equivalent i32 value: + static Value *translate(Value *V) { + Type *Int32Ty = Type::getInt32Ty(V->getContext()); + if (Constant *C = dyn_cast<Constant>(V)) + return ConstantExpr::getZExt(C, Int32Ty); + if (PHINode *P = dyn_cast<PHINode>(V)) { + // Temporarily set the operands to 0. We'll fix this later in + // runOnUse. + Value *Zero = Constant::getNullValue(Int32Ty); + PHINode *Q = + PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P); + for (unsigned i = 0; i < P->getNumOperands(); ++i) + Q->addIncoming(Zero, P->getIncomingBlock(i)); + return Q; + } + + Argument *A = dyn_cast<Argument>(V); + Instruction *I = dyn_cast<Instruction>(V); + assert((A || I) && "Unknown value type"); + + auto InstPt = + A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode(); + return new ZExtInst(V, Int32Ty, "", InstPt); + } + + typedef SmallPtrSet<const PHINode *, 8> PHINodeSet; + + // A PHINode is Promotable if: + // 1. Its type is i1 AND + // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic + // AND + // 3. All of its operands are Constant or Argument or + // CallInst or PHINode AND + // 4. All of its PHINode uses are Promotable AND + // 5. 
All of its PHINode operands are Promotable + static PHINodeSet getPromotablePHINodes(const Function &F) { + PHINodeSet Promotable; + // Condition 1 + for (auto &BB : F) + for (auto &I : BB) + if (const PHINode *P = dyn_cast<PHINode>(&I)) + if (P->getType()->isIntegerTy(1)) + Promotable.insert(P); + + SmallVector<const PHINode *, 8> ToRemove; + for (const auto &P : Promotable) { + // Condition 2 and 3 + auto IsValidUser = [] (const Value *V) -> bool { + return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) || + isa<DbgInfoIntrinsic>(V); + }; + auto IsValidOperand = [] (const Value *V) -> bool { + return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) || + isa<PHINode>(V); + }; + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsValidUser) || + !std::all_of(Operands.begin(), Operands.end(), IsValidOperand)) + ToRemove.push_back(P); + } + + // Iterate to convergence + auto IsPromotable = [&Promotable] (const Value *V) -> bool { + const PHINode *Phi = dyn_cast<PHINode>(V); + return !Phi || Promotable.count(Phi); + }; + while (!ToRemove.empty()) { + for (auto &User : ToRemove) + Promotable.erase(User); + ToRemove.clear(); + + for (const auto &P : Promotable) { + // Condition 4 and 5 + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsPromotable) || + !std::all_of(Operands.begin(), Operands.end(), IsPromotable)) + ToRemove.push_back(P); + } + } + + return Promotable; + } + + typedef DenseMap<Value *, Value *> B2IMap; + + public: + static char ID; + PPCBoolRetToInt() : FunctionPass(ID) { + initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) { + PHINodeSet PromotablePHINodes = getPromotablePHINodes(F); + B2IMap Bool2IntMap; + bool Changed = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (ReturnInst *R = dyn_cast<ReturnInst>(&I)) + if (F.getReturnType()->isIntegerTy(1)) + Changed |= + runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap); + + if (CallInst *CI = dyn_cast<CallInst>(&I)) + for (auto &U : CI->operands()) + if (U->getType()->isIntegerTy(1)) + Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap); + } + } + + return Changed; + } + + static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, + B2IMap &BoolToIntMap) { + auto Defs = findAllDefs(U); + + // If the values are all Constants or Arguments, don't bother + if (!std::any_of(Defs.begin(), Defs.end(), isa<Instruction, Value *>)) + return false; + + // Presently, we only know how to handle PHINode, Constant, and Arguments. + // Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension + // could also be handled in the future. + for (const auto &V : Defs) + if (!isa<PHINode>(V) && !isa<Constant>(V) && !isa<Argument>(V)) + return false; + + for (const auto &V : Defs) + if (const PHINode *P = dyn_cast<PHINode>(V)) + if (!PromotablePHINodes.count(P)) + return false; + + if (isa<ReturnInst>(U.getUser())) + ++NumBoolRetPromotion; + if (isa<CallInst>(U.getUser())) + ++NumBoolCallPromotion; + ++NumBoolToIntPromotion; + + for (const auto &V : Defs) + if (!BoolToIntMap.count(V)) + BoolToIntMap[V] = translate(V); + + // Replace the operands of the translated instructions. There were set to + // zero in the translate function. 
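The net effect of the promotion that runOnUse performs, in plain C++ terms: the i1 is carried as an i32 (zext) and truncated back to i1 at the consumer, which is exactly what the operand fix-up loop below wires together. A minimal sketch of the round trip:

#include <cassert>
#include <cstdint>

int main() {
  bool Flag = true;
  uint32_t Promoted = (uint32_t)Flag;     // zext i1 -> i32 (translate)
  bool BackToBool = (Promoted & 1) != 0;  // trunc i32 -> i1 (runOnUse)
  assert(BackToBool == Flag);
  return 0;
}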
+ for (auto &Pair : BoolToIntMap) { + User *First = dyn_cast<User>(Pair.first); + User *Second = dyn_cast<User>(Pair.second); + assert((!First || Second) && "translated from user to non-user!?"); + if (First) + for (unsigned i = 0; i < First->getNumOperands(); ++i) + Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); + } + + Value *IntRetVal = BoolToIntMap[U]; + Type *Int1Ty = Type::getInt1Ty(U->getContext()); + Instruction *I = cast<Instruction>(U.getUser()); + Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I); + U.set(BackToBool); + + return true; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } +}; +} + +char PPCBoolRetToInt::ID = 0; +INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", + "Convert i1 constants to i32 if they are returned", + false, false) + +FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp new file mode 100644 index 0000000..73a5305 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -0,0 +1,237 @@ +//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that scans a machine function to determine which +// conditional branches need more than 16 bits of displacement to reach their +// target basic block. It does this in two passes; a calculation of basic block +// positions pass, and a branch pseudo op to machine branch opcode pass. This +// pass should be run last, just before the assembly printer. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "ppc-branch-select" + +STATISTIC(NumExpanded, "Number of branches expanded to long format"); + +namespace llvm { + void initializePPCBSelPass(PassRegistry&); +} + +namespace { + struct PPCBSel : public MachineFunctionPass { + static char ID; + PPCBSel() : MachineFunctionPass(ID) { + initializePPCBSelPass(*PassRegistry::getPassRegistry()); + } + + /// BlockSizes - The sizes of the basic blocks in the function. + std::vector<unsigned> BlockSizes; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "PowerPC Branch Selector"; + } + }; + char PPCBSel::ID = 0; +} + +INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector", + false, false) + +/// createPPCBranchSelectionPass - returns an instance of the Branch Selection +/// Pass +/// +FunctionPass *llvm::createPPCBranchSelectionPass() { + return new PPCBSel(); +} + +bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { + const PPCInstrInfo *TII = + static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo()); + // Give the blocks of the function a dense, in-order, numbering. 
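Before the size computation below: the nop padding that GetAlignmentAdjustment accounts for is the usual offset-to-alignment quantity. A simplified sketch of that helper's core; OffsetToAlignment is the real LLVM utility it calls:

#include <cassert>
#include <cstdint>

// Bytes of padding needed to raise Offset to a multiple of Align
// (a power of two): the amount charged to the previous block here.
static uint64_t offsetToAlignment(uint64_t Offset, uint64_t Align) {
  return (Align - (Offset % Align)) % Align;
}

int main() {
  assert(offsetToAlignment(0x1004, 16) == 12); // three 4-byte nops
  assert(offsetToAlignment(0x1000, 16) == 0);  // already aligned
  return 0;
}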
+ Fn.RenumberBlocks(); + BlockSizes.resize(Fn.getNumBlockIDs()); + + auto GetAlignmentAdjustment = + [TII](MachineBasicBlock &MBB, unsigned Offset) -> unsigned { + unsigned Align = MBB.getAlignment(); + if (!Align) + return 0; + + unsigned AlignAmt = 1 << Align; + unsigned ParentAlign = MBB.getParent()->getAlignment(); + + if (Align <= ParentAlign) + return OffsetToAlignment(Offset, AlignAmt); + + // The alignment of this MBB is larger than the function's alignment, so we + // can't tell whether or not it will insert nops. Assume that it will. + return AlignAmt + OffsetToAlignment(Offset, AlignAmt); + }; + + // Measure each MBB and compute a size for the entire function. + unsigned FuncSize = 0; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock *MBB = &*MFI; + + // The end of the previous block may have extra nops if this block has an + // alignment requirement. + if (MBB->getNumber() > 0) { + unsigned AlignExtra = GetAlignmentAdjustment(*MBB, FuncSize); + BlockSizes[MBB->getNumber()-1] += AlignExtra; + FuncSize += AlignExtra; + } + + unsigned BlockSize = 0; + for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); + MBBI != EE; ++MBBI) + BlockSize += TII->GetInstSizeInBytes(MBBI); + + BlockSizes[MBB->getNumber()] = BlockSize; + FuncSize += BlockSize; + } + + // If the entire function is smaller than the displacement of a branch field, + // we know we don't need to shrink any branches in this function. This is a + // common case. + if (FuncSize < (1 << 15)) { + BlockSizes.clear(); + return false; + } + + // For each conditional branch, if the offset to its destination is larger + // than the offset field allows, transform it into a long branch sequence + // like this: + // short branch: + // bCC MBB + // long branch: + // b!CC $PC+8 + // b MBB + // + bool MadeChange = true; + bool EverMadeChange = false; + while (MadeChange) { + // Iteratively expand branches until we reach a fixed point. + MadeChange = false; + + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + unsigned MBBStartOffset = 0; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineBasicBlock *Dest = nullptr; + if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm()) + Dest = I->getOperand(2).getMBB(); + else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) && + !I->getOperand(1).isImm()) + Dest = I->getOperand(1).getMBB(); + else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ || + I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) && + !I->getOperand(0).isImm()) + Dest = I->getOperand(0).getMBB(); + + if (!Dest) { + MBBStartOffset += TII->GetInstSizeInBytes(I); + continue; + } + + // Determine the offset from the current branch to the destination + // block. + int BranchSize; + if (Dest->getNumber() <= MBB.getNumber()) { + // If this is a backwards branch, the delta is the offset from the + // start of this block to this branch, plus the sizes of all blocks + // from this block to the dest. + BranchSize = MBBStartOffset; + + for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i) + BranchSize += BlockSizes[i]; + } else { + // Otherwise, add the size of the blocks between this block and the + // dest to the number of bytes left in this block. 
+ BranchSize = -MBBStartOffset; + + for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i) + BranchSize += BlockSizes[i]; + } + + // If this branch is in range, ignore it. + if (isInt<16>(BranchSize)) { + MBBStartOffset += 4; + continue; + } + + // Otherwise, we have to expand it to a long branch. + MachineInstr *OldBranch = I; + DebugLoc dl = OldBranch->getDebugLoc(); + + if (I->getOpcode() == PPC::BCC) { + // The BCC operands are: + // 0. PPC branch predicate + // 1. CR register + // 2. Target MBB + PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); + unsigned CRReg = I->getOperand(1).getReg(); + + // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. + BuildMI(MBB, I, dl, TII->get(PPC::BCC)) + .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); + } else if (I->getOpcode() == PPC::BC) { + unsigned CRBit = I->getOperand(0).getReg(); + BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2); + } else if (I->getOpcode() == PPC::BCn) { + unsigned CRBit = I->getOperand(0).getReg(); + BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2); + } else if (I->getOpcode() == PPC::BDNZ) { + BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2); + } else if (I->getOpcode() == PPC::BDNZ8) { + BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2); + } else if (I->getOpcode() == PPC::BDZ) { + BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2); + } else if (I->getOpcode() == PPC::BDZ8) { + BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2); + } else { + llvm_unreachable("Unhandled branch type!"); + } + + // Uncond branch to the real destination. + I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest); + + // Remove the old branch from the function. + OldBranch->eraseFromParent(); + + // Remember that this instruction is 8-bytes, increase the size of the + // block by 4, remember to iterate. + BlockSizes[MBB.getNumber()] += 4; + MBBStartOffset += 8; + ++NumExpanded; + MadeChange = true; + } + } + EverMadeChange |= MadeChange; + } + + BlockSizes.clear(); + return true; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp new file mode 100644 index 0000000..b6ac4d5 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -0,0 +1,697 @@ +//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies loops where we can generate the PPC branch instructions +// that decrement and test the count register (CTR) (bdnz and friends). +// +// The pattern that defines the induction variable can changed depending on +// prior optimizations. For example, the IndVarSimplify phase run by 'opt' +// normalizes induction variables, and the Loop Strength Reduction pass +// run by 'llc' may also make changes to the induction variable. +// +// Criteria for CTR loops: +// - Countable loops (w/ ind. var for a trip count) +// - Try inner-most loops first +// - No nested CTR loops. +// - No function calls in loops. 
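For reference, the loop shape this pass targets, in scalar C++: mtctr loads the count, and each bdnz decrements and branches while the counter is still nonzero, so the body runs exactly count times. This is also why convertToCTRLoop below adds 1 to SCEV's backedge-taken exit count before emitting the ppc_mtctr call. An illustrative sketch, nonzero count assumed:

#include <cstdio>

int main() {
  unsigned Count = 5;     // backedge-taken count 4, plus 1
  unsigned CTR = Count;   // mtctr
  unsigned Body = 0;
  do {
    ++Body;               // loop body
  } while (--CTR != 0);   // bdnz: decrement, branch while nonzero
  std::printf("body ran %u times\n", Body); // 5
  return 0;
}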
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "PPC.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +#ifndef NDEBUG +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#endif + +#include <algorithm> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "ctrloops" + +#ifndef NDEBUG +static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); +#endif + +STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); + +namespace llvm { + void initializePPCCTRLoopsPass(PassRegistry&); +#ifndef NDEBUG + void initializePPCCTRLoopsVerifyPass(PassRegistry&); +#endif +} + +namespace { + struct PPCCTRLoops : public FunctionPass { + +#ifndef NDEBUG + static int Counter; +#endif + + public: + static char ID; + + PPCCTRLoops() : FunctionPass(ID), TM(nullptr) { + initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); + } + PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + } + + private: + bool mightUseCTR(const Triple &TT, BasicBlock *BB); + bool convertToCTRLoop(Loop *L); + + private: + PPCTargetMachine *TM; + LoopInfo *LI; + ScalarEvolution *SE; + const DataLayout *DL; + DominatorTree *DT; + const TargetLibraryInfo *LibInfo; + bool PreserveLCSSA; + }; + + char PPCCTRLoops::ID = 0; +#ifndef NDEBUG + int PPCCTRLoops::Counter = 0; +#endif + +#ifndef NDEBUG + struct PPCCTRLoopsVerify : public MachineFunctionPass { + public: + static char ID; + + PPCCTRLoopsVerify() : MachineFunctionPass(ID) { + initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + MachineDominatorTree *MDT; + }; + + char PPCCTRLoopsVerify::ID = 0; +#endif // NDEBUG +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", + 
false, false) + +FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) { + return new PPCCTRLoops(TM); +} + +#ifndef NDEBUG +INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", + "PowerPC CTR Loops Verify", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", + "PowerPC CTR Loops Verify", false, false) + +FunctionPass *llvm::createPPCCTRLoopsVerify() { + return new PPCCTRLoopsVerify(); +} +#endif // NDEBUG + +bool PPCCTRLoops::runOnFunction(Function &F) { + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DL = &F.getParent()->getDataLayout(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + + bool MadeChange = false; + + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); + I != E; ++I) { + Loop *L = *I; + if (!L->getParentLoop()) + MadeChange |= convertToCTRLoop(L); + } + + return MadeChange; +} + +static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { + if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) + return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); + + return false; +} + +// Determining the address of a TLS variable results in a function call in +// certain TLS models. +static bool memAddrUsesCTR(const PPCTargetMachine *TM, + const Value *MemAddr) { + const auto *GV = dyn_cast<GlobalValue>(MemAddr); + if (!GV) { + // Recurse to check for constants that refer to TLS global variables. + if (const auto *CV = dyn_cast<Constant>(MemAddr)) + for (const auto &CO : CV->operands()) + if (memAddrUsesCTR(TM, CO)) + return true; + + return false; + } + + if (!GV->isThreadLocal()) + return false; + if (!TM) + return true; + TLSModel::Model Model = TM->getTLSModel(GV); + return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; +} + +bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); + J != JE; ++J) { + if (CallInst *CI = dyn_cast<CallInst>(J)) { + if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { + // Inline ASM is okay, unless it clobbers the ctr register. + InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + + continue; + } + + if (!TM) + return true; + const TargetLowering *TLI = + TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); + + if (Function *F = CI->getCalledFunction()) { + // Most intrinsics don't become function calls, but some might. + // sin, cos, exp and log are always calls. + unsigned Opcode; + if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { + switch (F->getIntrinsicID()) { + default: continue; + // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr + // we're definitely using CTR. 
+ case Intrinsic::ppc_is_decremented_ctr_nonzero: + case Intrinsic::ppc_mtctr: + return true; + +// VisualStudio defines setjmp as _setjmp +#if defined(_MSC_VER) && defined(setjmp) && \ + !defined(setjmp_undefined_for_msvc) +# pragma push_macro("setjmp") +# undef setjmp +# define setjmp_undefined_for_msvc +#endif + + case Intrinsic::setjmp: + +#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) + // let's return it to _setjmp state +# pragma pop_macro("setjmp") +# undef setjmp_undefined_for_msvc +#endif + + case Intrinsic::longjmp: + + // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp + // because, although it does clobber the counter register, the + // control can't then return to inside the loop unless there is also + // an eh_sjlj_setjmp. + case Intrinsic::eh_sjlj_setjmp: + + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + case Intrinsic::powi: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::pow: + case Intrinsic::sin: + case Intrinsic::cos: + return true; + case Intrinsic::copysign: + if (CI->getArgOperand(0)->getType()->getScalarType()-> + isPPC_FP128Ty()) + return true; + else + continue; // ISD::FCOPYSIGN is never a library call. + case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; + case Intrinsic::floor: Opcode = ISD::FFLOOR; break; + case Intrinsic::ceil: Opcode = ISD::FCEIL; break; + case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; + case Intrinsic::rint: Opcode = ISD::FRINT; break; + case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; + case Intrinsic::round: Opcode = ISD::FROUND; break; + } + } + + // PowerPC does not use [US]DIVREM or other library calls for + // operations on regular types which are not otherwise library calls + // (i.e. soft float or atomics). If adapting for targets that do, + // additional care is required here. + + LibFunc::Func Func; + if (!F->hasLocalLinkage() && F->hasName() && LibInfo && + LibInfo->getLibFunc(F->getName(), Func) && + LibInfo->hasOptimizedCodeGen(Func)) { + // Non-read-only functions are never treated as intrinsics. + if (!CI->onlyReadsMemory()) + return true; + + // Conversion happens only for FP calls. + if (!CI->getArgOperand(0)->getType()->isFloatingPointTy()) + return true; + + switch (Func) { + default: return true; + case LibFunc::copysign: + case LibFunc::copysignf: + continue; // ISD::FCOPYSIGN is never a library call. + case LibFunc::copysignl: + return true; + case LibFunc::fabs: + case LibFunc::fabsf: + case LibFunc::fabsl: + continue; // ISD::FABS is never a library call. 
+ case LibFunc::sqrt: + case LibFunc::sqrtf: + case LibFunc::sqrtl: + Opcode = ISD::FSQRT; break; + case LibFunc::floor: + case LibFunc::floorf: + case LibFunc::floorl: + Opcode = ISD::FFLOOR; break; + case LibFunc::nearbyint: + case LibFunc::nearbyintf: + case LibFunc::nearbyintl: + Opcode = ISD::FNEARBYINT; break; + case LibFunc::ceil: + case LibFunc::ceilf: + case LibFunc::ceill: + Opcode = ISD::FCEIL; break; + case LibFunc::rint: + case LibFunc::rintf: + case LibFunc::rintl: + Opcode = ISD::FRINT; break; + case LibFunc::round: + case LibFunc::roundf: + case LibFunc::roundl: + Opcode = ISD::FROUND; break; + case LibFunc::trunc: + case LibFunc::truncf: + case LibFunc::truncl: + Opcode = ISD::FTRUNC; break; + } + + auto &DL = CI->getModule()->getDataLayout(); + MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(), + true); + if (VTy == MVT::Other) + return true; + + if (TLI->isOperationLegalOrCustom(Opcode, VTy)) + continue; + else if (VTy.isVector() && + TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType())) + continue; + + return true; + } + } + + return true; + } else if (isa<BinaryOperator>(J) && + J->getType()->getScalarType()->isPPC_FP128Ty()) { + // Most operations on ppc_f128 values become calls. + return true; + } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) || + isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) { + CastInst *CI = cast<CastInst>(J); + if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || + CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || + isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) || + isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType())) + return true; + } else if (isLargeIntegerTy(TT.isArch32Bit(), + J->getType()->getScalarType()) && + (J->getOpcode() == Instruction::UDiv || + J->getOpcode() == Instruction::SDiv || + J->getOpcode() == Instruction::URem || + J->getOpcode() == Instruction::SRem)) { + return true; + } else if (TT.isArch32Bit() && + isLargeIntegerTy(false, J->getType()->getScalarType()) && + (J->getOpcode() == Instruction::Shl || + J->getOpcode() == Instruction::AShr || + J->getOpcode() == Instruction::LShr)) { + // Only on PPC32, for 128-bit integers (specifically not 64-bit + // integers), these might be runtime calls. + return true; + } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) { + // On PowerPC, indirect jumps use the counter register. + return true; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) { + if (!TM) + return true; + const TargetLowering *TLI = + TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); + + if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) + return true; + } + for (Value *Operand : J->operands()) + if (memAddrUsesCTR(TM, Operand)) + return true; + } + + return false; +} + +bool PPCCTRLoops::convertToCTRLoop(Loop *L) { + bool MadeChange = false; + + const Triple TT = + Triple(L->getHeader()->getParent()->getParent()->getTargetTriple()); + if (!TT.isArch32Bit() && !TT.isArch64Bit()) + return MadeChange; // Unknown arch. type. + + // Process nested loops first. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + MadeChange |= convertToCTRLoop(*I); + DEBUG(dbgs() << "Nested loop converted\n"); + } + + // If a nested loop has been converted, then we can't convert this loop. + if (MadeChange) + return MadeChange; + +#ifndef NDEBUG + // Stop trying after reaching the limit (if any). 
+ int Limit = CTRLoopLimit; + if (Limit >= 0) { + if (Counter >= CTRLoopLimit) + return false; + Counter++; + } +#endif + + // We don't want to spill/restore the counter register, and so we don't + // want to use the counter register if the loop contains calls. + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) + if (mightUseCTR(TT, *I)) + return MadeChange; + + SmallVector<BasicBlock*, 4> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + BasicBlock *CountedExitBlock = nullptr; + const SCEV *ExitCount = nullptr; + BranchInst *CountedExitBranch = nullptr; + for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(), + IE = ExitingBlocks.end(); I != IE; ++I) { + const SCEV *EC = SE->getExitCount(L, *I); + DEBUG(dbgs() << "Exit Count for " << *L << " from block " << + (*I)->getName() << ": " << *EC << "\n"); + if (isa<SCEVCouldNotCompute>(EC)) + continue; + if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) { + if (ConstEC->getValue()->isZero()) + continue; + } else if (!SE->isLoopInvariant(EC, L)) + continue; + + if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32)) + continue; + + // We now have a loop-invariant count of loop iterations (which is not the + // constant zero) for which we know that this loop will not exit via this + // exisiting block. + + // We need to make sure that this block will run on every loop iteration. + // For this to be true, we must dominate all blocks with backedges. Such + // blocks are in-loop predecessors to the header block. + bool NotAlways = false; + for (pred_iterator PI = pred_begin(L->getHeader()), + PIE = pred_end(L->getHeader()); PI != PIE; ++PI) { + if (!L->contains(*PI)) + continue; + + if (!DT->dominates(*I, *PI)) { + NotAlways = true; + break; + } + } + + if (NotAlways) + continue; + + // Make sure this blocks ends with a conditional branch. + Instruction *TI = (*I)->getTerminator(); + if (!TI) + continue; + + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (!BI->isConditional()) + continue; + + CountedExitBranch = BI; + } else + continue; + + // Note that this block may not be the loop latch block, even if the loop + // has a latch block. + CountedExitBlock = *I; + ExitCount = EC; + break; + } + + if (!CountedExitBlock) + return MadeChange; + + BasicBlock *Preheader = L->getLoopPreheader(); + + // If we don't have a preheader, then insert one. If we already have a + // preheader, then we can use it (except if the preheader contains a use of + // the CTR register because some such uses might be reordered by the + // selection DAG after the mtctr instruction). + if (!Preheader || mightUseCTR(TT, Preheader)) + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); + if (!Preheader) + return MadeChange; + + DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n"); + + // Insert the count into the preheader and replace the condition used by the + // selected branch. + MadeChange = true; + + SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt"); + LLVMContext &C = SE->getContext(); + Type *CountType = TT.isArch64Bit() ? 
Type::getInt64Ty(C) : + Type::getInt32Ty(C); + if (!ExitCount->getType()->isPointerTy() && + ExitCount->getType() != CountType) + ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); + ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); + Value *ECValue = + SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator()); + + IRBuilder<> CountBuilder(Preheader->getTerminator()); + Module *M = Preheader->getParent()->getParent(); + Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, + CountType); + CountBuilder.CreateCall(MTCTRFunc, ECValue); + + IRBuilder<> CondBuilder(CountedExitBranch); + Value *DecFunc = + Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero); + Value *NewCond = CondBuilder.CreateCall(DecFunc, {}); + Value *OldCond = CountedExitBranch->getCondition(); + CountedExitBranch->setCondition(NewCond); + + // The false branch must exit the loop. + if (!L->contains(CountedExitBranch->getSuccessor(0))) + CountedExitBranch->swapSuccessors(); + + // The old condition may be dead now, and may have even created a dead PHI + // (the original induction variable). + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + DeleteDeadPHIs(CountedExitBlock); + + ++NumCTRLoops; + return MadeChange; +} + +#ifndef NDEBUG +static bool clobbersCTR(const MachineInstr *MI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) + return true; + } else if (MO.isRegMask()) { + if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8)) + return true; + } + } + + return false; +} + +static bool verifyCTRBranch(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator BI = I; + SmallSet<MachineBasicBlock *, 16> Visited; + SmallVector<MachineBasicBlock *, 8> Preds; + bool CheckPreds; + + if (I == MBB->begin()) { + Visited.insert(MBB); + goto queue_preds; + } else + --I; + +check_block: + Visited.insert(MBB); + if (I == MBB->end()) + goto queue_preds; + + CheckPreds = true; + for (MachineBasicBlock::iterator IE = MBB->begin();; --I) { + unsigned Opc = I->getOpcode(); + if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) { + CheckPreds = false; + break; + } + + if (I != BI && clobbersCTR(I)) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" << + MBB->getFullName() << ") instruction " << *I << + " clobbers CTR, invalidating " << "BB#" << + BI->getParent()->getNumber() << " (" << + BI->getParent()->getFullName() << ") instruction " << + *BI << "\n"); + return false; + } + + if (I == IE) + break; + } + + if (!CheckPreds && Preds.empty()) + return true; + + if (CheckPreds) { +queue_preds: + if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) { + DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" << + BI->getParent()->getNumber() << " (" << + BI->getParent()->getFullName() << ") instruction " << + *BI << "\n"); + return false; + } + + for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), + PIE = MBB->pred_end(); PI != PIE; ++PI) + Preds.push_back(*PI); + } + + do { + MBB = Preds.pop_back_val(); + if (!Visited.count(MBB)) { + I = MBB->getLastNonDebugInstr(); + goto check_block; + } + } while (!Preds.empty()); + + return true; +} + +bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { + MDT = &getAnalysis<MachineDominatorTree>(); + + // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before + // any 
other instructions that might clobber the ctr register. + for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); + I != IE; ++I) { + MachineBasicBlock *MBB = &*I; + if (!MDT->isReachableFromEntry(MBB)) + continue; + + for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(), + MIIE = MBB->end(); MII != MIIE; ++MII) { + unsigned Opc = MII->getOpcode(); + if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ || + Opc == PPC::BDZ8 || Opc == PPC::BDZ) + if (!verifyCTRBranch(MBB, MII)) + llvm_unreachable("Invalid PPC CTR loop!"); + } + } + + return false; +} +#endif // NDEBUG diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h new file mode 100644 index 0000000..eb904a8 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h @@ -0,0 +1,35 @@ +//=== PPCCallingConv.h - PPC Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the PPC Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H +#define LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" + +namespace llvm { + +inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &, + CCValAssign::LocInfo &, ISD::ArgFlagsTy &, + CCState &) { + llvm_unreachable("The AnyReg calling convention is only supported by the " \ + "stackmap and patchpoint intrinsics."); + // gracefully fallback to PPC C calling convention on Release builds. + return false; +} + +} // End llvm namespace + +#endif + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td new file mode 100644 index 0000000..5bc9124 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -0,0 +1,265 @@ +//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the PowerPC 32- and 64-bit +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("static_cast<const PPCSubtarget&>" + "(State.getMachineFunction().getSubtarget()).", + F), + A>; +class CCIfNotSubtarget<string F, CCAction A> + : CCIf<!strconcat("!static_cast<const PPCSubtarget&>" + "(State.getMachineFunction().getSubtarget()).", + F), + A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention +//===----------------------------------------------------------------------===// + +// PPC64 AnyReg return-value convention. No explicit register is specified for +// the return-value. The register allocator is allowed and expected to choose +// any free register. 
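Before the concrete conventions: the CCAssignToReg lists used throughout this file hand out registers from a fixed per-type sequence until it is exhausted, after which values fall through to stack or custom handling. A rough C++ picture of that mechanism; RetRegState and assignReturnReg are invented for illustration and the register lists are abbreviated:

#include <cstdio>
#include <string>

struct RetRegState {
  unsigned NextGPR = 0, NextFPR = 0;
};

// Hand out registers from a fixed per-type list until it is exhausted,
// which is the observable behavior of a CCAssignToReg entry.
static std::string assignReturnReg(RetRegState &S, char Kind /* 'i' or 'f' */) {
  static const char *GPR64[] = {"X3", "X4", "X5", "X6"};
  static const char *FPR[] = {"F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8"};
  if (Kind == 'i' && S.NextGPR < 4)
    return GPR64[S.NextGPR++];
  if (Kind == 'f' && S.NextFPR < 8)
    return FPR[S.NextFPR++];
  return "<no register left>";
}

int main() {
  RetRegState S;
  std::printf("%s %s %s\n", assignReturnReg(S, 'i').c_str(),
              assignReturnReg(S, 'f').c_str(),
              assignReturnReg(S, 'i').c_str()); // X3 F1 X4
  return 0;
}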
+// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the PPC C calling convention. +def RetCC_PPC64_AnyReg : CallingConv<[ + CCCustom<"CC_PPC_AnyReg_Error"> +]>; + +// Return-value convention for PowerPC +def RetCC_PPC : CallingConv<[ + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>, + + // On PPC64, integer return values are always promoted to i64 + CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>, + CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>, + + CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, + CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>, + CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>, + + // Floating point types returned as "direct" go into F1 .. F8; note that + // only the ELFv2 ABI fully utilizes all these registers. + CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + + // QPX vectors are returned in QF1 and QF2. + CCIfType<[v4f64, v4f32, v4i1], + CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, + + // Vector types returned as "direct" go into V2 .. V9; note that only the + // ELFv2 ABI fully utilizes all these registers. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32], + CCIfSubtarget<"hasAltivec()", + CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, + CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()", + CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>> +]>; + +// No explicit register is specified for the AnyReg calling convention. The +// register allocator may assign the arguments to any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the PPC C calling convention. +def CC_PPC64_AnyReg : CallingConv<[ + CCCustom<"CC_PPC_AnyReg_Error"> +]>; + +// Note that we don't currently have calling conventions for 64-bit +// PowerPC, but handle all the complexities of the ABI in the lowering +// logic. FIXME: See if the logic can be simplified with use of CCs. +// This may require some extensions to current table generation. + +// Simple calling convention for 64-bit ELF PowerPC fast isel. +// Only handle ints and floats. All ints are promoted to i64. +// Vector types and quadword ints are not handled. +def CC_PPC64_ELF_FIS : CallingConv<[ + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_PPC64_AnyReg>>, + + CCIfType<[i1], CCPromoteToType<i64>>, + CCIfType<[i8], CCPromoteToType<i64>>, + CCIfType<[i16], CCPromoteToType<i64>>, + CCIfType<[i32], CCPromoteToType<i64>>, + CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>, + CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>> +]>; + +// Simple return-value convention for 64-bit ELF PowerPC fast isel. +// All small ints are promoted to i64. Vector types, quadword ints, +// and multiple register returns are "supported" to avoid compile +// errors, but none are handled by the fast selector. 
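CC_PPC32_SVR4_Common below relies on CC_PPC32_SVR4_Custom_AlignArgRegs to start each i64 register pair at an odd-numbered GPR (r3/r4, r5/r6, and so on). In isolation, the adjustment is just this; alignPairStart is a hypothetical name, and the real handler additionally records the skipped register with the CCState:

#include <cassert>

// An i64 occupies an adjacent GPR pair starting at an odd register
// (r3/r4, r5/r6, r7/r8, r9/r10); an even next-free register is skipped.
static unsigned alignPairStart(unsigned NextGPR) { // NextGPR in 3..10
  return (NextGPR % 2 == 0) ? NextGPR + 1 : NextGPR;
}

int main() {
  assert(alignPairStart(3) == 3); // i64 in r3/r4
  assert(alignPairStart(4) == 5); // r4 skipped, i64 in r5/r6
  return 0;
}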
+def RetCC_PPC64_ELF_FIS : CallingConv<[
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
+
+  CCIfType<[i1], CCPromoteToType<i64>>,
+  CCIfType<[i8], CCPromoteToType<i64>>,
+  CCIfType<[i16], CCPromoteToType<i64>>,
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[v4f64, v4f32, v4i1],
+           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32],
+           CCIfSubtarget<"hasAltivec()",
+                         CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
+  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC System V Release 4 32-bit ABI
+//===----------------------------------------------------------------------===//
+
+def CC_PPC32_SVR4_Common : CallingConv<[
+  CCIfType<[i1], CCPromoteToType<i32>>,
+
+  // The ABI requires i64 to be passed in two adjacent registers with the first
+  // register having an odd register number.
+  CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
+
+  // The 'nest' parameter, if any, is passed in R11.
+  CCIfNest<CCAssignToReg<[R11]>>,
+
+  // The first 8 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+
+  // Make sure the i64 words from a long double are either both passed in
+  // registers or both passed on the stack.
+  CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
+
+  // FP values are passed in F1 - F8.
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+  // Split arguments have an alignment of 8 bytes on the stack.
+  CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+  // Floats are stored in double precision format, thus they have the same
+  // alignment and size as doubles.
+  CCIfType<[f32,f64], CCAssignToStack<8, 8>>,
+
+  // QPX vectors that are stored in double precision need 32-byte alignment.
+  CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
+]>;
+
+// This calling convention always puts vector arguments on the stack. It is
+// used to assign vector arguments which belong to the variable portion of the
+// parameter list of a variable argument function.
+def CC_PPC32_SVR4_VarArg : CallingConv<[
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;
+
+// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
+// put vector arguments in vector registers before putting them on the stack.
+def CC_PPC32_SVR4 : CallingConv<[
+  // QPX vectors mirror the scalar FP convention.
+  CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()",
+           CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>,
+
+  // The first 12 vector arguments are passed in AltiVec registers.
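+  // For example (illustrative only): the first v4i32 argument goes in V2,
+  // the next in V3, and so on; once V2-V13 are exhausted, the delegation to
+  // CC_PPC32_SVR4_Common below assigns a 16-byte aligned stack slot instead.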
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32],
+           CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
+                                                        V8, V9, V10, V11, V12, V13]>>>,
+  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9,
+                          VSH10, VSH11, VSH12, VSH13]>>>,
+
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;
+
+// Helper "calling convention" to handle aggregate by value arguments.
+// Aggregate by value arguments are always placed in the local variable space
+// of the caller. This calling convention is only used to assign those stack
+// offsets in the caller's stack frame.
+//
+// Still, the address of the aggregate copy in the caller's stack frame is
+// passed in a GPR (or in the parameter list area if all GPRs are allocated)
+// from the caller to the callee. The location for the address argument is
+// assigned by the CC_PPC32_SVR4 calling convention.
+//
+// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which
+// are not passed by value.
+
+def CC_PPC32_SVR4_ByVal : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  CCCustom<"CC_PPC32_SVR4_Custom_Dummy">
+]>;
+
+def CSR_Altivec : CalleeSavedRegs<(add V20, V21, V22, V23, V24, V25, V26, V27,
+                                       V28, V29, V30, V31)>;
+
+def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
+                                        R21, R22, R23, R24, R25, R26, R27, R28,
+                                        R29, R30, R31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
+
+def CSR_SVR432 : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
+                                      R21, R22, R23, R24, R25, R26, R27, R28,
+                                      R29, R30, R31, F14, F15, F16, F17, F18,
+                                      F19, F20, F21, F22, F23, F24, F25, F26,
+                                      F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                 )>;
+
+def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
+
+def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
+                                        X21, X22, X23, X24, X25, X26, X27, X28,
+                                        X29, X30, X31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_Darwin64_Altivec : CalleeSavedRegs<(add CSR_Darwin64, CSR_Altivec)>;
+
+def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
+                                      X21, X22, X23, X24, X25, X26, X27, X28,
+                                      X29, X30, X31, F14, F15, F16, F17, F18,
+                                      F19, F20, F21, F22, F23, F24, F25, F26,
+                                      F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                 )>;
+
+def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
+
+def CSR_SVR464_R2 : CalleeSavedRegs<(add CSR_SVR464, X2)>;
+
+def CSR_SVR464_R2_Altivec : CalleeSavedRegs<(add CSR_SVR464_Altivec, X2)>;
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
+                                     (sequence "X%u", 14, 31),
+                                     (sequence "F%u", 0, 31),
+                                     (sequence "CR%u", 0, 7))>;
+
+def CSR_64_AllRegs_Altivec : CalleeSavedRegs<(add CSR_64_AllRegs,
+                                              (sequence "V%u", 0, 31))>;
+
+def CSR_64_AllRegs_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec,
+                                          (sequence "VSL%u", 0, 31),
+                                          (sequence "VSH%u", 0, 31))>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
new file mode 100644
index 0000000..7cb1bb5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -0,0 +1,207 @@
+//===------------- PPCEarlyReturn.cpp - Form Early Returns ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass that forms early (predicated) returns. If-conversion handles some of
+// this, but this pass picks up some remaining cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-early-ret"
+STATISTIC(NumBCLR, "Number of early conditional returns");
+STATISTIC(NumBLR, "Number of early returns");
+
+namespace llvm {
+  void initializePPCEarlyReturnPass(PassRegistry&);
+}
+
+namespace {
+  // PPCEarlyReturn pass - For simple functions without epilogue code, move
+  // returns up, and create conditional returns, to avoid unnecessary
+  // branch-to-blr sequences.
+  struct PPCEarlyReturn : public MachineFunctionPass {
+    static char ID;
+    PPCEarlyReturn() : MachineFunctionPass(ID) {
+      initializePPCEarlyReturnPass(*PassRegistry::getPassRegistry());
+    }
+
+    const TargetInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &ReturnMBB) {
+      bool Changed = false;
+
+      MachineBasicBlock::iterator I = ReturnMBB.begin();
+      I = ReturnMBB.SkipPHIsAndLabels(I);
+
+      // The block must be essentially empty except for the blr.
+      if (I == ReturnMBB.end() ||
+          (I->getOpcode() != PPC::BLR && I->getOpcode() != PPC::BLR8) ||
+          I != ReturnMBB.getLastNonDebugInstr())
+        return Changed;
+
+      SmallVector<MachineBasicBlock*, 8> PredToRemove;
+      for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
+           PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
+        bool OtherReference = false, BlockChanged = false;
+
+        if ((*PI)->empty())
+          continue;
+
+        for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
+          if (J == (*PI)->end())
+            break;
+
+          if (J->getOpcode() == PPC::B) {
+            if (J->getOperand(0).getMBB() == &ReturnMBB) {
+              // This is an unconditional branch to the return. Replace the
+              // branch with a blr.
+              BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()))
+                  .copyImplicitOps(I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BCC) {
+            if (J->getOperand(2).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return. Replace the branch
+              // with a bclr.
+              BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
+                  .addImm(J->getOperand(0).getImm())
+                  .addReg(J->getOperand(1).getReg())
+                  .copyImplicitOps(I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBCLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) {
+            if (J->getOperand(1).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return.
Replace the branch + // with a bclr. + BuildMI( + **PI, J, J->getDebugLoc(), + TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn)) + .addReg(J->getOperand(0).getReg()) + .copyImplicitOps(I); + MachineBasicBlock::iterator K = J--; + K->eraseFromParent(); + BlockChanged = true; + ++NumBCLR; + continue; + } + } else if (J->isBranch()) { + if (J->isIndirectBranch()) { + if (ReturnMBB.hasAddressTaken()) + OtherReference = true; + } else + for (unsigned i = 0; i < J->getNumOperands(); ++i) + if (J->getOperand(i).isMBB() && + J->getOperand(i).getMBB() == &ReturnMBB) + OtherReference = true; + } else if (!J->isTerminator() && !J->isDebugValue()) + break; + + if (J == (*PI)->begin()) + break; + + --J; + } + + if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB)) + OtherReference = true; + + // Predecessors are stored in a vector and can't be removed here. + if (!OtherReference && BlockChanged) { + PredToRemove.push_back(*PI); + } + + if (BlockChanged) + Changed = true; + } + + for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i) + PredToRemove[i]->removeSuccessor(&ReturnMBB, true); + + if (Changed && !ReturnMBB.hasAddressTaken()) { + // We now might be able to merge this blr-only block into its + // by-layout predecessor. + if (ReturnMBB.pred_size() == 1) { + MachineBasicBlock &PrevMBB = **ReturnMBB.pred_begin(); + if (PrevMBB.isLayoutSuccessor(&ReturnMBB) && PrevMBB.canFallThrough()) { + // Move the blr into the preceding block. + PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I); + PrevMBB.removeSuccessor(&ReturnMBB, true); + } + } + + if (ReturnMBB.pred_empty()) + ReturnMBB.eraseFromParent(); + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + TII = MF.getSubtarget().getInstrInfo(); + + bool Changed = false; + + // If the function does not have at least two blocks, then there is + // nothing to do. + if (MF.size() < 2) + return Changed; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE, + "PowerPC Early-Return Creation", false, false) + +char PPCEarlyReturn::ID = 0; +FunctionPass* +llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); } + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp new file mode 100644 index 0000000..b451ebf --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -0,0 +1,2344 @@ +//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the PowerPC-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// PPCGenFastISel.inc, which is #included here. 
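+//
+// As context (general fast-isel behavior, not specific to PowerPC): this
+// selector runs at -O0 (or under "llc -fast-isel"), and each routine below
+// returns false for any case it does not cover, which hands the instruction
+// back to SelectionDAG.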
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPCCallingConv.h" +#include "PPCISelLowering.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" + +//===----------------------------------------------------------------------===// +// +// TBD: +// fastLowerArguments: Handle simple cases. +// PPCMaterializeGV: Handle TLS. +// SelectCall: Handle function pointers. +// SelectCall: Handle multi-register return values. +// SelectCall: Optimize away nops for local calls. +// processCallArgs: Handle bit-converted arguments. +// finishCall: Handle multi-register return values. +// PPCComputeAddress: Handle parameter references as FrameIndex's. +// PPCEmitCmp: Handle immediate as operand 1. +// SelectCall: Handle small byval arguments. +// SelectIntrinsicCall: Implement. +// SelectSelect: Implement. +// Consider factoring isTypeLegal into the base class. +// Implement switches and jump tables. +// +//===----------------------------------------------------------------------===// +using namespace llvm; + +#define DEBUG_TYPE "ppcfastisel" + +namespace { + +typedef struct Address { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FI; + } Base; + + long Offset; + + // Innocuous defaults for our address. + Address() + : BaseType(RegBase), Offset(0) { + Base.Reg = 0; + } +} Address; + +class PPCFastISel final : public FastISel { + + const TargetMachine &TM; + const PPCSubtarget *PPCSubTarget; + PPCFunctionInfo *PPCFuncInfo; + const TargetInstrInfo &TII; + const TargetLowering &TLI; + LLVMContext *Context; + + public: + explicit PPCFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) + : FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()), + PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()), + PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()), + TII(*PPCSubTarget->getInstrInfo()), + TLI(*PPCSubTarget->getTargetLowering()), + Context(&FuncInfo.Fn->getContext()) {} + + // Backend specific FastISel code. 
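+  // (The overrides below are the standard FastISel hooks:
+  // fastSelectInstruction is invoked once per IR instruction and dispatches
+  // to the Select* routines further down, while the fastMaterialize* and
+  // fastEmit* hooks serve the shared FastISel machinery.)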
+ private: + bool fastSelectInstruction(const Instruction *I) override; + unsigned fastMaterializeConstant(const Constant *C) override; + unsigned fastMaterializeAlloca(const AllocaInst *AI) override; + bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) override; + bool fastLowerArguments() override; + unsigned fastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override; + unsigned fastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm); + unsigned fastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill); + unsigned fastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + + bool fastLowerCall(CallLoweringInfo &CLI) override; + + // Instruction selection routines. + private: + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectIndirectBr(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectIToFP(const Instruction *I, bool IsSigned); + bool SelectFPToI(const Instruction *I, bool IsSigned); + bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode); + bool SelectRet(const Instruction *I); + bool SelectTrunc(const Instruction *I); + bool SelectIntExt(const Instruction *I); + + // Utility routines. + private: + bool isTypeLegal(Type *Ty, MVT &VT); + bool isLoadTypeLegal(Type *Ty, MVT &VT); + bool isValueAvailable(const Value *V) const; + bool isVSFRCRegister(unsigned Register) const { + return MRI.getRegClass(Register)->getID() == PPC::VSFRCRegClassID; + } + bool isVSSRCRegister(unsigned Register) const { + return MRI.getRegClass(Register)->getID() == PPC::VSSRCRegClassID; + } + bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value, + bool isZExt, unsigned DestReg); + bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + const TargetRegisterClass *RC, bool IsZExt = true, + unsigned FP64LoadOpc = PPC::LFD); + bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr); + bool PPCComputeAddress(const Value *Obj, Address &Addr); + void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset, + unsigned &IndexReg); + bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + unsigned DestReg, bool IsZExt); + unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT); + unsigned PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt = true); + unsigned PPCMaterialize32BitInt(int64_t Imm, + const TargetRegisterClass *RC); + unsigned PPCMaterialize64BitInt(int64_t Imm, + const TargetRegisterClass *RC); + unsigned PPCMoveToIntReg(const Instruction *I, MVT VT, + unsigned SrcReg, bool IsSigned); + unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned); + + // Call handling routines. 
+  private:
+    bool processCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool IsVarArg);
+    bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes);
+    CCAssignFn *usePPC32CCs(unsigned Flag);
+
+  private:
+  #include "PPCGenFastISel.inc"
+
+};
+
+} // end anonymous namespace
+
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
+  if (Flag == 1)
+    return CC_PPC32_SVR4;
+  else if (Flag == 2)
+    return CC_PPC32_SVR4_ByVal;
+  else if (Flag == 3)
+    return CC_PPC32_SVR4_VarArg;
+  else
+    return RetCC_PPC;
+}
+
+static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // These are not representable with any single compare.
+    case CmpInst::FCMP_FALSE:
+    case CmpInst::FCMP_UEQ:
+    case CmpInst::FCMP_UGT:
+    case CmpInst::FCMP_UGE:
+    case CmpInst::FCMP_ULT:
+    case CmpInst::FCMP_ULE:
+    case CmpInst::FCMP_UNE:
+    case CmpInst::FCMP_TRUE:
+    default:
+      return Optional<PPC::Predicate>();
+
+    case CmpInst::FCMP_OEQ:
+    case CmpInst::ICMP_EQ:
+      return PPC::PRED_EQ;
+
+    case CmpInst::FCMP_OGT:
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_SGT:
+      return PPC::PRED_GT;
+
+    case CmpInst::FCMP_OGE:
+    case CmpInst::ICMP_UGE:
+    case CmpInst::ICMP_SGE:
+      return PPC::PRED_GE;
+
+    case CmpInst::FCMP_OLT:
+    case CmpInst::ICMP_ULT:
+    case CmpInst::ICMP_SLT:
+      return PPC::PRED_LT;
+
+    case CmpInst::FCMP_OLE:
+    case CmpInst::ICMP_ULE:
+    case CmpInst::ICMP_SLE:
+      return PPC::PRED_LE;
+
+    case CmpInst::FCMP_ONE:
+    case CmpInst::ICMP_NE:
+      return PPC::PRED_NE;
+
+    case CmpInst::FCMP_ORD:
+      return PPC::PRED_NU;
+
+    case CmpInst::FCMP_UNO:
+      return PPC::PRED_UN;
+  }
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel, and return its equivalent machine type in VT.
+// FIXME: Copied directly from ARM -- factor into base class?
+bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT Evt = TLI.getValueType(DL, Ty, true);
+
+  // Only handle simple types.
+  if (Evt == MVT::Other || !Evt.isSimple()) return false;
+  VT = Evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel as a load target, and return its equivalent machine type in VT.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type that can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
+    return true;
+  }
+
+  return false;
+}
+
+bool PPCFastISel::isValueAvailable(const Value *V) const {
+  if (!isa<Instruction>(V))
+    return true;
+
+  const auto *I = cast<Instruction>(V);
+  return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address. Return false if we can't handle it.
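+// For example (illustrative only): for a GEP such as
+//   %f = getelementptr %struct.S, %struct.S* %p, i32 0, i32 1
+// where field 1 sits at byte offset 8, Addr becomes {RegBase, reg(%p),
+// Offset = 8}; a static alloca instead yields {FrameIndexBase, FI, 0}.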
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) { + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(Obj)) { + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + switch (Opcode) { + default: + break; + case Instruction::BitCast: + // Look through bitcasts. + return PPCComputeAddress(U->getOperand(0), Addr); + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) + return PPCComputeAddress(U->getOperand(0), Addr); + break; + case Instruction::PtrToInt: + // Look past no-op ptrtoints. + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) + return PPCComputeAddress(U->getOperand(0), Addr); + break; + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + long TmpOffset = Addr.Offset; + + // Iterate through the GEP folding the constants into offsets where + // we can. + gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end(); + II != IE; ++II, ++GTI) { + const Value *Op = *II; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } + } + } + + // Try to grab the base operand now. + Addr.Offset = TmpOffset; + if (PPCComputeAddress(U->getOperand(0), Addr)) return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast<AllocaInst>(Obj); + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = SI->second; + return true; + } + break; + } + } + + // FIXME: References to parameters fall through to the behavior + // below. They should be able to reference a frame index since + // they are stored to the stack, so we can get "ld rx, offset(r1)" + // instead of "addi ry, r1, offset / ld rx, 0(ry)". Obj will + // just contain the parameter. Try to handle this with a FI. + + // Try to get this in a register if nothing else has worked. + if (Addr.Base.Reg == 0) + Addr.Base.Reg = getRegForValue(Obj); + + // Prevent assignment of base register to X0, which is inappropriate + // for loads and stores alike. 
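+  // (In the PPC D- and X-form memory instructions an RA field of 0 is read
+  // as the constant zero rather than the contents of r0, so a base register
+  // allocated to R0/X0 would compute the wrong effective address; e.g.
+  // "ld r3, 0(r0)" loads from address 0.)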
+ if (Addr.Base.Reg != 0) + MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass); + + return Addr.Base.Reg != 0; +} + +// Fix up some addresses that can't be used directly. For example, if +// an offset won't fit in an instruction field, we may need to move it +// into an index register. +void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset, + unsigned &IndexReg) { + + // Check whether the offset fits in the instruction field. + if (!isInt<16>(Addr.Offset)) + UseOffset = false; + + // If this is a stack pointer and the offset needs to be simplified then + // put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) { + unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8), + ResultReg).addFrameIndex(Addr.Base.FI).addImm(0); + Addr.Base.Reg = ResultReg; + Addr.BaseType = Address::RegBase; + } + + if (!UseOffset) { + IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context) + : Type::getInt64Ty(*Context)); + const ConstantInt *Offset = + ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset)); + IndexReg = PPCMaterializeInt(Offset, MVT::i64); + assert(IndexReg && "Unexpected error in PPCMaterializeInt!"); + } +} + +// Emit a load instruction if possible, returning true if we succeeded, +// otherwise false. See commentary below for how the register class of +// the load is determined. +bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + const TargetRegisterClass *RC, + bool IsZExt, unsigned FP64LoadOpc) { + unsigned Opc; + bool UseOffset = true; + + // If ResultReg is given, it determines the register class of the load. + // Otherwise, RC is the register class to use. If the result of the + // load isn't anticipated in this block, both may be zero, in which + // case we must make a conservative guess. In particular, don't assign + // R0 or X0 to the result register, as the result may be used in a load, + // store, add-immediate, or isel that won't permit this. (Though + // perhaps the spill and reload of live-exit values would handle this?) + const TargetRegisterClass *UseRC = + (ResultReg ? MRI.getRegClass(ResultReg) : + (RC ? RC : + (VT == MVT::f64 ? &PPC::F8RCRegClass : + (VT == MVT::f32 ? &PPC::F4RCRegClass : + (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass : + &PPC::GPRC_and_GPRC_NOR0RegClass))))); + + bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass); + + switch (VT.SimpleTy) { + default: // e.g., vector types not handled + return false; + case MVT::i8: + Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8; + break; + case MVT::i16: + Opc = (IsZExt ? + (Is32BitInt ? PPC::LHZ : PPC::LHZ8) : + (Is32BitInt ? PPC::LHA : PPC::LHA8)); + break; + case MVT::i32: + Opc = (IsZExt ? + (Is32BitInt ? PPC::LWZ : PPC::LWZ8) : + (Is32BitInt ? PPC::LWA_32 : PPC::LWA)); + if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0)) + UseOffset = false; + break; + case MVT::i64: + Opc = PPC::LD; + assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) && + "64-bit load with 32-bit target??"); + UseOffset = ((Addr.Offset & 3) == 0); + break; + case MVT::f32: + Opc = PPC::LFS; + break; + case MVT::f64: + Opc = FP64LoadOpc; + break; + } + + // If necessary, materialize the offset into a register and use + // the indexed form. Also handle stack pointers with special needs. 
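+  // For example (illustrative only): an i64 load at offset 0x12345 cannot
+  // encode the displacement in the signed 16-bit field of "ld", so the
+  // offset is materialized into a register here and the X-form
+  // "ldx rD, rA, rB" is chosen by the switch further down.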
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+
+  // If this is a potential VSX load with an offset of 0, a VSX indexed load can
+  // be used.
+  bool IsVSSRC = (ResultReg != 0) && isVSSRCRegister(ResultReg);
+  bool IsVSFRC = (ResultReg != 0) && isVSFRCRegister(ResultReg);
+  bool Is32VSXLoad = IsVSSRC && Opc == PPC::LFS;
+  bool Is64VSXLoad = IsVSFRC && Opc == PPC::LFD;
+  if ((Is32VSXLoad || Is64VSXLoad) &&
+      (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
+      (Addr.Offset == 0)) {
+    UseOffset = false;
+  }
+
+  if (ResultReg == 0)
+    ResultReg = createResultReg(UseRC);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    // VSX only provides an indexed load.
+    if (Is32VSXLoad || Is64VSXLoad) return false;
+
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
+                                          Addr.Offset),
+        MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset) {
+    // VSX only provides an indexed load.
+    if (Is32VSXLoad || Is64VSXLoad) return false;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  } else {
+    // Get the RR opcode corresponding to the RI one. FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default: llvm_unreachable("Unexpected opcode!");
+      case PPC::LBZ: Opc = PPC::LBZX; break;
+      case PPC::LBZ8: Opc = PPC::LBZX8; break;
+      case PPC::LHZ: Opc = PPC::LHZX; break;
+      case PPC::LHZ8: Opc = PPC::LHZX8; break;
+      case PPC::LHA: Opc = PPC::LHAX; break;
+      case PPC::LHA8: Opc = PPC::LHAX8; break;
+      case PPC::LWZ: Opc = PPC::LWZX; break;
+      case PPC::LWZ8: Opc = PPC::LWZX8; break;
+      case PPC::LWA: Opc = PPC::LWAX; break;
+      case PPC::LWA_32: Opc = PPC::LWAX_32; break;
+      case PPC::LD: Opc = PPC::LDX; break;
+      case PPC::LFS: Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break;
+      case PPC::LFD: Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a load instruction.
+bool PPCFastISel::SelectLoad(const Instruction *I) {
+  // FIXME: No atomic loads are supported.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class. This is necessary
+  // to constrain RA from using R0/X0 when this is not legal.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ?
MRI.getRegClass(AssignedReg) : nullptr; + + unsigned ResultReg = 0; + if (!PPCEmitLoad(VT, ResultReg, Addr, RC)) + return false; + updateValueMap(I, ResultReg); + return true; +} + +// Emit a store instruction to store SrcReg at Addr. +bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { + assert(SrcReg && "Nothing to store!"); + unsigned Opc; + bool UseOffset = true; + + const TargetRegisterClass *RC = MRI.getRegClass(SrcReg); + bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass); + + switch (VT.SimpleTy) { + default: // e.g., vector types not handled + return false; + case MVT::i8: + Opc = Is32BitInt ? PPC::STB : PPC::STB8; + break; + case MVT::i16: + Opc = Is32BitInt ? PPC::STH : PPC::STH8; + break; + case MVT::i32: + assert(Is32BitInt && "Not GPRC for i32??"); + Opc = PPC::STW; + break; + case MVT::i64: + Opc = PPC::STD; + UseOffset = ((Addr.Offset & 3) == 0); + break; + case MVT::f32: + Opc = PPC::STFS; + break; + case MVT::f64: + Opc = PPC::STFD; + break; + } + + // If necessary, materialize the offset into a register and use + // the indexed form. Also handle stack pointers with special needs. + unsigned IndexReg = 0; + PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg); + + // If this is a potential VSX store with an offset of 0, a VSX indexed store + // can be used. + bool IsVSSRC = isVSSRCRegister(SrcReg); + bool IsVSFRC = isVSFRCRegister(SrcReg); + bool Is32VSXStore = IsVSSRC && Opc == PPC::STFS; + bool Is64VSXStore = IsVSFRC && Opc == PPC::STFD; + if ((Is32VSXStore || Is64VSXStore) && + (Addr.BaseType != Address::FrameIndexBase) && UseOffset && + (Addr.Offset == 0)) { + UseOffset = false; + } + + // Note: If we still have a frame index here, we know the offset is + // in range, as otherwise PPCSimplifyAddress would have converted it + // into a RegBase. + if (Addr.BaseType == Address::FrameIndexBase) { + // VSX only provides an indexed store. + if (Is32VSXStore || Is64VSXStore) return false; + + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI, + Addr.Offset), + MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI), + MFI.getObjectAlignment(Addr.Base.FI)); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(SrcReg) + .addImm(Addr.Offset) + .addFrameIndex(Addr.Base.FI) + .addMemOperand(MMO); + + // Base reg with offset in range. + } else if (UseOffset) { + // VSX only provides an indexed store. + if (Is32VSXStore || Is64VSXStore) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg); + + // Indexed form. + } else { + // Get the RR opcode corresponding to the RI one. FIXME: It would be + // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it + // is hard to get at. + switch (Opc) { + default: llvm_unreachable("Unexpected opcode!"); + case PPC::STB: Opc = PPC::STBX; break; + case PPC::STH : Opc = PPC::STHX; break; + case PPC::STW : Opc = PPC::STWX; break; + case PPC::STB8: Opc = PPC::STBX8; break; + case PPC::STH8: Opc = PPC::STHX8; break; + case PPC::STW8: Opc = PPC::STWX8; break; + case PPC::STD: Opc = PPC::STDX; break; + case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break; + case PPC::STFD: Opc = IsVSFRC ? 
PPC::STXSDX : PPC::STFDX; break;
+    }
+
+    auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(SrcReg);
+
+    // If we have an index register defined we use it in the store inst,
+    // otherwise we use X0 as base as it makes the vector instructions use
+    // zero in the computation of the effective address regardless of the
+    // content of the register.
+    if (IndexReg)
+      MIB.addReg(Addr.Base.Reg).addReg(IndexReg);
+    else
+      MIB.addReg(PPC::ZERO8).addReg(Addr.Base.Reg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a store instruction.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // FIXME: No atomic stores are supported.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(Op0->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!PPCEmitStore(VT, SrcReg, Addr))
+    return false;
+
+  return true;
+}
+
+// Attempt to fast-select a branch instruction.
+bool PPCFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *BrBB = FuncInfo.MBB;
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // For now, just try the simplest case where it's fed by a compare.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (isValueAvailable(CI)) {
+      Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate());
+      if (!OptPPCPred)
+        return false;
+
+      PPC::Predicate PPCPred = OptPPCPred.getValue();
+
+      // Take advantage of fall-through opportunities.
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        PPCPred = PPC::InvertPredicate(PPCPred);
+      }
+
+      unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+
+      if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+                      CondReg))
+        return false;
+
+      BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
+        .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+                 dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    fastEmitBranch(Target, DbgLoc);
+    return true;
+  }
+
+  // FIXME: ARM looks for a case where the block containing the compare
+  // has been split from the block containing the branch. If this happens,
+  // there is a vreg available containing the result of the compare. I'm
+  // not sure we can do much, as we've lost the predicate information with
+  // the compare instruction -- we have a 4-bit CR but don't know which bit
+  // to test here.
+  return false;
+}
+
+// Attempt to emit a compare of the two source values. Signed and unsigned
+// comparisons are supported. Return false if we can't handle it.
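+// For example (illustrative only): "icmp ult i32 %a, 100" becomes
+// "cmplwi crN, rA, 100" because 100 fits in the 16-bit immediate field,
+// while a constant such as 100000 is first materialized into a register
+// and compared with "cmplw".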
+bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, + bool IsZExt, unsigned DestReg) { + Type *Ty = SrcValue1->getType(); + EVT SrcEVT = TLI.getValueType(DL, Ty, true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits()) + return false; + + // See if operand 2 is an immediate encodeable in the compare. + // FIXME: Operands are not in canonical order at -O0, so an immediate + // operand in position 1 is a lost opportunity for now. We are + // similar to ARM in this regard. + long Imm = 0; + bool UseImm = false; + + // Only 16-bit integer constants can be represented in compares for + // PowerPC. Others will be materialized into a register. + if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) { + if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || + SrcVT == MVT::i8 || SrcVT == MVT::i1) { + const APInt &CIVal = ConstInt->getValue(); + Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue(); + if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm))) + UseImm = true; + } + } + + unsigned CmpOpc; + bool NeedsExt = false; + switch (SrcVT.SimpleTy) { + default: return false; + case MVT::f32: + CmpOpc = PPC::FCMPUS; + break; + case MVT::f64: + CmpOpc = PPC::FCMPUD; + break; + case MVT::i1: + case MVT::i8: + case MVT::i16: + NeedsExt = true; + // Intentional fall-through. + case MVT::i32: + if (!UseImm) + CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW; + else + CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI; + break; + case MVT::i64: + if (!UseImm) + CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD; + else + CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI; + break; + } + + unsigned SrcReg1 = getRegForValue(SrcValue1); + if (SrcReg1 == 0) + return false; + + unsigned SrcReg2 = 0; + if (!UseImm) { + SrcReg2 = getRegForValue(SrcValue2); + if (SrcReg2 == 0) + return false; + } + + if (NeedsExt) { + unsigned ExtReg = createResultReg(&PPC::GPRCRegClass); + if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt)) + return false; + SrcReg1 = ExtReg; + + if (!UseImm) { + unsigned ExtReg = createResultReg(&PPC::GPRCRegClass); + if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt)) + return false; + SrcReg2 = ExtReg; + } + } + + if (!UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg) + .addReg(SrcReg1).addReg(SrcReg2); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg) + .addReg(SrcReg1).addImm(Imm); + + return true; +} + +// Attempt to fast-select a floating-point extend instruction. +bool PPCFastISel::SelectFPExt(const Instruction *I) { + Value *Src = I->getOperand(0); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); + + if (SrcVT != MVT::f32 || DestVT != MVT::f64) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + // No code is generated for a FP extend. + updateValueMap(I, SrcReg); + return true; +} + +// Attempt to fast-select a floating-point truncate instruction. +bool PPCFastISel::SelectFPTrunc(const Instruction *I) { + Value *Src = I->getOperand(0); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); + + if (SrcVT != MVT::f64 || DestVT != MVT::f32) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + // Round the result to single precision. 
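+  // (Illustratively, "fptrunc double %x to float" becomes a single
+  // "frsp fD, fS" here.)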
+ unsigned DestReg = createResultReg(&PPC::F4RCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg) + .addReg(SrcReg); + + updateValueMap(I, DestReg); + return true; +} + +// Move an i32 or i64 value in a GPR to an f64 value in an FPR. +// FIXME: When direct register moves are implemented (see PowerISA 2.07), +// those should be used instead of moving via a stack slot when the +// subtarget permits. +// FIXME: The code here is sloppy for the 4-byte case. Can use a 4-byte +// stack slot and 4-byte store/load sequence. Or just sext the 4-byte +// case to 8 bytes which produces tighter code but wastes stack space. +unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg, + bool IsSigned) { + + // If necessary, extend 32-bit int to 64-bit. + if (SrcVT == MVT::i32) { + unsigned TmpReg = createResultReg(&PPC::G8RCRegClass); + if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned)) + return 0; + SrcReg = TmpReg; + } + + // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary. + Address Addr; + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = MFI.CreateStackObject(8, 8, false); + + // Store the value from the GPR. + if (!PPCEmitStore(MVT::i64, SrcReg, Addr)) + return 0; + + // Load the integer value into an FPR. The kind of load used depends + // on a number of conditions. + unsigned LoadOpc = PPC::LFD; + + if (SrcVT == MVT::i32) { + if (!IsSigned) { + LoadOpc = PPC::LFIWZX; + Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4; + } else if (PPCSubTarget->hasLFIWAX()) { + LoadOpc = PPC::LFIWAX; + Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4; + } + } + + const TargetRegisterClass *RC = &PPC::F8RCRegClass; + unsigned ResultReg = 0; + if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc)) + return 0; + + return ResultReg; +} + +// Attempt to fast-select an integer-to-floating-point conversion. +// FIXME: Once fast-isel has better support for VSX, conversions using +// direct moves should be implemented. +bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { + MVT DstVT; + Type *DstTy = I->getType(); + if (!isTypeLegal(DstTy, DstVT)) + return false; + + if (DstVT != MVT::f32 && DstVT != MVT::f64) + return false; + + Value *Src = I->getOperand(0); + EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true); + if (!SrcEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + + if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && + SrcVT != MVT::i32 && SrcVT != MVT::i64) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (SrcReg == 0) + return false; + + // We can only lower an unsigned convert if we have the newer + // floating-point conversion operations. + if (!IsSigned && !PPCSubTarget->hasFPCVT()) + return false; + + // FIXME: For now we require the newer floating-point conversion operations + // (which are present only on P7 and A2 server models) when converting + // to single-precision float. Otherwise we have to generate a lot of + // fiddly code to avoid double rounding. If necessary, the fiddly code + // can be found in PPCTargetLowering::LowerINT_TO_FP(). + if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT()) + return false; + + // Extend the input if necessary. + if (SrcVT == MVT::i8 || SrcVT == MVT::i16) { + unsigned TmpReg = createResultReg(&PPC::G8RCRegClass); + if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned)) + return false; + SrcVT = MVT::i64; + SrcReg = TmpReg; + } + + // Move the integer value to an FPR. 
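+  // (Illustrative sequence for a signed i32 source when LFIWAX is
+  // available: the value is sign-extended to i64, stored to an 8-byte stack
+  // slot with "std", reloaded with "lfiwax" at slot offset 4 on big-endian
+  // targets (0 on little-endian), and then converted by the FCFID* opcode
+  // chosen below.)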
+  unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
+  if (FPReg == 0)
+    return false;
+
+  // Determine the opcode for the conversion.
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned DestReg = createResultReg(RC);
+  unsigned Opc;
+
+  if (DstVT == MVT::f32)
+    Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS;
+  else
+    Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+    .addReg(FPReg);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+// Move the floating-point value in SrcReg into an integer destination
+// register, and return the register (or zero if we can't handle it).
+// FIXME: When direct register moves are implemented (see PowerISA 2.07),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
+                                      unsigned SrcReg, bool IsSigned) {
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  // Note that if we have STFIWX available, we could use a 4-byte stack
+  // slot for i32, but this being fast-isel we'll just go with the
+  // easiest code gen possible.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the FPR.
+  if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
+    return 0;
+
+  // Reload it into a GPR. If we want an i32, modify the address
+  // to have a 4-byte offset so we load from the right place.
+  if (VT == MVT::i32)
+    Addr.Offset = 4;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+// FIXME: Once fast-isel has better support for VSX, conversions using
+// direct moves should be implemented.
+bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
+  MVT DstVT, SrcVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::i32 && DstVT != MVT::i64)
+    return false;
+
+  // If we don't have FCTIDUZ and we need it, punt to SelectionDAG.
+  if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+  if (!isTypeLegal(SrcTy, SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Convert f32 to f64 if necessary. This is just a meaningless copy
+  // to get the register class right. COPY_TO_REGCLASS is needed since
+  // a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream.
+  const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
+  if (InRC == &PPC::F4RCRegClass) {
+    unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
+      .addReg(SrcReg).addImm(PPC::F8RCRegClassID);
+    SrcReg = TmpReg;
+  }
+
+  // Determine the opcode for the conversion, which takes place
+  // entirely within FPRs.
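+  // For example (illustrative only): "fptosi double %x to i32" selects
+  // FCTIWZ here and PPCMoveToIntReg then moves the result through a stack
+  // slot into a GPR; an unsigned i64 conversion needs FCTIDUZ and was
+  // rejected earlier when the subtarget lacks FPCVT.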
+ unsigned DestReg = createResultReg(&PPC::F8RCRegClass); + unsigned Opc; + + if (DstVT == MVT::i32) + if (IsSigned) + Opc = PPC::FCTIWZ; + else + Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ; + else + Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ; + + // Generate the convert. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(SrcReg); + + // Now move the integer value from a float register to an integer register. + unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned); + if (IntReg == 0) + return false; + + updateValueMap(I, IntReg); + return true; +} + +// Attempt to fast-select a binary integer operation that isn't already +// handled automatically. +bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { + EVT DestVT = TLI.getValueType(DL, I->getType(), true); + + // We can get here in the case when we have a binary operation on a non-legal + // type and the target independent selector doesn't know how to handle it. + if (DestVT != MVT::i16 && DestVT != MVT::i8) + return false; + + // Look at the currently assigned register for this instruction + // to determine the required register class. If there is no register, + // make a conservative choice (don't assign R0). + unsigned AssignedReg = FuncInfo.ValueMap[I]; + const TargetRegisterClass *RC = + (AssignedReg ? MRI.getRegClass(AssignedReg) : + &PPC::GPRC_and_GPRC_NOR0RegClass); + bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass); + + unsigned Opc; + switch (ISDOpcode) { + default: return false; + case ISD::ADD: + Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8; + break; + case ISD::OR: + Opc = IsGPRC ? PPC::OR : PPC::OR8; + break; + case ISD::SUB: + Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8; + break; + } + + unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass); + unsigned SrcReg1 = getRegForValue(I->getOperand(0)); + if (SrcReg1 == 0) return false; + + // Handle case of small immediate operand. + if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) { + const APInt &CIVal = ConstInt->getValue(); + int Imm = (int)CIVal.getSExtValue(); + bool UseImm = true; + if (isInt<16>(Imm)) { + switch (Opc) { + default: + llvm_unreachable("Missing case!"); + case PPC::ADD4: + Opc = PPC::ADDI; + MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass); + break; + case PPC::ADD8: + Opc = PPC::ADDI8; + MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass); + break; + case PPC::OR: + Opc = PPC::ORI; + break; + case PPC::OR8: + Opc = PPC::ORI8; + break; + case PPC::SUBF: + if (Imm == -32768) + UseImm = false; + else { + Opc = PPC::ADDI; + MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass); + Imm = -Imm; + } + break; + case PPC::SUBF8: + if (Imm == -32768) + UseImm = false; + else { + Opc = PPC::ADDI8; + MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass); + Imm = -Imm; + } + break; + } + + if (UseImm) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg) + .addReg(SrcReg1) + .addImm(Imm); + updateValueMap(I, ResultReg); + return true; + } + } + } + + // Reg-reg case. + unsigned SrcReg2 = getRegForValue(I->getOperand(1)); + if (SrcReg2 == 0) return false; + + // Reverse operands for subtract-from. + if (ISDOpcode == ISD::SUB) + std::swap(SrcReg1, SrcReg2); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg1).addReg(SrcReg2); + updateValueMap(I, ResultReg); + return true; +} + +// Handle arguments to a call that we're attempting to fast-select. 
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, *Context);
+
+  // Reserve space for the linkage area on the stack.
+  unsigned LinkageSize = PPCSubTarget->getFrameLowering()->getLinkageSize();
+  CCInfo.AllocateStack(LinkageSize, 8);
+
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and anything that isn't passed in a register.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 || ArgVT == MVT::i1 ||
+        !VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if it is
+  // varargs. Because we cannot tell if this is needed on the caller side, we
+  // have to conservatively assume that it is needed. As such, make sure we
+  // have at least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+  NumBytes = std::max(NumBytes, LinkageSize + 64);
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameSetupOpcode()))
+    .addImm(NumBytes);
+
+  // Prepare to assign register arguments. Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register (unless we're using the fast calling convention).
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+          llvm_unreachable("Failed to emit a sext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::AExt:
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+          llvm_unreachable("Failed to emit a zext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        // FIXME: Not yet handled.
+ llvm_unreachable("Should have bailed before getting here!"); + break; + } + } + + // Copy this argument to the appropriate register. + unsigned ArgReg; + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) { + ArgReg = NextFPR++; + if (CC != CallingConv::Fast) + ++NextGPR; + } else + ArgReg = NextGPR++; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ArgReg).addReg(Arg); + RegArgs.push_back(ArgReg); + } + + return true; +} + +// For a call that we've determined we can fast-select, finish the +// call sequence and generate a copy to obtain the return value (if any). +bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes) { + CallingConv::ID CC = CLI.CallConv; + + // Issue CallSEQ_END. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TII.getCallFrameDestroyOpcode())) + .addImm(NumBytes).addImm(0); + + // Next, generate a copy to obtain the return value. + // FIXME: No multi-register return values yet, though I don't foresee + // any real difficulties there. + if (RetVT != MVT::isVoid) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + CCValAssign &VA = RVLocs[0]; + assert(RVLocs.size() == 1 && "No support for multi-reg return values!"); + assert(VA.isRegLoc() && "Can only return in registers!"); + + MVT DestVT = VA.getValVT(); + MVT CopyVT = DestVT; + + // Ints smaller than a register still arrive in a full 64-bit + // register, so make sure we recognize this. + if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) + CopyVT = MVT::i64; + + unsigned SourcePhysReg = VA.getLocReg(); + unsigned ResultReg = 0; + + if (RetVT == CopyVT) { + const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT); + ResultReg = createResultReg(CpyRC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(SourcePhysReg); + + // If necessary, round the floating result to single precision. + } else if (CopyVT == MVT::f64) { + ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), + ResultReg).addReg(SourcePhysReg); + + // If only the low half of a general register is needed, generate + // a GPRC copy instead of a G8RC copy. (EXTRACT_SUBREG can't be + // used along the fast-isel path (not lowered), and downstream logic + // also doesn't like a direct subreg copy on a physical reg.) + } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) { + ResultReg = createResultReg(&PPC::GPRCRegClass); + // Convert physical register from G8RC to GPRC. + SourcePhysReg -= PPC::X0 - PPC::R0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(SourcePhysReg); + } + + assert(ResultReg && "ResultReg unset!"); + CLI.InRegs.push_back(SourcePhysReg); + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = 1; + } + + return true; +} + +bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { + CallingConv::ID CC = CLI.CallConv; + bool IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + const MCSymbol *Symbol = CLI.Symbol; + + if (!Callee && !Symbol) + return false; + + // Allow SelectionDAG isel to handle tail calls. + if (IsTailCall) + return false; + + // Let SDISel handle vararg functions. + if (IsVarArg) + return false; + + // Handle simple calls for now, with legal return types and + // those that can be extended. 
+ Type *RetTy = CLI.RetTy; + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 && + RetVT != MVT::i8) + return false; + else if (RetVT == MVT::i1 && PPCSubTarget->useCRBits()) + // We can't handle boolean returns when CR bits are in use. + return false; + + // FIXME: No multi-register return values yet. + if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 && + RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 && + RetVT != MVT::f64) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + if (RVLocs.size() > 1) + return false; + } + + // Bail early if more than 8 arguments, as we only currently + // handle arguments passed in registers. + unsigned NumArgs = CLI.OutVals.size(); + if (NumArgs > 8) + return false; + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + + Args.reserve(NumArgs); + ArgRegs.reserve(NumArgs); + ArgVTs.reserve(NumArgs); + ArgFlags.reserve(NumArgs); + + for (unsigned i = 0, ie = NumArgs; i != ie; ++i) { + // Only handle easy calls for now. It would be reasonably easy + // to handle <= 8-byte structures passed ByVal in registers, but we + // have to ensure they are right-justified in the register. + ISD::ArgFlagsTy Flags = CLI.OutFlags[i]; + if (Flags.isInReg() || Flags.isSRet() || Flags.isNest() || Flags.isByVal()) + return false; + + Value *ArgValue = CLI.OutVals[i]; + Type *ArgTy = ArgValue->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8) + return false; + + if (ArgVT.isVector()) + return false; + + unsigned Arg = getRegForValue(ArgValue); + if (Arg == 0) + return false; + + Args.push_back(ArgValue); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Process the arguments. + SmallVector<unsigned, 8> RegArgs; + unsigned NumBytes; + + if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, IsVarArg)) + return false; + + MachineInstrBuilder MIB; + // FIXME: No handling for function pointers yet. This requires + // implementing the function descriptor (OPD) setup. + const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); + if (!GV) { + // patchpoints are a special case; they always dispatch to a pointer value. + // However, we don't actually want to generate the indirect call sequence + // here (that will be generated, as necessary, during asm printing), and + // the call we generate here will be erased by FastISel::selectPatchpoint, + // so don't try very hard... + if (CLI.IsPatchPoint) + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::NOP)); + else + return false; + } else { + // Build direct call with NOP for TOC restore. + // FIXME: We can and should optimize away the NOP for local calls. + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(PPC::BL8_NOP)); + // Add callee. + MIB.addGlobalAddress(GV); + } + + // Add implicit physical register uses to the call. + for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II) + MIB.addReg(RegArgs[II], RegState::Implicit); + + // Direct calls, in both the ELF V1 and V2 ABIs, need the TOC register live + // into the call. 
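+  // (Sketch of the emitted sequence for a direct call:
+  //    bl   callee
+  //    nop
+  // where the linker may rewrite the nop into "ld 2, 24(1)" under ELFv2, or
+  // "ld 2, 40(1)" under ELFv1, to restore the TOC pointer after a
+  // cross-module call; those offsets match the TOC save slots computed in
+  // PPCFrameLowering.)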
+ PPCFuncInfo->setUsesTOCBasePtr(); + MIB.addReg(PPC::X2, RegState::Implicit); + + // Add a register mask with the call-preserved registers. Proper + // defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); + + CLI.Call = MIB; + + // Finish off the call including any return values. + return finishCall(RetVT, CLI, NumBytes); +} + +// Attempt to fast-select a return instruction. +bool PPCFastISel::SelectRet(const Instruction *I) { + + if (!FuncInfo.CanLowerReturn) + return false; + + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + CallingConv::ID CC = F.getCallingConv(); + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, *Context); + CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS); + const Value *RV = Ret->getOperand(0); + + // FIXME: Only one output register for now. + if (ValLocs.size() > 1) + return false; + + // Special case for returning a constant integer of any size - materialize + // the constant as an i64 and copy it to the return register. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) { + CCValAssign &VA = ValLocs[0]; + + unsigned RetReg = VA.getLocReg(); + // We still need to worry about properly extending the sign. For example, + // we could have only a single bit or a constant that needs zero + // extension rather than sign extension. Make sure we pass the return + // value extension property to integer materialization. + unsigned SrcReg = + PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); + + RetRegs.push_back(RetReg); + + } else { + unsigned Reg = getRegForValue(RV); + + if (Reg == 0) + return false; + + // Copy the result values into the output registers. + for (unsigned i = 0; i < ValLocs.size(); ++i) { + + CCValAssign &VA = ValLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + RetRegs.push_back(VA.getLocReg()); + unsigned SrcReg = Reg + VA.getValNo(); + + EVT RVEVT = TLI.getValueType(DL, RV->getType()); + if (!RVEVT.isSimple()) + return false; + MVT RVVT = RVEVT.getSimpleVT(); + MVT DestVT = VA.getLocVT(); + + if (RVVT != DestVT && RVVT != MVT::i8 && + RVVT != MVT::i16 && RVVT != MVT::i32) + return false; + + if (RVVT != DestVT) { + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + llvm_unreachable("Full value assign but types don't match?"); + case CCValAssign::AExt: + case CCValAssign::ZExt: { + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned TmpReg = createResultReg(RC); + if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true)) + return false; + SrcReg = TmpReg; + break; + } + case CCValAssign::SExt: { + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? 
&PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned TmpReg = createResultReg(RC); + if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false)) + return false; + SrcReg = TmpReg; + break; + } + } + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetRegs[i]) + .addReg(SrcReg); + } + } + } + + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(PPC::BLR8)); + + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + + return true; +} + +// Attempt to emit an integer extend of SrcReg into DestReg. Both +// signed and zero extensions are supported. Return false if we +// can't handle it. +bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + unsigned DestReg, bool IsZExt) { + if (DestVT != MVT::i32 && DestVT != MVT::i64) + return false; + if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32) + return false; + + // Signed extensions use EXTSB, EXTSH, EXTSW. + if (!IsZExt) { + unsigned Opc; + if (SrcVT == MVT::i8) + Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64; + else if (SrcVT == MVT::i16) + Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64; + else { + assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??"); + Opc = PPC::EXTSW_32_64; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(SrcReg); + + // Unsigned 32-bit extensions use RLWINM. + } else if (DestVT == MVT::i32) { + unsigned MB; + if (SrcVT == MVT::i8) + MB = 24; + else { + assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??"); + MB = 16; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLWINM), + DestReg) + .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31); + + // Unsigned 64-bit extensions use RLDICL (with a 32-bit source). + } else { + unsigned MB; + if (SrcVT == MVT::i8) + MB = 56; + else if (SrcVT == MVT::i16) + MB = 48; + else + MB = 32; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(PPC::RLDICL_32_64), DestReg) + .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB); + } + + return true; +} + +// Attempt to fast-select an indirect branch instruction. +bool PPCFastISel::SelectIndirectBr(const Instruction *I) { + unsigned AddrReg = getRegForValue(I->getOperand(0)); + if (AddrReg == 0) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::MTCTR8)) + .addReg(AddrReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8)); + + const IndirectBrInst *IB = cast<IndirectBrInst>(I); + for (const BasicBlock *SuccBB : IB->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]); + + return true; +} + +// Attempt to fast-select an integer truncate instruction. +bool PPCFastISel::SelectTrunc(const Instruction *I) { + Value *Src = I->getOperand(0); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); + + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16) + return false; + + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + // The only interesting case is when we need to switch register classes. 
+ if (SrcVT == MVT::i64) { + unsigned ResultReg = createResultReg(&PPC::GPRCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), + ResultReg).addReg(SrcReg, 0, PPC::sub_32); + SrcReg = ResultReg; + } + + updateValueMap(I, SrcReg); + return true; +} + +// Attempt to fast-select an integer extend instruction. +bool PPCFastISel::SelectIntExt(const Instruction *I) { + Type *DestTy = I->getType(); + Value *Src = I->getOperand(0); + Type *SrcTy = Src->getType(); + + bool IsZExt = isa<ZExtInst>(I); + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) return false; + + EVT SrcEVT, DestEVT; + SrcEVT = TLI.getValueType(DL, SrcTy, true); + DestEVT = TLI.getValueType(DL, DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + + // If we know the register class needed for the result of this + // instruction, use it. Otherwise pick the register class of the + // correct size that does not contain X0/R0, since we don't know + // whether downstream uses permit that assignment. + unsigned AssignedReg = FuncInfo.ValueMap[I]; + const TargetRegisterClass *RC = + (AssignedReg ? MRI.getRegClass(AssignedReg) : + (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass : + &PPC::GPRC_and_GPRC_NOR0RegClass)); + unsigned ResultReg = createResultReg(RC); + + if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt)) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +// Attempt to fast-select an instruction that wasn't handled by +// the table-generated machinery. +bool PPCFastISel::fastSelectInstruction(const Instruction *I) { + + switch (I->getOpcode()) { + case Instruction::Load: + return SelectLoad(I); + case Instruction::Store: + return SelectStore(I); + case Instruction::Br: + return SelectBranch(I); + case Instruction::IndirectBr: + return SelectIndirectBr(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::SIToFP: + return SelectIToFP(I, /*IsSigned*/ true); + case Instruction::UIToFP: + return SelectIToFP(I, /*IsSigned*/ false); + case Instruction::FPToSI: + return SelectFPToI(I, /*IsSigned*/ true); + case Instruction::FPToUI: + return SelectFPToI(I, /*IsSigned*/ false); + case Instruction::Add: + return SelectBinaryIntOp(I, ISD::ADD); + case Instruction::Or: + return SelectBinaryIntOp(I, ISD::OR); + case Instruction::Sub: + return SelectBinaryIntOp(I, ISD::SUB); + case Instruction::Call: + return selectCall(I); + case Instruction::Ret: + return SelectRet(I); + case Instruction::Trunc: + return SelectTrunc(I); + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntExt(I); + // Here add other flavors of Instruction::XXX that automated + // cases don't catch. For example, switches are terminators + // that aren't yet handled. + default: + break; + } + return false; +} + +// Materialize a floating-point constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { + // No plans to handle long double here. + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + // All FP constants are loaded from the constant pool. 
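+  // (PowerPC has no floating-point load-immediate instruction, so even a
+  // constant like "double X = 1.5" becomes a constant-pool entry addressed
+  // relative to the TOC pointer in X2, followed by an LFS or LFD of that
+  // entry.)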
+ unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + assert(Align > 0 && "Unexpectedly missing alignment information!"); + unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + CodeModel::Model CModel = TM.getCodeModel(); + + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align); + + unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD; + unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + + PPCFuncInfo->setUsesTOCBasePtr(); + // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)). + if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT), + TmpReg) + .addConstantPoolIndex(Idx).addReg(PPC::X2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addImm(0).addReg(TmpReg).addMemOperand(MMO); + } else { + // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx); + // But for large code model, we must generate a LDtocL followed + // by the LF[SD]. + if (CModel == CodeModel::Large) { + unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), + TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addImm(0).addReg(TmpReg2); + } else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO) + .addReg(TmpReg) + .addMemOperand(MMO); + } + + return DestReg; +} + +// Materialize the address of a global value into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { + assert(VT == MVT::i64 && "Non-address!"); + const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass; + unsigned DestReg = createResultReg(RC); + + // Global values may be plain old object addresses, TLS object + // addresses, constant pool entries, or jump tables. How we generate + // code for these may depend on small, medium, or large code model. + CodeModel::Model CModel = TM.getCodeModel(); + + // FIXME: Jump tables are not yet required because fast-isel doesn't + // handle switches; if that changes, we need them as well. For now, + // what follows assumes everything's a generic (or TLS) global address. + + // FIXME: We don't yet handle the complexity of TLS. + if (GV->isThreadLocal()) + return 0; + + PPCFuncInfo->setUsesTOCBasePtr(); + // For small code model, generate a simple TOC load. 
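+  // Illustrative asm for the cases below, with r3 standing in for DestReg:
+  //   small code model:          ld    r3, gv@toc(r2)
+  //   medium/large, local def:   addis r3, r2, gv@toc@ha
+  //                              addi  r3, r3, gv@toc@l
+  //   medium/large, non-local:   addis r3, r2, gv@toc@ha
+  //                              ld    r3, gv@toc@l(r3)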
+ if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc), + DestReg) + .addGlobalAddress(GV) + .addReg(PPC::X2); + else { + // If the address is an externally defined symbol, a symbol with common + // or externally available linkage, a non-local function address, or a + // jump table address (not yet needed), or if we are generating code + // for large code model, we generate: + // LDtocL(GV, ADDIStocHA(%X2, GV)) + // Otherwise we generate: + // ADDItocL(ADDIStocHA(%X2, GV), GV) + // Either way, start with the ADDIStocHA: + unsigned HighPartReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); + + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), + DestReg).addGlobalAddress(GV).addReg(HighPartReg); + } else { + // Otherwise generate the ADDItocL. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL), + DestReg).addReg(HighPartReg).addGlobalAddress(GV); + } + } + + return DestReg; +} + +// Materialize a 32-bit integer constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm, + const TargetRegisterClass *RC) { + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + unsigned ResultReg = createResultReg(RC); + bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass); + + if (isInt<16>(Imm)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? PPC::LI : PPC::LI8), ResultReg) + .addImm(Imm); + else if (Lo) { + // Both Lo and Hi have nonzero bits. + unsigned TmpReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg) + .addImm(Hi); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg) + .addReg(TmpReg).addImm(Lo); + } else + // Just Hi bits. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg) + .addImm(Hi); + + return ResultReg; +} + +// Materialize a 64-bit integer constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, + const TargetRegisterClass *RC) { + unsigned Remainder = 0; + unsigned Shift = 0; + + // If the value doesn't fit in 32 bits, see if we can shift it + // so that it fits in 32 bits. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros<uint64_t>(Imm); + int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; + + if (isInt<32>(ImmSh)) + Imm = ImmSh; + else { + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Handle the high-order 32 bits (if shifted) or the whole 32 bits + // (if not shifted). + unsigned TmpReg1 = PPCMaterialize32BitInt(Imm, RC); + if (!Shift) + return TmpReg1; + + // If upper 32 bits were not zero, we've built them and need to shift + // them into place. 
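+  // Worked example (illustrative): for Imm = 0x123456789ABCDEF0, no amount
+  // of shifting makes the value fit in 32 bits, so we take Shift = 32 and
+  // Remainder = 0x9ABCDEF0, and the emitted sequence is:
+  //   lis    r3, 0x1234        # r3 = 0x12340000
+  //   ori    r3, r3, 0x5678    # r3 = 0x12345678
+  //   rldicr r3, r3, 32, 31    # r3 = 0x1234567800000000
+  //   oris   r3, r3, 0x9ABC    # insert Remainder's high halfword
+  //   ori    r3, r3, 0xDEF0    # insert Remainder's low halfword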
+ unsigned TmpReg2; + if (Imm) { + TmpReg2 = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLDICR), + TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift); + } else + TmpReg2 = TmpReg1; + + unsigned TmpReg3, Hi, Lo; + if ((Hi = (Remainder >> 16) & 0xFFFF)) { + TmpReg3 = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORIS8), + TmpReg3).addReg(TmpReg2).addImm(Hi); + } else + TmpReg3 = TmpReg2; + + if ((Lo = Remainder & 0xFFFF)) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8), + ResultReg).addReg(TmpReg3).addImm(Lo); + return ResultReg; + } + + return TmpReg3; +} + + +// Materialize an integer constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt) { + // If we're using CR bit registers for i1 values, handle that as a special + // case first. + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { + unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg); + return ImmReg; + } + + if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && + VT != MVT::i8 && VT != MVT::i1) + return 0; + + const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass : + &PPC::GPRCRegClass); + + // If the constant is in range, use a load-immediate. + if (UseSExt && isInt<16>(CI->getSExtValue())) { + unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; + unsigned ImmReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) + .addImm(CI->getSExtValue()); + return ImmReg; + } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) { + unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; + unsigned ImmReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) + .addImm(CI->getZExtValue()); + return ImmReg; + } + + // Construct the constant piecewise. + int64_t Imm = CI->getZExtValue(); + + if (VT == MVT::i64) + return PPCMaterialize64BitInt(Imm, RC); + else if (VT == MVT::i32) + return PPCMaterialize32BitInt(Imm, RC); + + return 0; +} + +// Materialize a constant into a register, and return the register +// number (or zero if we failed to handle it). +unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(DL, C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return PPCMaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return PPCMaterializeGV(GV, VT); + else if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) + return PPCMaterializeInt(CI, VT, VT != MVT::i1); + + return 0; +} + +// Materialize the address created by an alloca into a register, and +// return the register number (or zero if we failed to handle it). +unsigned PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) { + // Don't handle dynamic allocas. 
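+  // (A static alloca lives at a known frame index, so its address is a
+  // single "addi rD, frame-slot, 0" resolved during frame lowering; the
+  // address of a dynamic alloca is only known at run time, so we punt to
+  // SelectionDAG.)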
+ if (!FuncInfo.StaticAllocaMap.count(AI)) return 0; + + MVT VT; + if (!isLoadTypeLegal(AI->getType(), VT)) return 0; + + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + if (SI != FuncInfo.StaticAllocaMap.end()) { + unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8), + ResultReg).addFrameIndex(SI->second).addImm(0); + return ResultReg; + } + + return 0; +} + +// Fold loads into extends when possible. +// FIXME: We can have multiple redundant extend/trunc instructions +// following a load. The folding only picks up one. Extend this +// to check subsequent instructions for the same pattern and remove +// them. Thus ResultReg should be the def reg for the last redundant +// instruction in a chain, and all intervening instructions can be +// removed from parent. Change test/CodeGen/PowerPC/fast-isel-fold.ll +// to add ELF64-NOT: rldicl to the appropriate tests when this works. +bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(LI->getType(), VT)) + return false; + + // Combine load followed by zero- or sign-extend. + bool IsZExt = false; + switch(MI->getOpcode()) { + default: + return false; + + case PPC::RLDICL: + case PPC::RLDICL_32_64: { + IsZExt = true; + unsigned MB = MI->getOperand(3).getImm(); + if ((VT == MVT::i8 && MB <= 56) || + (VT == MVT::i16 && MB <= 48) || + (VT == MVT::i32 && MB <= 32)) + break; + return false; + } + + case PPC::RLWINM: + case PPC::RLWINM8: { + IsZExt = true; + unsigned MB = MI->getOperand(3).getImm(); + if ((VT == MVT::i8 && MB <= 24) || + (VT == MVT::i16 && MB <= 16)) + break; + return false; + } + + case PPC::EXTSB: + case PPC::EXTSB8: + case PPC::EXTSB8_32_64: + /* There is no sign-extending load-byte instruction. */ + return false; + + case PPC::EXTSH: + case PPC::EXTSH8: + case PPC::EXTSH8_32_64: { + if (VT != MVT::i16 && VT != MVT::i8) + return false; + break; + } + + case PPC::EXTSW: + case PPC::EXTSW_32_64: { + if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8) + return false; + break; + } + } + + // See if we can handle this address. + Address Addr; + if (!PPCComputeAddress(LI->getOperand(0), Addr)) + return false; + + unsigned ResultReg = MI->getOperand(0).getReg(); + + if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt)) + return false; + + MI->eraseFromParent(); + return true; +} + +// Attempt to lower call arguments in a faster way than done by +// the selection DAG code. +bool PPCFastISel::fastLowerArguments() { + // Defer to normal argument lowering for now. It's reasonably + // efficient. Consider doing something like ARM to handle the + // case where all args fit in registers, no varargs, no float + // or vector args. + return false; +} + +// Handle materializing integer constants into a register. This is not +// automatically generated for PowerPC, so must be explicitly created here. +unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) { + + if (Opc != ISD::Constant) + return 0; + + // If we're using CR bit registers for i1 values, handle that as a special + // case first. + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { + unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Imm == 0 ? 
PPC::CRUNSET : PPC::CRSET), ImmReg); + return ImmReg; + } + + if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && + VT != MVT::i8 && VT != MVT::i1) + return 0; + + const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass : + &PPC::GPRCRegClass); + if (VT == MVT::i64) + return PPCMaterialize64BitInt(Imm, RC); + else + return PPCMaterialize32BitInt(Imm, RC); +} + +// Override for ADDI and ADDI8 to set the correct register class +// on RHS operand 0. The automatic infrastructure naively assumes +// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost +// for these cases. At the moment, none of the other automatically +// generated RI instructions require special treatment. However, once +// SelectSelect is implemented, "isel" requires similar handling. +// +// Also be conservative about the output register class. Avoid +// assigning R0 or X0 to the output register for GPRC and G8RC +// register classes, as any such result could be used in ADDI, etc., +// where those regs have another meaning. +unsigned PPCFastISel::fastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm) { + if (MachineInstOpcode == PPC::ADDI) + MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass); + else if (MachineInstOpcode == PPC::ADDI8) + MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass); + + const TargetRegisterClass *UseRC = + (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass : + (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC)); + + return FastISel::fastEmitInst_ri(MachineInstOpcode, UseRC, + Op0, Op0IsKill, Imm); +} + +// Override for instructions with one register operand to avoid use of +// R0/X0. The automatic infrastructure isn't aware of the context so +// we must be conservative. +unsigned PPCFastISel::fastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass* RC, + unsigned Op0, bool Op0IsKill) { + const TargetRegisterClass *UseRC = + (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass : + (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC)); + + return FastISel::fastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill); +} + +// Override for instructions with two register operands to avoid use +// of R0/X0. The automatic infrastructure isn't aware of the context +// so we must be conservative. +unsigned PPCFastISel::fastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass* RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + const TargetRegisterClass *UseRC = + (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass : + (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC)); + + return FastISel::fastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill, + Op1, Op1IsKill); +} + +namespace llvm { + // Create the fast instruction selector for PowerPC64 ELF. + FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) { + // Only available on 64-bit ELF for now. 
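+  // (That is, fast-isel is instantiated only for 64-bit SVR4/ELF subtargets;
+  // Darwin and 32-bit SVR4 always fall back to SelectionDAG instruction
+  // selection.)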
+ const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget<PPCSubtarget>(); + if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) + return new PPCFastISel(FuncInfo, LibInfo); + return nullptr; + } +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp new file mode 100644 index 0000000..beab844 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -0,0 +1,1780 @@ +//===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PPC implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "PPCFrameLowering.h" +#include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +/// VRRegNo - Map from a numbered VR register to its enum value. +/// +static const MCPhysReg VRRegNo[] = { + PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , + PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, + PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31 +}; + +static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) { + if (STI.isDarwinABI()) + return STI.isPPC64() ? 16 : 8; + // SVR4 ABI: + return STI.isPPC64() ? 16 : 4; +} + +static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) { + return STI.isELFv2ABI() ? 24 : 40; +} + +static unsigned computeFramePointerSaveOffset(const PPCSubtarget &STI) { + // For the Darwin ABI: + // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area + // for saving the frame pointer (if needed.) While the published ABI has + // not used this slot since at least MacOSX 10.2, there is older code + // around that does use it, and that needs to continue to work. + if (STI.isDarwinABI()) + return STI.isPPC64() ? -8U : -4U; + + // SVR4 ABI: First slot in the general register save area. + return STI.isPPC64() ? -8U : -4U; +} + +static unsigned computeLinkageSize(const PPCSubtarget &STI) { + if (STI.isDarwinABI() || STI.isPPC64()) + return (STI.isELFv2ABI() ? 4 : 6) * (STI.isPPC64() ? 8 : 4); + + // SVR4 ABI: + return 8; +} + +static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) { + if (STI.isDarwinABI()) + return STI.isPPC64() ? -16U : -8U; + + // SVR4 ABI: First slot in the general register save area. + return STI.isPPC64() + ? -16U + : (STI.getTargetMachine().getRelocationModel() == Reloc::PIC_) + ? 
-12U + : -8U; +} + +PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, + STI.getPlatformStackAlignment(), 0), + Subtarget(STI), ReturnSaveOffset(computeReturnSaveOffset(Subtarget)), + TOCSaveOffset(computeTOCSaveOffset(Subtarget)), + FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)), + LinkageSize(computeLinkageSize(Subtarget)), + BasePointerSaveOffset(computeBasePointerSaveOffset(STI)) {} + +// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. +const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( + unsigned &NumEntries) const { + if (Subtarget.isDarwinABI()) { + NumEntries = 1; + if (Subtarget.isPPC64()) { + static const SpillSlot darwin64Offsets = {PPC::X31, -8}; + return &darwin64Offsets; + } else { + static const SpillSlot darwinOffsets = {PPC::R31, -4}; + return &darwinOffsets; + } + } + + // Early exit if not using the SVR4 ABI. + if (!Subtarget.isSVR4ABI()) { + NumEntries = 0; + return nullptr; + } + + // Note that the offsets here overlap, but this is fixed up in + // processFunctionBeforeFrameFinalized. + + static const SpillSlot Offsets[] = { + // Floating-point register save area offsets. + {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::R31, -4}, + {PPC::R30, -8}, + {PPC::R29, -12}, + {PPC::R28, -16}, + {PPC::R27, -20}, + {PPC::R26, -24}, + {PPC::R25, -28}, + {PPC::R24, -32}, + {PPC::R23, -36}, + {PPC::R22, -40}, + {PPC::R21, -44}, + {PPC::R20, -48}, + {PPC::R19, -52}, + {PPC::R18, -56}, + {PPC::R17, -60}, + {PPC::R16, -64}, + {PPC::R15, -68}, + {PPC::R14, -72}, + + // CR save area offset. We map each of the nonvolatile CR fields + // to the slot for CR2, which is the first of the nonvolatile CR + // fields to be assigned, so that we only allocate one save slot. + // See PPCRegisterInfo::hasReservedSpillSlot() for more information. + {PPC::CR2, -4}, + + // VRSAVE save area offset. + {PPC::VRSAVE, -4}, + + // Vector register save area + {PPC::V31, -16}, + {PPC::V30, -32}, + {PPC::V29, -48}, + {PPC::V28, -64}, + {PPC::V27, -80}, + {PPC::V26, -96}, + {PPC::V25, -112}, + {PPC::V24, -128}, + {PPC::V23, -144}, + {PPC::V22, -160}, + {PPC::V21, -176}, + {PPC::V20, -192}}; + + static const SpillSlot Offsets64[] = { + // Floating-point register save area offsets. + {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::X31, -8}, + {PPC::X30, -16}, + {PPC::X29, -24}, + {PPC::X28, -32}, + {PPC::X27, -40}, + {PPC::X26, -48}, + {PPC::X25, -56}, + {PPC::X24, -64}, + {PPC::X23, -72}, + {PPC::X22, -80}, + {PPC::X21, -88}, + {PPC::X20, -96}, + {PPC::X19, -104}, + {PPC::X18, -112}, + {PPC::X17, -120}, + {PPC::X16, -128}, + {PPC::X15, -136}, + {PPC::X14, -144}, + + // VRSAVE save area offset. 
+    {PPC::VRSAVE, -4},
+
+    // Vector register save area.
+    {PPC::V31, -16},
+    {PPC::V30, -32},
+    {PPC::V29, -48},
+    {PPC::V28, -64},
+    {PPC::V27, -80},
+    {PPC::V26, -96},
+    {PPC::V25, -112},
+    {PPC::V24, -128},
+    {PPC::V23, -144},
+    {PPC::V22, -160},
+    {PPC::V21, -176},
+    {PPC::V20, -192}};
+
+  if (Subtarget.isPPC64()) {
+    NumEntries = array_lengthof(Offsets64);
+
+    return Offsets64;
+  } else {
+    NumEntries = array_lengthof(Offsets);
+
+    return Offsets;
+  }
+}
+
+/// RemoveVRSaveCode - We have found that this function does not need any code
+/// to manipulate the VRSAVE register, even though it uses vector registers.
+/// This can happen when the only registers used are known to be live in or out
+/// of the function. Remove all of the VRSAVE-related code from the function.
+/// FIXME: The removal of the code results in a compile failure at -O0 when the
+/// function contains a function call, as the GPR containing the original
+/// VRSAVE contents is spilled and reloaded around the call. Without the prolog
+/// code, the spill instruction refers to an undefined register. This code
+/// needs to account for all uses of that GPR.
+static void RemoveVRSaveCode(MachineInstr *MI) {
+  MachineBasicBlock *Entry = MI->getParent();
+  MachineFunction *MF = Entry->getParent();
+
+  // We know that the MTVRSAVE instruction immediately follows MI. Remove it.
+  MachineBasicBlock::iterator MBBI = MI;
+  ++MBBI;
+  assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
+  MBBI->eraseFromParent();
+
+  bool RemovedAllMTVRSAVEs = true;
+  // See if we can find and remove the MTVRSAVE instruction from all of the
+  // epilog blocks.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+    // A block that ends in a return is an epilog block; look for the
+    // MTVRSAVE instruction in it and remove it.
+    if (I->isReturnBlock()) {
+      bool FoundIt = false;
+      for (MBBI = I->end(); MBBI != I->begin(); ) {
+        --MBBI;
+        if (MBBI->getOpcode() == PPC::MTVRSAVE) {
+          MBBI->eraseFromParent();  // remove it.
+          FoundIt = true;
+          break;
+        }
+      }
+      RemovedAllMTVRSAVEs &= FoundIt;
+    }
+  }
+
+  // If we found and removed all MTVRSAVE instructions, remove the read of
+  // VRSAVE as well.
+  if (RemovedAllMTVRSAVEs) {
+    MBBI = MI;
+    assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
+    --MBBI;
+    assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
+    MBBI->eraseFromParent();
+  }
+
+  // Finally, nuke the UPDATE_VRSAVE.
+  MI->eraseFromParent();
+}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector. Based on the vector registers that have been used,
+// transform this into the appropriate ORI instruction.
+static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
+  MachineFunction *MF = MI->getParent()->getParent();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned UsedRegMask = 0;
+  for (unsigned i = 0; i != 32; ++i)
+    if (MRI.isPhysRegModified(VRRegNo[i]))
+      UsedRegMask |= 1 << (31-i);
+
+  // Live-in and live-out values must already be in the mask, so don't bother
+  // marking them.
+  for (MachineRegisterInfo::livein_iterator
+       I = MF->getRegInfo().livein_begin(),
+       E = MF->getRegInfo().livein_end(); I != E; ++I) {
+    unsigned RegNo = TRI->getEncodingValue(I->first);
+    if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+ } + + // Live out registers appear as use operands on return instructions. + for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end(); + UsedRegMask != 0 && BI != BE; ++BI) { + const MachineBasicBlock &MBB = *BI; + if (!MBB.isReturnBlock()) + continue; + const MachineInstr &Ret = MBB.back(); + for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) { + const MachineOperand &MO = Ret.getOperand(I); + if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg())) + continue; + unsigned RegNo = TRI->getEncodingValue(MO.getReg()); + UsedRegMask &= ~(1 << (31-RegNo)); + } + } + + // If no registers are used, turn this into a copy. + if (UsedRegMask == 0) { + // Remove all VRSAVE code. + RemoveVRSaveCode(MI); + return; + } + + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + + if ((UsedRegMask & 0xFFFF) == UsedRegMask) { + if (DstReg != SrcReg) + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg) + .addReg(SrcReg) + .addImm(UsedRegMask); + else + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg) + .addReg(SrcReg, RegState::Kill) + .addImm(UsedRegMask); + } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) { + if (DstReg != SrcReg) + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg) + .addImm(UsedRegMask >> 16); + else + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg, RegState::Kill) + .addImm(UsedRegMask >> 16); + } else { + if (DstReg != SrcReg) + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg) + .addImm(UsedRegMask >> 16); + else + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg, RegState::Kill) + .addImm(UsedRegMask >> 16); + + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg) + .addReg(DstReg, RegState::Kill) + .addImm(UsedRegMask & 0xFFFF); + } + + // Remove the old UPDATE_VRSAVE instruction. + MI->eraseFromParent(); +} + +static bool spillsCR(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->isCRSpilled(); +} + +static bool spillsVRSAVE(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->isVRSAVESpilled(); +} + +static bool hasSpills(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->hasSpills(); +} + +static bool hasNonRISpills(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->hasNonRISpills(); +} + +/// MustSaveLR - Return true if this function requires that we save the LR +/// register onto the stack in the prolog and restore it in the epilog of the +/// function. +static bool MustSaveLR(const MachineFunction &MF, unsigned LR) { + const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>(); + + // We need a save/restore of LR if there is any def of LR (which is + // defined by calls, including the PIC setup sequence), or if there is + // some use of the LR stack slot (e.g. for builtin_return_address). + // (LR comes in 32 and 64 bit versions.) + MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR); + return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired(); +} + +/// determineFrameLayout - Determine the size of the frame and maximum call +/// frame size. 
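+/// In outline (assuming the red-zone early exit below is not taken):
+///   FrameSize = align(localSize + max(maxCallFrameSize, LinkageSize),
+///                     max(TargetAlign, MaxAlign))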
+unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
+                                                bool UpdateMF,
+                                                bool UseEstimate) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned FrameSize =
+    UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize();
+
+  // Get stack alignments. The frame must be aligned to the greatest of these:
+  unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
+  unsigned MaxAlign = MFI->getMaxAlignment(); // alignment required by frame data
+  unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
+
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+  // If we are a leaf function that uses no more than 224 bytes of stack
+  // space, and we have no frame pointer, no calls, and no dynamic alloca,
+  // then we do not need to adjust the stack pointer (we fit in the Red Zone).
+  // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
+  // stackless code if all local vars are reg-allocated.
+  bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+  unsigned LR = RegInfo->getRARegister();
+  if (!DisableRedZone &&
+      (Subtarget.isPPC64() ||                      // 32-bit SVR4, no stack-
+       !Subtarget.isSVR4ABI() ||                   //   allocated locals.
+       FrameSize == 0) &&
+      FrameSize <= 224 &&                          // Fits in red zone.
+      !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
+      !MFI->adjustsStack() &&                      // No calls.
+      !MustSaveLR(MF, LR) &&
+      !RegInfo->hasBasePointer(MF)) {              // No special alignment.
+    // No need for a frame.
+    if (UpdateMF)
+      MFI->setStackSize(0);
+    return 0;
+  }
+
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+  // Maximum call frame needs to be at least big enough for linkage area.
+  unsigned minCallFrameSize = getLinkageSize();
+  maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+  // Update maximum call frame size.
+  if (UpdateMF)
+    MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+  // Update frame info.
+  if (UpdateMF)
+    MFI->setStackSize(FrameSize);
+
+  return FrameSize;
+}
+
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register.
+bool PPCFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // FIXME: This is pretty much broken by design: hasFP() might be called
+  // really early, before the stack layout has been calculated, and thus it
+  // might return true or false depending on the time of the call.
+  return (MFI->getStackSize()) && needsFP(MF);
+}
+
+// needsFP - Return true if the specified function should have a dedicated
+// frame pointer register. This is true if the function has variable-sized
+// allocas or if frame pointer elimination is disabled.
+bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Naked functions have no stack frame pushed, so we don't have a frame
+  // pointer.
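+  // (E.g. a function marked __attribute__((naked)) gets neither prologue nor
+  // epilogue, so there is nothing for a frame pointer to point at.)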
+ if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) + return false; + + return MF.getTarget().Options.DisableFramePointerElim(MF) || + MFI->hasVarSizedObjects() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + (MF.getTarget().Options.GuaranteedTailCallOpt && + MF.getInfo<PPCFunctionInfo>()->hasFastCall()); +} + +void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { + bool is31 = needsFP(MF); + unsigned FPReg = is31 ? PPC::R31 : PPC::R1; + unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1; + + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + bool HasBP = RegInfo->hasBasePointer(MF); + unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; + unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) + for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) { + --MBBI; + for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { + MachineOperand &MO = MBBI->getOperand(I); + if (!MO.isReg()) + continue; + + switch (MO.getReg()) { + case PPC::FP: + MO.setReg(FPReg); + break; + case PPC::FP8: + MO.setReg(FP8Reg); + break; + case PPC::BP: + MO.setReg(BPReg); + break; + case PPC::BP8: + MO.setReg(BP8Reg); + break; + + } + } + } +} + +bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + unsigned *ScratchRegister) const { + RegScavenger RS; + unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0; + + if (ScratchRegister) + *ScratchRegister = R0; + + // If MBB is an entry or exit block, use R0 as the scratch register + if ((UseAtEnd && MBB->isReturnBlock()) || + (!UseAtEnd && (&MBB->getParent()->front() == MBB))) + return true; + + RS.enterBasicBlock(MBB); + + if (UseAtEnd && !MBB->empty()) { + // The scratch register will be used at the end of the block, so must consider + // all registers used within the block + + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + // If no terminator, back iterator up to previous instruction. + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } + + if (!RS.isRegUsed(R0)) + return true; + + unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? 
&PPC::G8RCRegClass + : &PPC::GPRCRegClass); + + // Make sure the register scavenger was able to find an available register + // If not, use R0 but return false to indicate no register was available and + // R0 must be used (as recommended by the ABI) + if (Reg == 0) + return false; + + if (ScratchRegister) + *ScratchRegister = Reg; + + return true; +} + +bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, false, nullptr); +} + +bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, true, nullptr); +} + +void PPCFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + DebugLoc dl; + bool needsCFI = MMI.hasDebugInfo() || + MF.getFunction()->needsUnwindTableEntry(); + + // Get processor type. + bool isPPC64 = Subtarget.isPPC64(); + // Get the ABI. + bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); + assert((Subtarget.isDarwinABI() || isSVR4ABI) && + "Currently only Darwin and SVR4 ABIs are supported for PowerPC."); + + // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it, + // process it. + if (!isSVR4ABI) + for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) { + if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) { + HandleVRSaveUpdate(MBBI, TII); + break; + } + } + + // Move MBBI back to the beginning of the prologue block. + MBBI = MBB.begin(); + + // Work out frame sizes. + unsigned FrameSize = determineFrameLayout(MF); + int NegFrameSize = -FrameSize; + if (!isInt<32>(NegFrameSize)) + llvm_unreachable("Unhandled stack size!"); + + if (MFI->isFrameAddressTaken()) + replaceFPWithRealFP(MF); + + // Check if the link register (LR) must be saved. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + bool MustSaveLR = FI->mustSaveLR(); + const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs(); + // Do we have a frame pointer and/or base pointer for this function? + bool HasFP = hasFP(MF); + bool HasBP = RegInfo->hasBasePointer(MF); + + unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; + unsigned BPReg = RegInfo->getBaseRegister(MF); + unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; + unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; + unsigned ScratchReg = 0; + unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg + // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.) + const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8 + : PPC::MFLR ); + const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD + : PPC::STW ); + const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU + : PPC::STWU ); + const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX + : PPC::STWUX); + const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8 + : PPC::LIS ); + const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8 + : PPC::ORI ); + const MCInstrDesc& OrInst = TII.get(isPPC64 ? 
PPC::OR8 + : PPC::OR ); + const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8 + : PPC::SUBFC); + const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8 + : PPC::SUBFIC); + + // Regarding this assert: Even though LR is saved in the caller's frame (i.e., + // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no + // Red Zone, an asynchronous event (a form of "callee") could claim a frame & + // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR. + assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) && + "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); + + findScratchRegister(&MBB, false, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + + int LROffset = getReturnSaveOffset(); + + int FPOffset = 0; + if (HasFP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int FPIndex = FI->getFramePointerSaveIndex(); + assert(FPIndex && "No Frame Pointer Save Slot!"); + FPOffset = FFI->getObjectOffset(FPIndex); + } else { + FPOffset = getFramePointerSaveOffset(); + } + } + + int BPOffset = 0; + if (HasBP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int BPIndex = FI->getBasePointerSaveIndex(); + assert(BPIndex && "No Base Pointer Save Slot!"); + BPOffset = FFI->getObjectOffset(BPIndex); + } else { + BPOffset = getBasePointerSaveOffset(); + } + } + + int PBPOffset = 0; + if (FI->usesPICBase()) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int PBPIndex = FI->getPICBasePointerSaveIndex(); + assert(PBPIndex && "No PIC Base Pointer Save Slot!"); + PBPOffset = FFI->getObjectOffset(PBPIndex); + } + + // Get stack alignments. + unsigned MaxAlign = MFI->getMaxAlignment(); + if (HasBP && MaxAlign > 1) + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + + // Frames of 32KB & larger require special handling because they cannot be + // indexed into with a simple STDU/STWU/STD/STW immediate offset operand. + bool isLargeFrame = !isInt<16>(NegFrameSize); + + if (MustSaveLR) + BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg); + + assert((isPPC64 || MustSaveCRs.empty()) && + "Prologue CR saving supported only in 64-bit mode"); + + if (!MustSaveCRs.empty()) { // will only occur for PPC64 + // FIXME: In the ELFv2 ABI, we are not required to save all CR fields. + // If only one or two CR fields are clobbered, it could be more + // efficient to use mfocrf to selectively save just those fields. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg); + for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) + MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill); + } + + if (HasFP) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. + BuildMI(MBB, MBBI, dl, StoreInst) + .addReg(FPReg) + .addImm(FPOffset) + .addReg(SPReg); + + if (FI->usesPICBase()) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. + BuildMI(MBB, MBBI, dl, StoreInst) + .addReg(PPC::R30) + .addImm(PBPOffset) + .addReg(SPReg); + + if (HasBP) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. + BuildMI(MBB, MBBI, dl, StoreInst) + .addReg(BPReg) + .addImm(BPOffset) + .addReg(SPReg); + + if (MustSaveLR) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. 
+ BuildMI(MBB, MBBI, dl, StoreInst) + .addReg(ScratchReg) + .addImm(LROffset) + .addReg(SPReg); + + if (!MustSaveCRs.empty()) // will only occur for PPC64 + BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) + .addReg(TempReg, getKillRegState(true)) + .addImm(8) + .addReg(SPReg); + + // Skip the rest if this is a leaf function & all spills fit in the Red Zone. + if (!FrameSize) return; + + // Adjust stack pointer: r1 += NegFrameSize. + // If there is a preferred stack alignment, align R1 now + + if (HasBP) { + // Save a copy of r1 as the base pointer. + BuildMI(MBB, MBBI, dl, OrInst, BPReg) + .addReg(SPReg) + .addReg(SPReg); + } + + if (HasBP && MaxAlign > 1) { + if (isPPC64) + BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(64 - Log2_32(MaxAlign)); + else // PPC32... + BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(32 - Log2_32(MaxAlign)) + .addImm(31); + if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) + .addReg(TempReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addReg(TempReg, RegState::Kill); + } + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + + } else if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg) + .addReg(SPReg) + .addImm(NegFrameSize) + .addReg(SPReg); + + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + } + + // Add Call Frame Information for the instructions we generated above. + if (needsCFI) { + unsigned CFIIndex; + + if (HasBP) { + // Define CFA in terms of BP. Do this in preference to using FP/SP, + // because if the stack needed aligning then CFA won't be at a fixed + // offset from FP/SP. + unsigned Reg = MRI->getDwarfRegNum(BPReg, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + } else { + // Adjust the definition of CFA to account for the change in SP. + assert(NegFrameSize); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize)); + } + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + if (HasFP) { + // Describe where FP was saved, at a fixed offset from CFA. + unsigned Reg = MRI->getDwarfRegNum(FPReg, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, FPOffset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (FI->usesPICBase()) { + // Describe where FP was saved, at a fixed offset from CFA. + unsigned Reg = MRI->getDwarfRegNum(PPC::R30, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, PBPOffset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (HasBP) { + // Describe where BP was saved, at a fixed offset from CFA. 
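+      // (Illustrative, with a hypothetical BPOffset of -8: this emits a
+      // .cfi_offset record placing BPReg at CFA-8 in the unwind tables.)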
+ unsigned Reg = MRI->getDwarfRegNum(BPReg, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, BPOffset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (MustSaveLR) { + // Describe where LR was saved, at a fixed offset from CFA. + unsigned Reg = MRI->getDwarfRegNum(LRReg, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, LROffset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + + // If there is a frame pointer, copy R1 into R31 + if (HasFP) { + BuildMI(MBB, MBBI, dl, OrInst, FPReg) + .addReg(SPReg) + .addReg(SPReg); + + if (!HasBP && needsCFI) { + // Change the definition of CFA from SP+offset to FP+offset, because SP + // will change at every alloca. + unsigned Reg = MRI->getDwarfRegNum(FPReg, true); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + + if (needsCFI) { + // Describe where callee saved registers were saved, at fixed offsets from + // CFA. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + unsigned Reg = CSI[I].getReg(); + if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue; + + // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just + // subregisters of CR2. We just need to emit a move of CR2. + if (PPC::CRBITRCRegClass.contains(Reg)) + continue; + + // For SVR4, don't emit a move for the CR spill slot if we haven't + // spilled CRs. + if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4) + && MustSaveCRs.empty()) + continue; + + // For 64-bit SVR4 when we have spilled CRs, the spill location + // is SP+8, not a frame-relative slot. + if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) { + // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for + // the whole CR word. In the ELFv2 ABI, every CR that was + // actually saved gets its own CFI record. + unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2; + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(CRReg, true), 8)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + continue; + } + + int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } +} + +void PPCFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl; + + if (MBBI != MBB.end()) + dl = MBBI->getDebugLoc(); + + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + + // Get alignment info so we know how to restore the SP. + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes allocated from the FrameInfo. + int FrameSize = MFI->getStackSize(); + + // Get processor type. + bool isPPC64 = Subtarget.isPPC64(); + // Get the ABI. + bool isSVR4ABI = Subtarget.isSVR4ABI(); + + // Check if the link register (LR) has been saved. 
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + bool MustSaveLR = FI->mustSaveLR(); + const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs(); + // Do we have a frame pointer and/or base pointer for this function? + bool HasFP = hasFP(MF); + bool HasBP = RegInfo->hasBasePointer(MF); + + unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; + unsigned BPReg = RegInfo->getBaseRegister(MF); + unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; + unsigned ScratchReg = 0; + unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg + const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8 + : PPC::MTLR ); + const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD + : PPC::LWZ ); + const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8 + : PPC::LIS ); + const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8 + : PPC::ORI ); + const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8 + : PPC::ADDI ); + const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8 + : PPC::ADD4 ); + + int LROffset = getReturnSaveOffset(); + + int FPOffset = 0; + + findScratchRegister(&MBB, true, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + + if (HasFP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int FPIndex = FI->getFramePointerSaveIndex(); + assert(FPIndex && "No Frame Pointer Save Slot!"); + FPOffset = FFI->getObjectOffset(FPIndex); + } else { + FPOffset = getFramePointerSaveOffset(); + } + } + + int BPOffset = 0; + if (HasBP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int BPIndex = FI->getBasePointerSaveIndex(); + assert(BPIndex && "No Base Pointer Save Slot!"); + BPOffset = FFI->getObjectOffset(BPIndex); + } else { + BPOffset = getBasePointerSaveOffset(); + } + } + + int PBPOffset = 0; + if (FI->usesPICBase()) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int PBPIndex = FI->getPICBasePointerSaveIndex(); + assert(PBPIndex && "No PIC Base Pointer Save Slot!"); + PBPOffset = FFI->getObjectOffset(PBPIndex); + } + + bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn()); + + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + bool UsesTCRet = RetOpcode == PPC::TCRETURNri || + RetOpcode == PPC::TCRETURNdi || + RetOpcode == PPC::TCRETURNai || + RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNdi8 || + RetOpcode == PPC::TCRETURNai8; + + if (UsesTCRet) { + int MaxTCRetDelta = FI->getTailCallSPDelta(); + MachineOperand &StackAdjust = MBBI->getOperand(1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + int Delta = StackAdj - MaxTCRetDelta; + assert((Delta >= 0) && "Delta must be positive"); + if (MaxTCRetDelta>0) + FrameSize += (StackAdj +Delta); + else + FrameSize += StackAdj; + } + } + + // Frames of 32KB & larger require special handling because they cannot be + // indexed into with a simple LD/LWZ immediate offset operand. + bool isLargeFrame = !isInt<16>(FrameSize); + + if (FrameSize) { + // In the prologue, the loaded (or persistent) stack pointer value is offset + // by the STDU/STDUX/STWU/STWUX instruction. Add this offset back now. + + // If this function contained a fastcc call and GuaranteedTailCallOpt is + // enabled (=> hasFastCall()==true) the fastcc call might contain a tail + // call which invalidates the stack pointer value in SP(0). So we use the + // value of R31 in this case. 
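+    // Illustrative note (hypothetical numbers, not emitted verbatim): for a
+    // fastcc frame of 0x12340 bytes, which does not fit a signed 16-bit
+    // immediate, the large-frame path below would produce
+    //   lis  r0, 0x1         # ScratchReg = 0x00010000
+    //   ori  r0, r0, 0x2340  # ScratchReg = 0x00012340
+    //   add  r1, r31, r0     # SP = FP + FrameSize
+    // while a small frame needs only a single addi, and frames with a base
+    // pointer or variable-sized objects reload the back chain from 0(r1).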
+ if (FI->hasFastCall()) { + assert(HasFP && "Expecting a valid frame pointer."); + if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(FPReg).addImm(FrameSize); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(FrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(FrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, AddInst) + .addReg(SPReg) + .addReg(FPReg) + .addReg(ScratchReg); + } + } else if (!isLargeFrame && !HasBP && !MFI->hasVarSizedObjects()) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(SPReg) + .addImm(FrameSize); + } else { + BuildMI(MBB, MBBI, dl, LoadInst, SPReg) + .addImm(0) + .addReg(SPReg); + } + } + + if (MustSaveLR) + BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg) + .addImm(LROffset) + .addReg(SPReg); + + assert((isPPC64 || MustSaveCRs.empty()) && + "Epilogue CR restoring supported only in 64-bit mode"); + + if (!MustSaveCRs.empty()) // will only occur for PPC64 + BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) + .addImm(8) + .addReg(SPReg); + + if (HasFP) + BuildMI(MBB, MBBI, dl, LoadInst, FPReg) + .addImm(FPOffset) + .addReg(SPReg); + + if (FI->usesPICBase()) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. + BuildMI(MBB, MBBI, dl, LoadInst) + .addReg(PPC::R30) + .addImm(PBPOffset) + .addReg(SPReg); + + if (HasBP) + BuildMI(MBB, MBBI, dl, LoadInst, BPReg) + .addImm(BPOffset) + .addReg(SPReg); + + if (!MustSaveCRs.empty()) // will only occur for PPC64 + for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) + BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i]) + .addReg(TempReg, getKillRegState(i == e-1)); + + if (MustSaveLR) + BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg); + + // Callee pop calling convention. Pop parameter/linkage area. Used for tail + // call optimization + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && + MF.getFunction()->getCallingConv() == CallingConv::Fast) { + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned CallerAllocatedAmt = FI->getMinReservedArea(); + + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(SPReg).addImm(CallerAllocatedAmt); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(CallerAllocatedAmt >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(CallerAllocatedAmt & 0xFFFF); + BuildMI(MBB, MBBI, dl, AddInst) + .addReg(SPReg) + .addReg(FPReg) + .addReg(ScratchReg); + } + } else if (RetOpcode == PPC::TCRETURNdi) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). 
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); + } else if (RetOpcode == PPC::TCRETURNai) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); + } else if (RetOpcode == PPC::TCRETURNdi8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri8) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); + } else if (RetOpcode == PPC::TCRETURNai8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } + } +} + +void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + + // Save and clear the LR state. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned LR = RegInfo->getRARegister(); + FI->setMustSaveLR(MustSaveLR(MF, LR)); + SavedRegs.reset(LR); + + // Save R31 if necessary + int FPSI = FI->getFramePointerSaveIndex(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // If the frame pointer save index hasn't been defined yet. + if (!FPSI && needsFP(MF)) { + // Find out what the fix offset of the frame pointer save area. + int FPOffset = getFramePointerSaveOffset(); + // Allocate the frame index for frame pointer save area. + FPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); + // Save the result. + FI->setFramePointerSaveIndex(FPSI); + } + + int BPSI = FI->getBasePointerSaveIndex(); + if (!BPSI && RegInfo->hasBasePointer(MF)) { + int BPOffset = getBasePointerSaveOffset(); + // Allocate the frame index for the base pointer save area. + BPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, BPOffset, true); + // Save the result. + FI->setBasePointerSaveIndex(BPSI); + } + + // Reserve stack space for the PIC Base register (R30). + // Only used in SVR4 32-bit. + if (FI->usesPICBase()) { + int PBPSI = MFI->CreateFixedObject(4, -8, true); + FI->setPICBasePointerSaveIndex(PBPSI); + } + + // Reserve stack space to move the linkage area to in case of a tail call. + int TCSPDelta = 0; + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (TCSPDelta = FI->getTailCallSPDelta()) < 0) { + MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true); + } + + // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the + // function uses CR 2, 3, or 4. + if (!isPPC64 && !isDarwinABI && + (SavedRegs.test(PPC::CR2) || + SavedRegs.test(PPC::CR3) || + SavedRegs.test(PPC::CR4))) { + int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true); + FI->setCRSpillFrameIndex(FrameIdx); + } +} + +void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const { + // Early exit if not using the SVR4 ABI. 
+ if (!Subtarget.isSVR4ABI()) { + addScavengingSpillSlot(MF, RS); + return; + } + + // Get callee saved register information. + MachineFrameInfo *FFI = MF.getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo(); + + // Early exit if no callee saved registers are modified! + if (CSI.empty() && !needsFP(MF)) { + addScavengingSpillSlot(MF, RS); + return; + } + + unsigned MinGPR = PPC::R31; + unsigned MinG8R = PPC::X31; + unsigned MinFPR = PPC::F31; + unsigned MinVR = PPC::V31; + + bool HasGPSaveArea = false; + bool HasG8SaveArea = false; + bool HasFPSaveArea = false; + bool HasVRSAVESaveArea = false; + bool HasVRSaveArea = false; + + SmallVector<CalleeSavedInfo, 18> GPRegs; + SmallVector<CalleeSavedInfo, 18> G8Regs; + SmallVector<CalleeSavedInfo, 18> FPRegs; + SmallVector<CalleeSavedInfo, 18> VRegs; + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (PPC::GPRCRegClass.contains(Reg)) { + HasGPSaveArea = true; + + GPRegs.push_back(CSI[i]); + + if (Reg < MinGPR) { + MinGPR = Reg; + } + } else if (PPC::G8RCRegClass.contains(Reg)) { + HasG8SaveArea = true; + + G8Regs.push_back(CSI[i]); + + if (Reg < MinG8R) { + MinG8R = Reg; + } + } else if (PPC::F8RCRegClass.contains(Reg)) { + HasFPSaveArea = true; + + FPRegs.push_back(CSI[i]); + + if (Reg < MinFPR) { + MinFPR = Reg; + } + } else if (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)) { + ; // do nothing, as we already know whether CRs are spilled + } else if (PPC::VRSAVERCRegClass.contains(Reg)) { + HasVRSAVESaveArea = true; + } else if (PPC::VRRCRegClass.contains(Reg)) { + HasVRSaveArea = true; + + VRegs.push_back(CSI[i]); + + if (Reg < MinVR) { + MinVR = Reg; + } + } else { + llvm_unreachable("Unknown RegisterClass!"); + } + } + + PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + + int64_t LowerBound = 0; + + // Take into account stack space reserved for tail calls. + int TCSPDelta = 0; + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (TCSPDelta = PFI->getTailCallSPDelta()) < 0) { + LowerBound = TCSPDelta; + } + + // The Floating-point register save area is right below the back chain word + // of the previous stack frame. + if (HasFPSaveArea) { + for (unsigned i = 0, e = FPRegs.size(); i != e; ++i) { + int FI = FPRegs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + LowerBound -= (31 - TRI->getEncodingValue(MinFPR) + 1) * 8; + } + + // Check whether the frame pointer register is allocated. If so, make sure it + // is spilled to the correct offset. + if (needsFP(MF)) { + HasGPSaveArea = true; + + int FI = PFI->getFramePointerSaveIndex(); + assert(FI && "No Frame Pointer Save Slot!"); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + if (PFI->usesPICBase()) { + HasGPSaveArea = true; + + int FI = PFI->getPICBasePointerSaveIndex(); + assert(FI && "No PIC Base Pointer Save Slot!"); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + if (RegInfo->hasBasePointer(MF)) { + HasGPSaveArea = true; + + int FI = PFI->getBasePointerSaveIndex(); + assert(FI && "No Base Pointer Save Slot!"); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + // General register save area starts right below the Floating-point + // register save area. 
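+  // Hypothetical illustration of the adjustment below: if X29-X31 are the
+  // lowest-numbered saved GPRs on 64-bit SVR4, their slots are shifted down
+  // past the FPR save area and the bound then drops by
+  // (31 - 29 + 1) * 8 = 24 bytes.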
+ if (HasGPSaveArea || HasG8SaveArea) { + // Move general register save area spill slots down, taking into account + // the size of the Floating-point register save area. + for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) { + int FI = GPRegs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + // Move general register save area spill slots down, taking into account + // the size of the Floating-point register save area. + for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) { + int FI = G8Regs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + unsigned MinReg = + std::min<unsigned>(TRI->getEncodingValue(MinGPR), + TRI->getEncodingValue(MinG8R)); + + if (Subtarget.isPPC64()) { + LowerBound -= (31 - MinReg + 1) * 8; + } else { + LowerBound -= (31 - MinReg + 1) * 4; + } + } + + // For 32-bit only, the CR save area is below the general register + // save area. For 64-bit SVR4, the CR save area is addressed relative + // to the stack pointer and hence does not need an adjustment here. + // Only CR2 (the first nonvolatile spilled) has an associated frame + // index so that we have a single uniform save area. + if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) { + // Adjust the frame index of the CR spill slot. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + + if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2) + // Leave Darwin logic as-is. + || (!Subtarget.isSVR4ABI() && + (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)))) { + int FI = CSI[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + } + + LowerBound -= 4; // The CR save area is always 4 bytes long. + } + + if (HasVRSAVESaveArea) { + // FIXME SVR4: Is it actually possible to have multiple elements in CSI + // which have the VRSAVE register class? + // Adjust the frame index of the VRSAVE spill slot. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + + if (PPC::VRSAVERCRegClass.contains(Reg)) { + int FI = CSI[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + } + + LowerBound -= 4; // The VRSAVE save area is always 4 bytes long. + } + + if (HasVRSaveArea) { + // Insert alignment padding, we need 16-byte alignment. + LowerBound = (LowerBound - 15) & ~(15); + + for (unsigned i = 0, e = VRegs.size(); i != e; ++i) { + int FI = VRegs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + } + + addScavengingSpillSlot(MF, RS); +} + +void +PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, + RegScavenger *RS) const { + // Reserve a slot closest to SP or frame pointer if we have a dynalloc or + // a large stack, which will require scavenging a register to materialize a + // large offset. + + // We need to have a scavenger spill slot for spills if the frame size is + // large. In case there is no free register for large-offset addressing, + // this slot is used for the necessary emergency spill. Also, we need the + // slot for dynamic stack allocations. + + // The scavenger might be invoked if the frame offset does not fit into + // the 16-bit immediate. We don't know the complete frame size here + // because we've not yet computed callee-saved register spills or the + // needed alignment padding. 
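+  // Hypothetical illustration: with a 64 KB frame, isInt<16>(StackSize)
+  // fails below, so frame-index offsets must be materialized into a register,
+  // and an emergency spill slot is reserved so the scavenger can free one.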
+ unsigned StackSize = determineFrameLayout(MF, false, true); + MachineFrameInfo *MFI = MF.getFrameInfo(); + if (MFI->hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || + hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC; + RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + + // Might we have over-aligned allocas? + bool HasAlVars = MFI->hasVarSizedObjects() && + MFI->getMaxAlignment() > getStackAlignment(); + + // These kinds of spills might need two registers. + if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars) + RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + + } +} + +bool +PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + + // Currently, this function only handles SVR4 32- and 64-bit ABIs. + // Return false otherwise to maintain pre-existing behavior. + if (!Subtarget.isSVR4ABI()) + return false; + + MachineFunction *MF = MBB.getParent(); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); + DebugLoc DL; + bool CRSpilled = false; + MachineInstrBuilder CRMIB; + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + // Only Darwin actually uses the VRSAVE register, but it can still appear + // here if, for example, @llvm.eh.unwind.init() is used. If we're not on + // Darwin, ignore it. + if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI()) + continue; + + // CR2 through CR4 are the nonvolatile CR fields. + bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4; + + // Add the callee-saved register as live-in; it's killed at the spill. + MBB.addLiveIn(Reg); + + if (CRSpilled && IsCRField) { + CRMIB.addReg(Reg, RegState::ImplicitKill); + continue; + } + + // Insert the spill to the stack frame. + if (IsCRField) { + PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>(); + if (Subtarget.isPPC64()) { + // The actual spill will happen at the start of the prologue. + FuncInfo->addMustSaveCR(Reg); + } else { + CRSpilled = true; + FuncInfo->setSpillsCR(); + + // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have + // the same frame index in PPCRegisterInfo::hasReservedSpillSlot. + CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12) + .addReg(Reg, RegState::ImplicitKill); + + MBB.insert(MI, CRMIB); + MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) + .addReg(PPC::R12, + getKillRegState(true)), + CSI[i].getFrameIdx())); + } + } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, + CSI[i].getFrameIdx(), RC, TRI); + } + } + return true; +} + +static void +restoreCRs(bool isPPC64, bool is31, + bool CR2Spilled, bool CR3Spilled, bool CR4Spilled, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) { + + MachineFunction *MF = MBB.getParent(); + const PPCInstrInfo &TII = *MF->getSubtarget<PPCSubtarget>().getInstrInfo(); + DebugLoc DL; + unsigned RestoreOp, MoveReg; + + if (isPPC64) + // This is handled during epilogue generation. 
+ return; + else { + // 32-bit: FP-relative + MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ), + PPC::R12), + CSI[CSIIndex].getFrameIdx())); + RestoreOp = PPC::MTOCRF; + MoveReg = PPC::R12; + } + + if (CR2Spilled) + MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2) + .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled))); + + if (CR3Spilled) + MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR3) + .addReg(MoveReg, getKillRegState(!CR4Spilled))); + + if (CR4Spilled) + MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR4) + .addReg(MoveReg, getKillRegState(true))); +} + +void PPCFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + if (MF.getTarget().Options.GuaranteedTailCallOpt && + I->getOpcode() == PPC::ADJCALLSTACKUP) { + // Add (actually subtract) back the amount the callee popped on return. + if (int CalleeAmt = I->getOperand(1).getImm()) { + bool is64Bit = Subtarget.isPPC64(); + CalleeAmt *= -1; + unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1; + unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0; + unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI; + unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4; + unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS; + unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI; + MachineInstr *MI = I; + DebugLoc dl = MI->getDebugLoc(); + + if (isInt<16>(CalleeAmt)) { + BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg) + .addReg(StackReg, RegState::Kill) + .addImm(CalleeAmt); + } else { + MachineBasicBlock::iterator MBBI = I; + BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg) + .addImm(CalleeAmt >> 16); + BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg) + .addReg(TmpReg, RegState::Kill) + .addImm(CalleeAmt & 0xFFFF); + BuildMI(MBB, MBBI, dl, TII.get(ADDInstr), StackReg) + .addReg(StackReg, RegState::Kill) + .addReg(TmpReg); + } + } + } + // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. + MBB.erase(I); +} + +bool +PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + + // Currently, this function only handles SVR4 32- and 64-bit ABIs. + // Return false otherwise to maintain pre-existing behavior. + if (!Subtarget.isSVR4ABI()) + return false; + + MachineFunction *MF = MBB.getParent(); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); + bool CR2Spilled = false; + bool CR3Spilled = false; + bool CR4Spilled = false; + unsigned CSIIndex = 0; + + // Initialize insertion-point logic; we will be restoring in reverse + // order of spill. + MachineBasicBlock::iterator I = MI, BeforeI = I; + bool AtStart = I == MBB.begin(); + + if (!AtStart) + --BeforeI; + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + + // Only Darwin actually uses the VRSAVE register, but it can still appear + // here if, for example, @llvm.eh.unwind.init() is used. If we're not on + // Darwin, ignore it. + if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI()) + continue; + + if (Reg == PPC::CR2) { + CR2Spilled = true; + // The spill slot is associated only with CR2, which is the + // first nonvolatile spilled. Save it here. 
+      CSIIndex = i;
+      continue;
+    } else if (Reg == PPC::CR3) {
+      CR3Spilled = true;
+      continue;
+    } else if (Reg == PPC::CR4) {
+      CR4Spilled = true;
+      continue;
+    } else {
+      // When we first encounter a non-CR register after seeing at
+      // least one CR register, restore all spilled CRs together.
+      if ((CR2Spilled || CR3Spilled || CR4Spilled)
+          && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        bool is31 = needsFP(*MF);
+        restoreCRs(Subtarget.isPPC64(), is31,
+                   CR2Spilled, CR3Spilled, CR4Spilled,
+                   MBB, I, CSI, CSIIndex);
+        CR2Spilled = CR3Spilled = CR4Spilled = false;
+      }
+
+      // Default behavior for non-CR saves.
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+                               RC, TRI);
+      assert(I != MBB.begin() &&
+             "loadRegFromStackSlot didn't insert any code!");
+    }
+
+    // Insert in reverse order.
+    if (AtStart)
+      I = MBB.begin();
+    else {
+      I = BeforeI;
+      ++I;
+    }
+  }
+
+  // If we haven't yet restored the CRs, do so now.
+  if (CR2Spilled || CR3Spilled || CR4Spilled) {
+    bool is31 = needsFP(*MF);
+    restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled,
+               MBB, I, CSI, CSIIndex);
+  }
+
+  return true;
+}
+
+bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+  return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
+          MF.getSubtarget<PPCSubtarget>().isPPC64());
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
new file mode 100644
index 0000000..bbe1329
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -0,0 +1,129 @@
+//===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
+
+#include "PPC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class PPCSubtarget;
+
+class PPCFrameLowering: public TargetFrameLowering {
+  const PPCSubtarget &Subtarget;
+  const unsigned ReturnSaveOffset;
+  const unsigned TOCSaveOffset;
+  const unsigned FramePointerSaveOffset;
+  const unsigned LinkageSize;
+  const unsigned BasePointerSaveOffset;
+
+  /**
+   * \brief Find a register that can be used in function prologue and epilogue
+   *
+   * Find a register that can be used as the scratch register in the function
+   * prologue and epilogue to save various registers (Link Register, Base
+   * Pointer, etc.). Prefer R0, if it is available. If it is not available,
+   * then choose a different register.
+   *
+   * This method will return true if an available register was found (including
+   * R0). If no available registers are found, the method returns false and sets
+   * ScratchRegister to R0, as per the recommendation in the ABI.
+   *
+   * \param[in] MBB The machine basic block to find an available register for
+   * \param[in] UseAtEnd Specify whether the scratch register will be used at
+   *                     the end of the basic block (i.e., will the scratch
+   *                     register kill a register defined in the basic block)
+   * \param[out] ScratchRegister The scratch register to use
+   * \return true if a scratch register was found; false if a scratch register
+   * was not found and R0 is being used as the default.
+   */
+  bool findScratchRegister(MachineBasicBlock *MBB,
+                           bool UseAtEnd,
+                           unsigned *ScratchRegister) const;
+
+public:
+  PPCFrameLowering(const PPCSubtarget &STI);
+
+  unsigned determineFrameLayout(MachineFunction &MF,
+                                bool UpdateMF = true,
+                                bool UseEstimate = false) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool needsFP(const MachineFunction &MF) const;
+  void replaceFPWithRealFP(MachineFunction &MF) const;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS = nullptr) const override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS = nullptr) const override;
+  void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const override;
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const override { return true; }
+
+  /// getReturnSaveOffset - Return the previous frame offset to save the
+  /// return address.
+  unsigned getReturnSaveOffset() const { return ReturnSaveOffset; }
+
+  /// getTOCSaveOffset - Return the previous frame offset to save the
+  /// TOC register -- 64-bit SVR4 ABI only.
+  unsigned getTOCSaveOffset() const { return TOCSaveOffset; }
+
+  /// getFramePointerSaveOffset - Return the previous frame offset to save the
+  /// frame pointer.
+  unsigned getFramePointerSaveOffset() const { return FramePointerSaveOffset; }
+
+  /// getBasePointerSaveOffset - Return the previous frame offset to save the
+  /// base pointer.
+  unsigned getBasePointerSaveOffset() const { return BasePointerSaveOffset; }
+
+  /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+  ///
+  unsigned getLinkageSize() const { return LinkageSize; }
+
+  const SpillSlot *
+  getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+
+  bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+  /// Methods used by shrink wrapping to determine if MBB can be used for the
+  /// function prologue/epilogue.
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; + bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; +}; +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp new file mode 100644 index 0000000..7234e30 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -0,0 +1,433 @@ +//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements hazard recognizers for scheduling on PowerPC processors. +// +//===----------------------------------------------------------------------===// + +#include "PPCHazardRecognizers.h" +#include "PPC.h" +#include "PPCInstrInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "pre-RA-sched" + +bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) { + // FIXME: Move this. + if (isBCTRAfterSet(SU)) + return true; + + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (!MCID) + return false; + + if (!MCID->mayLoad()) + return false; + + // SU is a load; for any predecessors in this dispatch group, that are stores, + // and with which we have an ordering dependency, return true. + for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) { + const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit()); + if (!PredMCID || !PredMCID->mayStore()) + continue; + + if (!SU->Preds[i].isNormalMemory() && !SU->Preds[i].isBarrier()) + continue; + + for (unsigned j = 0, je = CurGroup.size(); j != je; ++j) + if (SU->Preds[i].getSUnit() == CurGroup[j]) + return true; + } + + return false; +} + +bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (!MCID) + return false; + + if (!MCID->isBranch()) + return false; + + // SU is a branch; for any predecessors in this dispatch group, with which we + // have a data dependence and set the counter register, return true. + for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) { + const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit()); + if (!PredMCID || PredMCID->getSchedClass() != PPC::Sched::IIC_SprMTSPR) + continue; + + if (SU->Preds[i].isCtrl()) + continue; + + for (unsigned j = 0, je = CurGroup.size(); j != je; ++j) + if (SU->Preds[i].getSUnit() == CurGroup[j]) + return true; + } + + return false; +} + +// FIXME: Remove this when we don't need this: +namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } } + +// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific. + +bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID, + unsigned &NSlots) { + // FIXME: Indirectly, this information is contained in the itinerary, and + // we should derive it from there instead of separately specifying it + // here. 
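+  // Illustrative example (derived from the table below): an update-form load
+  // such as ldu is classified IIC_LdStLDU and receives NSlots = 2, so it is
+  // reported as must-come-first and begins a new dispatch group whenever the
+  // current group is partially full.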
+ unsigned IIC = MCID->getSchedClass(); + switch (IIC) { + default: + NSlots = 1; + break; + case PPC::Sched::IIC_IntDivW: + case PPC::Sched::IIC_IntDivD: + case PPC::Sched::IIC_LdStLoadUpd: + case PPC::Sched::IIC_LdStLDU: + case PPC::Sched::IIC_LdStLFDU: + case PPC::Sched::IIC_LdStLFDUX: + case PPC::Sched::IIC_LdStLHA: + case PPC::Sched::IIC_LdStLHAU: + case PPC::Sched::IIC_LdStLWA: + case PPC::Sched::IIC_LdStSTDU: + case PPC::Sched::IIC_LdStSTFDU: + NSlots = 2; + break; + case PPC::Sched::IIC_LdStLoadUpdX: + case PPC::Sched::IIC_LdStLDUX: + case PPC::Sched::IIC_LdStLHAUX: + case PPC::Sched::IIC_LdStLWARX: + case PPC::Sched::IIC_LdStLDARX: + case PPC::Sched::IIC_LdStSTDUX: + case PPC::Sched::IIC_LdStSTDCX: + case PPC::Sched::IIC_LdStSTWCX: + case PPC::Sched::IIC_BrMCRX: // mtcr + // FIXME: Add sync/isync (here and in the itinerary). + NSlots = 4; + break; + } + + // FIXME: record-form instructions need a different itinerary class. + if (NSlots == 1 && PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1) + NSlots = 2; + + switch (IIC) { + default: + // All multi-slot instructions must come first. + return NSlots > 1; + case PPC::Sched::IIC_BrCR: // cr logicals + case PPC::Sched::IIC_SprMFCR: + case PPC::Sched::IIC_SprMFCRF: + case PPC::Sched::IIC_SprMTSPR: + return true; + } +} + +ScheduleHazardRecognizer::HazardType +PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + if (Stalls == 0 && isLoadAfterStore(SU)) + return NoopHazard; + + return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); +} + +bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + unsigned NSlots; + if (MCID && mustComeFirst(MCID, NSlots) && CurSlots) + return true; + + return ScoreboardHazardRecognizer::ShouldPreferAnother(SU); +} + +unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) { + // We only need to fill out a maximum of 5 slots here: The 6th slot could + // only be a second branch, and otherwise the next instruction will start a + // new group. + if (isLoadAfterStore(SU) && CurSlots < 6) { + unsigned Directive = + DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective(); + // If we're using a special group-terminating nop, then we need only one. + if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || + Directive == PPC::DIR_PWR8 ) + return 1; + + return 5 - CurSlots; + } + + return ScoreboardHazardRecognizer::PreEmitNoops(SU); +} + +void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (MCID) { + if (CurSlots == 5 || (MCID->isBranch() && CurBranches == 1)) { + CurGroup.clear(); + CurSlots = CurBranches = 0; + } else { + DEBUG(dbgs() << "**** Adding to dispatch group: SU(" << + SU->NodeNum << "): "); + DEBUG(DAG->dumpNode(SU)); + + unsigned NSlots; + bool MustBeFirst = mustComeFirst(MCID, NSlots); + + // If this instruction must come first, but does not, then it starts a + // new group. 
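+      // (Hypothetical walk-through: with CurSlots == 3, a must-come-first
+      // instruction clears CurGroup and restarts the count at slot 0 below.)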
+ if (MustBeFirst && CurSlots) { + CurSlots = CurBranches = 0; + CurGroup.clear(); + } + + CurSlots += NSlots; + CurGroup.push_back(SU); + + if (MCID->isBranch()) + ++CurBranches; + } + } + + return ScoreboardHazardRecognizer::EmitInstruction(SU); +} + +void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() { + return ScoreboardHazardRecognizer::AdvanceCycle(); +} + +void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() { + llvm_unreachable("Bottom-up scheduling not supported"); +} + +void PPCDispatchGroupSBHazardRecognizer::Reset() { + CurGroup.clear(); + CurSlots = CurBranches = 0; + return ScoreboardHazardRecognizer::Reset(); +} + +void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { + unsigned Directive = + DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective(); + // If the group has now filled all of its slots, or if we're using a special + // group-terminating nop, the group is complete. + if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || + Directive == PPC::DIR_PWR8 || CurSlots == 6) { + CurGroup.clear(); + CurSlots = CurBranches = 0; + } else { + CurGroup.push_back(nullptr); + ++CurSlots; + } +} + +//===----------------------------------------------------------------------===// +// PowerPC 970 Hazard Recognizer +// +// This models the dispatch group formation of the PPC970 processor. Dispatch +// groups are bundles of up to five instructions that can contain various mixes +// of instructions. The PPC970 can dispatch a peak of 4 non-branch and one +// branch instruction per-cycle. +// +// There are a number of restrictions to dispatch group formation: some +// instructions can only be issued in the first slot of a dispatch group, & some +// instructions fill an entire dispatch group. Additionally, only branches can +// issue in the 5th (last) slot. +// +// Finally, there are a number of "structural" hazards on the PPC970. These +// conditions cause large performance penalties due to misprediction, recovery, +// and replay logic that has to happen. These cases include setting a CTR and +// branching through it in the same dispatch group, and storing to an address, +// then loading from the same address within a dispatch group. To avoid these +// conditions, we insert no-op instructions when appropriate. +// +// FIXME: This is missing some significant cases: +// 1. Modeling of microcoded instructions. +// 2. Handling of serialized operations. +// 3. Handling of the esoteric cases in "Resource-based Instruction Grouping". +// + +PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG) + : DAG(DAG) { + EndDispatchGroup(); +} + +void PPCHazardRecognizer970::EndDispatchGroup() { + DEBUG(errs() << "=== Start of dispatch group\n"); + NumIssued = 0; + + // Structural hazard info. + HasCTRSet = false; + NumStores = 0; +} + + +PPCII::PPC970_Unit +PPCHazardRecognizer970::GetInstrType(unsigned Opcode, + bool &isFirst, bool &isSingle, + bool &isCracked, + bool &isLoad, bool &isStore) { + const MCInstrDesc &MCID = DAG.TII->get(Opcode); + + isLoad = MCID.mayLoad(); + isStore = MCID.mayStore(); + + uint64_t TSFlags = MCID.TSFlags; + + isFirst = TSFlags & PPCII::PPC970_First; + isSingle = TSFlags & PPCII::PPC970_Single; + isCracked = TSFlags & PPCII::PPC970_Cracked; + return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask); +} + +/// isLoadOfStoredAddress - If we have a load from the previously stored pointer +/// as indicated by StorePtr1/StorePtr2/StoreSize, return true. 
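+/// (The stores are tracked in StoreValue/StoreOffset/StoreSize.) As a
+/// hypothetical example of the partial-overlap case handled below, an 8-byte
+/// store at base+8 conflicts with a 4-byte load at base+12, because
+/// StoreOffset (8) < LoadOffset (12) and StoreOffset + StoreSize (16) >
+/// LoadOffset.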
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                      const Value *LoadValue) const {
+  for (unsigned i = 0, e = NumStores; i != e; ++i) {
+    // Handle exact and commuted addresses.
+    if (LoadValue == StoreValue[i] && LoadOffset == StoreOffset[i])
+      return true;
+
+    // Okay, we don't have an exact match; if this is an indexed offset, see if
+    // we have overlap (which happens during fp->int conversion for example).
+    if (StoreValue[i] == LoadValue) {
+      // Okay, the base pointers match, so we have [c1+r] vs [c2+r]. Check
+      // to see if the load and store actually overlap.
+      if (StoreOffset[i] < LoadOffset) {
+        if (int64_t(StoreOffset[i]+StoreSize[i]) > LoadOffset) return true;
+      } else {
+        if (int64_t(LoadOffset+LoadSize) > StoreOffset[i]) return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getHazardType - We return Hazard for any non-branch instruction that would
+/// terminate the dispatch group, and NoopHazard for any instruction that
+/// would not terminate the dispatch group but would cause a pipeline flush.
+ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "PPC hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return NoHazard;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;
+
+  // We can only issue a PPC970_First/PPC970_Single instruction (such as
+  // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+  if (NumIssued != 0 && (isFirst || isSingle))
+    return Hazard;
+
+  // If this instruction is cracked into two ops by the decoder, we know that
+  // it is not a branch and that it cannot issue if 3 other instructions are
+  // already in the dispatch group.
+  if (isCracked && NumIssued > 2)
+    return Hazard;
+
+  switch (InstrType) {
+  default: llvm_unreachable("Unknown instruction type!");
+  case PPCII::PPC970_FXU:
+  case PPCII::PPC970_LSU:
+  case PPCII::PPC970_FPU:
+  case PPCII::PPC970_VALU:
+  case PPCII::PPC970_VPERM:
+    // Only a branch can issue in the 5th (last) slot, so a non-branch cannot
+    // issue once four instructions are already in the group.
+    if (NumIssued == 4) return Hazard;
+    break;
+  case PPCII::PPC970_CRU:
+    // We can only issue a CR instruction in the first two slots.
+    if (NumIssued >= 2) return Hazard;
+    break;
+  case PPCII::PPC970_BRU:
+    break;
+  }
+
+  // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+  if (HasCTRSet && Opcode == PPC::BCTRL)
+    return NoopHazard;
+
+  // If this is a load following a store, make sure it's not to the same or
+  // overlapping address.
+  if (isLoad && NumStores && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    if (isLoadOfStoredAddress(MO->getSize(),
+                              MO->getOffset(), MO->getValue()))
+      return NoopHazard;
+  }
+
+  return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return;
+
+  // Update structural hazard information.
+ if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true; + + // Track the address stored to. + if (isStore && NumStores < 4 && !MI->memoperands_empty()) { + MachineMemOperand *MO = *MI->memoperands_begin(); + StoreSize[NumStores] = MO->getSize(); + StoreOffset[NumStores] = MO->getOffset(); + StoreValue[NumStores] = MO->getValue(); + ++NumStores; + } + + if (InstrType == PPCII::PPC970_BRU || isSingle) + NumIssued = 4; // Terminate a d-group. + ++NumIssued; + + // If this instruction is cracked into two ops by the decoder, remember that + // we issued two pieces. + if (isCracked) + ++NumIssued; + + if (NumIssued == 5) + EndDispatchGroup(); +} + +void PPCHazardRecognizer970::AdvanceCycle() { + assert(NumIssued < 5 && "Illegal dispatch group!"); + ++NumIssued; + if (NumIssued == 5) + EndDispatchGroup(); +} + +void PPCHazardRecognizer970::Reset() { + EndDispatchGroup(); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h new file mode 100644 index 0000000..4b50214 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -0,0 +1,102 @@ +//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling on PowerPC processors. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H +#define LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H + +#include "PPCInstrInfo.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/ScoreboardHazardRecognizer.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" + +namespace llvm { + +/// PPCDispatchGroupSBHazardRecognizer - This class implements a scoreboard-based +/// hazard recognizer for PPC ooo processors with dispatch-group hazards. +class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer { + const ScheduleDAG *DAG; + SmallVector<SUnit *, 7> CurGroup; + unsigned CurSlots, CurBranches; + + bool isLoadAfterStore(SUnit *SU); + bool isBCTRAfterSet(SUnit *SU); + bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots); +public: + PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData, + const ScheduleDAG *DAG_) : + ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_), + CurSlots(0), CurBranches(0) {} + + HazardType getHazardType(SUnit *SU, int Stalls) override; + bool ShouldPreferAnother(SUnit* SU) override; + unsigned PreEmitNoops(SUnit *SU) override; + void EmitInstruction(SUnit *SU) override; + void AdvanceCycle() override; + void RecedeCycle() override; + void Reset() override; + void EmitNoop() override; +}; + +/// PPCHazardRecognizer970 - This class defines a finite state automata that +/// models the dispatch logic on the PowerPC 970 (aka G5) processor. This +/// promotes good dispatch group formation and implements noop insertion to +/// avoid structural hazards that cause significant performance penalties (e.g. +/// setting the CTR register then branching through it within a dispatch group), +/// or storing then loading from the same address within a dispatch group. 
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+  const ScheduleDAG &DAG;
+
+  unsigned NumIssued;  // Number of insts issued, including advanced cycles.
+
+  // Various things that can cause a structural hazard.
+
+  // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+  bool HasCTRSet;
+
+  // StoreValue/StoreOffset/StoreSize - Keep track of the address of any store.
+  // If we see a load from the same address (or one that aliases it), disallow
+  // the load. We can have up to four stores in one dispatch group, hence we
+  // track up to 4.
+  //
+  // NumStores is zero if we haven't seen a store yet. We keep track of both
+  // operands of the store here, since we support [r+r] and [r+i] addressing.
+  const Value *StoreValue[4];
+  int64_t StoreOffset[4];
+  uint64_t StoreSize[4];
+  unsigned NumStores;
+
+public:
+  PPCHazardRecognizer970(const ScheduleDAG &DAG);
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void Reset() override;
+
+private:
+  /// EndDispatchGroup - Called when we finish a dispatch group.
+  ///
+  void EndDispatchGroup();
+
+  /// GetInstrType - Classify the specified powerpc opcode according to its
+  /// pipeline.
+  PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle, bool &isCracked,
+                                  bool &isLoad, bool &isStore);
+
+  bool isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                             const Value *LoadValue) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 0000000..1eaa811
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,4379 @@
+//===-- PPCISelDAGToDAG.cpp - PPC pattern matching inst selector ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-codegen"
+
+// FIXME: Remove this once the bug has been fixed!
+cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug", +cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); + +static cl::opt<bool> + UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true), + cl::desc("use aggressive ppc isel for bit permutations"), + cl::Hidden); +static cl::opt<bool> BPermRewriterNoMasking( + "ppc-bit-perm-rewriter-stress-rotates", + cl::desc("stress rotate selection in aggressive ppc isel for " + "bit permutations"), + cl::Hidden); + +static cl::opt<bool> EnableBranchHint( + "ppc-use-branch-hint", cl::init(true), + cl::desc("Enable static hinting of branches on ppc"), + cl::Hidden); + +namespace llvm { + void initializePPCDAGToDAGISelPass(PassRegistry&); +} + +namespace { + //===--------------------------------------------------------------------===// + /// PPCDAGToDAGISel - PPC specific code to select PPC machine + /// instructions for SelectionDAG operations. + /// + class PPCDAGToDAGISel : public SelectionDAGISel { + const PPCTargetMachine &TM; + const PPCSubtarget *PPCSubTarget; + const PPCTargetLowering *PPCLowering; + unsigned GlobalBaseReg; + public: + explicit PPCDAGToDAGISel(PPCTargetMachine &tm) + : SelectionDAGISel(tm), TM(tm) { + initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override { + // Make sure we re-emit a set of the global base reg if necessary + GlobalBaseReg = 0; + PPCSubTarget = &MF.getSubtarget<PPCSubtarget>(); + PPCLowering = PPCSubTarget->getTargetLowering(); + SelectionDAGISel::runOnMachineFunction(MF); + + if (!PPCSubTarget->isSVR4ABI()) + InsertVRSaveCode(MF); + + return true; + } + + void PreprocessISelDAG() override; + void PostprocessISelDAG() override; + + /// getI32Imm - Return a target constant with the specified value, of type + /// i32. + inline SDValue getI32Imm(unsigned Imm, SDLoc dl) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i32); + } + + /// getI64Imm - Return a target constant with the specified value, of type + /// i64. + inline SDValue getI64Imm(uint64_t Imm, SDLoc dl) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i64); + } + + /// getSmallIPtrImm - Return a target constant of pointer type. + inline SDValue getSmallIPtrImm(unsigned Imm, SDLoc dl) { + return CurDAG->getTargetConstant( + Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout())); + } + + /// isRotateAndMask - Returns true if Mask and Shift can be folded into a + /// rotate and mask opcode and mask operation. + static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask, + unsigned &SH, unsigned &MB, unsigned &ME); + + /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC + /// base register. Return the virtual register that holds this value. + SDNode *getGlobalBaseReg(); + + SDNode *getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0); + + // Select - Convert the specified operand from a target-independent to a + // target-specific node if it hasn't already been changed. + SDNode *Select(SDNode *N) override; + + SDNode *SelectBitfieldInsert(SDNode *N); + SDNode *SelectBitPermutation(SDNode *N); + + /// SelectCC - Select a comparison of the specified values with the + /// specified condition code, returning the CR# of the expression. + SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl); + + /// SelectAddrImm - Returns true if the address N can be represented by + /// a base register plus a signed 16-bit displacement [r+imm]. 
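+  /// For example, an (add %r3, 16) address yields Base = %r3 and Disp = 16, +  /// matching a D-form memory access such as "lwz 4, 16(3)".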
+  bool SelectAddrImm(SDValue N, SDValue &Disp, + SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + } + + /// SelectAddrImmOffs - Return true if the operand is valid for a preinc + /// immediate field. Note that the operand at this point is already the + /// result of a prior SelectAddressRegImm call. + bool SelectAddrImmOffs(SDValue N, SDValue &Out) const { + if (N.getOpcode() == ISD::TargetConstant || + N.getOpcode() == ISD::TargetGlobalAddress) { + Out = N; + return true; + } + + return false; + } + + /// SelectAddrIdx - Given the specified address, check to see if it can be + /// represented as an indexed [r+r] operation. Returns false if it can + /// be represented by [r+imm], which is preferred. + bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) { + return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG); + } + + /// SelectAddrIdxOnly - Given the specified address, force it to be + /// represented as an indexed [r+r] operation. + bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) { + return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG); + } + + /// SelectAddrImmX4 - Returns true if the address N can be represented by + /// a base register plus a signed 16-bit displacement that is a multiple of 4. + /// Suitable for use by STD and friends. + bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + } + + // Select an address into a single register. + bool SelectAddr(SDValue N, SDValue &Base) { + Base = N; + return true; + } + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. It is always correct to compute the value into + /// a register. The case of adding a (possibly relocatable) constant to a + /// register can be improved, but it is wrong to substitute Reg+Reg for + /// Reg in an asm, because the load or store opcode would have to change. + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + unsigned ConstraintID, + std::vector<SDValue> &OutOps) override { + + switch(ConstraintID) { + default: + errs() << "ConstraintID: " << ConstraintID << "\n"; + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_es: + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + case InlineAsm::Constraint_o: + case InlineAsm::Constraint_Q: + case InlineAsm::Constraint_Z: + case InlineAsm::Constraint_Zy: + // We need to make sure that this one operand does not end up in r0 + // (because we might end up lowering this as 0(%op)). + const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1); + SDLoc dl(Op); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op.getValueType(), + Op, RC), 0); + + OutOps.push_back(NewOp); + return false; + } + return true; + } + + void InsertVRSaveCode(MachineFunction &MF); + + const char *getPassName() const override { + return "PowerPC DAG->DAG Pattern Instruction Selection"; + } + +// Include the pieces autogenerated from the target description.
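+// (PPCGenDAGISel.inc is generated by TableGen from the PPC .td instruction +// patterns; among other things it supplies SelectCode(), the table-driven +// matcher that Select() falls back to for nodes not handled by the +// hand-written cases.)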
+#include "PPCGenDAGISel.inc" + +private: + SDNode *SelectSETCC(SDNode *N); + + void PeepholePPC64(); + void PeepholePPC64ZExt(); + void PeepholeCROps(); + + SDValue combineToCMPB(SDNode *N); + void foldBoolExts(SDValue &Res, SDNode *&N); + + bool AllUsersSelectZero(SDNode *N); + void SwapAllSelectUsers(SDNode *N); + + SDNode *transferMemOperands(SDNode *N, SDNode *Result); + }; +} + +/// InsertVRSaveCode - Once the entire function has been instruction selected, +/// all virtual registers are created and all machine instructions are built, +/// check to see if we need to save/restore VRSAVE. If so, do it. +void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { + // Check to see if this function uses vector registers, which means we have to + // save and restore the VRSAVE register and update it with the regs we use. + // + // In this case, there will be virtual registers of vector type created + // by the scheduler. Detect them now. + bool HasVectorVReg = false; + for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) { + HasVectorVReg = true; + break; + } + } + if (!HasVectorVReg) return; // nothing to do. + + // If we have a vector register, we want to emit code into the entry and exit + // blocks to save and restore the VRSAVE register. We do this here (instead + // of marking all vector instructions as clobbering VRSAVE) for two reasons: + // + // 1. This (trivially) reduces the load on the register allocator, by not + // having to represent the live range of the VRSAVE register. + // 2. This (more significantly) allows us to create a temporary virtual + // register to hold the saved VRSAVE value, allowing this temporary to be + // register allocated, instead of forcing it to be spilled to the stack. + + // Create two vregs - one to hold the VRSAVE register that is live-in to the + // function and one for the value after having bits or'd into it. + unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + + const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo(); + MachineBasicBlock &EntryBB = *Fn.begin(); + DebugLoc dl; + // Emit the following code into the entry block: + // InVRSAVE = MFVRSAVE + // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE + // MTVRSAVE UpdatedVRSAVE + MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point + BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE); + BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE), + UpdatedVRSAVE).addReg(InVRSAVE); + BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE); + + // Find all return blocks, outputting a restore in each epilog. + for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { + if (BB->isReturnBlock()) { + IP = BB->end(); --IP; + + // Skip over all terminator instructions, which are part of the return + // sequence. + MachineBasicBlock::iterator I2 = IP; + while (I2 != BB->begin() && (--I2)->isTerminator()) + IP = I2; + + // Emit: MTVRSAVE InVRSave + BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE); + } + } +} + + +/// getGlobalBaseReg - Output the instructions required to put the +/// base address to use for accessing globals into a register. 
+/// +SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { + if (!GlobalBaseReg) { + const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo(); + // Insert the set of GlobalBaseReg into the first MBB of the function. + MachineBasicBlock &FirstMBB = MF->front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + const Module *M = MF->getFunction()->getParent(); + DebugLoc dl; + + if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) { + if (PPCSubTarget->isTargetELF()) { + GlobalBaseReg = PPC::R30; + if (M->getPICLevel() == PICLevel::Small) { + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR)); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); + MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true); + } else { + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); + unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + BuildMI(FirstMBB, MBBI, dl, + TII.get(PPC::UpdateGBR), GlobalBaseReg) + .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg); + MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true); + } + } else { + GlobalBaseReg = + RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); + } + } else { + GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8)); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg); + } + } + return CurDAG->getRegister(GlobalBaseReg, + PPCLowering->getPointerTy(CurDAG->getDataLayout())) + .getNode(); +} + +/// isIntS16Immediate - This method tests to see if the node is either a 32-bit +/// or 64-bit immediate, and if the value can be accurately represented as a +/// sign extension from a 16-bit value. If so, this returns true and the +/// immediate. +static bool isIntS16Immediate(SDNode *N, short &Imm) { + if (N->getOpcode() != ISD::Constant) + return false; + + Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); + if (N->getValueType(0) == MVT::i32) + return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); + else + return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); +} + +static bool isIntS16Immediate(SDValue Op, short &Imm) { + return isIntS16Immediate(Op.getNode(), Imm); +} + + +/// isInt32Immediate - This method tests to see if the node is a 32-bit constant +/// operand. If so, Imm will receive the 32-bit value. +static bool isInt32Immediate(SDNode *N, unsigned &Imm) { + if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { + Imm = cast<ConstantSDNode>(N)->getZExtValue(); + return true; + } + return false; +} + +/// isInt64Immediate - This method tests to see if the node is a 64-bit constant +/// operand. If so, Imm will receive the 64-bit value. +static bool isInt64Immediate(SDNode *N, uint64_t &Imm) { + if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) { + Imm = cast<ConstantSDNode>(N)->getZExtValue(); + return true; + } + return false; +} + +// isInt32Immediate - This method tests to see if the node is a 32-bit constant +// operand. If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) { + return isInt32Immediate(N.getNode(), Imm); +} + +static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, + const SDValue &DestMBB) { + assert(isa<BasicBlockSDNode>(DestMBB)); + + if (!FuncInfo->BPI) return PPC::BR_NO_HINT; + + const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + const TerminatorInst *BBTerm = BB->getTerminator(); + + if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT; + + const BasicBlock *TBB = BBTerm->getSuccessor(0); + const BasicBlock *FBB = BBTerm->getSuccessor(1); + + auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB); + auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB); + + // We only want to handle cases that are easy to predict statically, e.g. a + // C++ throw statement, which is very likely not taken, or a call to a + // function that never returns, e.g. stdlib exit(). So we set Threshold to + // filter out the unwanted cases. + // + // Below is the LLVM branch weight table; we only want to handle cases 1 and 2. + // + // Case Taken:Nontaken Example + // 1. Unreachable 1048575:1 C++ throw, stdlib exit(), + // 2. Invoke-terminating 1:1048575 + // 3. Coldblock 4:64 __builtin_expect + // 4. Loop Branch 124:4 For loop + // 5. PH/ZH/FPH 20:12 + const uint32_t Threshold = 10000; + + if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb)) + return PPC::BR_NO_HINT; + + DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::" + << BB->getName() << "'\n" + << " -> " << TBB->getName() << ": " << TProb << "\n" + << " -> " << FBB->getName() << ": " << FProb << "\n"); + + const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB); + + // If the Dest BasicBlock is the False-BasicBlock (FBB), swap the branch + // probabilities, because we want 'TProb' to stand for the branch probability + // to the Dest BasicBlock. + if (BBDN->getBasicBlock()->getBasicBlock() != TBB) + std::swap(TProb, FProb); + + return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT; +} + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has an immediate integer right operand. +// If so, Imm will receive the 32-bit value. +static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { + return N->getOpcode() == Opc + && isInt32Immediate(N->getOperand(1).getNode(), Imm); +} + +SDNode *PPCDAGToDAGISel::getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) { + SDLoc dl(SN); + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0)); + unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8; + if (SN->hasOneUse()) + return CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI, + getSmallIPtrImm(Offset, dl)); + return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI, + getSmallIPtrImm(Offset, dl)); +} + +bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask, + bool isShiftMask, unsigned &SH, + unsigned &MB, unsigned &ME) { + // Don't even go down this path for i64, since different logic will be + // necessary for rldicl/rldicr/rldimi.
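+ // For example, N = (srl x, 16) with Mask = 0x0000FFFF (and isShiftMask + // false) yields SH = 16, MB = 16, ME = 31: rotate left by 16 (i.e. 32 - 16) + // and keep the low halfword, a single rlwinm.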
+ if (N->getValueType(0) != MVT::i32) + return false; + + unsigned Shift = 32; + unsigned Indeterminant = ~0; // bit mask marking indeterminate results + unsigned Opcode = N->getOpcode(); + if (N->getNumOperands() != 2 || + !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31)) + return false; + + if (Opcode == ISD::SHL) { + // apply shift left to mask if it comes first + if (isShiftMask) Mask = Mask << Shift; + // determine which bits are made indeterminate by shift + Indeterminant = ~(0xFFFFFFFFu << Shift); + } else if (Opcode == ISD::SRL) { + // apply shift right to mask if it comes first + if (isShiftMask) Mask = Mask >> Shift; + // determine which bits are made indeterminate by shift + Indeterminant = ~(0xFFFFFFFFu >> Shift); + // adjust for the left rotate + Shift = 32 - Shift; + } else if (Opcode == ISD::ROTL) { + Indeterminant = 0; + } else { + return false; + } + + // if the mask doesn't intersect any Indeterminant bits + if (Mask && !(Mask & Indeterminant)) { + SH = Shift & 31; + // make sure the mask is still a mask (wrap-arounds may not be) + return isRunOfOnes(Mask, MB, ME); + } + return false; +} + +/// SelectBitfieldInsert - Turn an OR of two masked values into +/// the rotate left word immediate then mask insert (rlwimi) instruction. +SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc dl(N); + + APInt LKZ, LKO, RKZ, RKO; + CurDAG->computeKnownBits(Op0, LKZ, LKO); + CurDAG->computeKnownBits(Op1, RKZ, RKO); + + unsigned TargetMask = LKZ.getZExtValue(); + unsigned InsertMask = RKZ.getZExtValue(); + + if ((TargetMask | InsertMask) == 0xFFFFFFFF) { + unsigned Op0Opc = Op0.getOpcode(); + unsigned Op1Opc = Op1.getOpcode(); + unsigned Value, SH = 0; + TargetMask = ~TargetMask; + InsertMask = ~InsertMask; + + // If the LHS has a foldable shift and the RHS does not, then swap it to the + // RHS so that we can fold the shift into the insert. + if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) { + if (Op0.getOperand(0).getOpcode() == ISD::SHL || + Op0.getOperand(0).getOpcode() == ISD::SRL) { + if (Op1.getOperand(0).getOpcode() != ISD::SHL && + Op1.getOperand(0).getOpcode() != ISD::SRL) { + std::swap(Op0, Op1); + std::swap(Op0Opc, Op1Opc); + std::swap(TargetMask, InsertMask); + } + } + } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) { + if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL && + Op1.getOperand(0).getOpcode() != ISD::SRL) { + std::swap(Op0, Op1); + std::swap(Op0Opc, Op1Opc); + std::swap(TargetMask, InsertMask); + } + } + + unsigned MB, ME; + if (isRunOfOnes(InsertMask, MB, ME)) { + SDValue Tmp1, Tmp2; + + if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) && + isInt32Immediate(Op1.getOperand(1), Value)) { + Op1 = Op1.getOperand(0); + SH = (Op1Opc == ISD::SHL) ? Value : 32 - Value; + } + if (Op1Opc == ISD::AND) { + // The AND mask might not be a constant, and we need to make sure that + // if we're going to fold the masking with the insert, all bits not + // known to be zero in the mask are known to be one. + APInt MKZ, MKO; + CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO); + bool CanFoldMask = InsertMask == MKO.getZExtValue(); + + unsigned SHOpc = Op1.getOperand(0).getOpcode(); + if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask && + isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) { + // Note that Value must be in range here (less than 32) because + // otherwise there would not be any bits set in InsertMask.
+ Op1 = Op1.getOperand(0).getOperand(0); + SH = (SHOpc == ISD::SHL) ? Value : 32 - Value; + } + } + + SH &= 31; + SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops); + } + } + return nullptr; +} + +// Predict the number of instructions that would be generated by calling +// SelectInt64(N). +static unsigned SelectInt64CountDirect(int64_t Imm) { + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. + unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros<uint64_t>(Imm); + int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt<32>(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + unsigned Result = 0; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + + // Simple value. + if (isInt<16>(Imm)) { + // Just the Lo bits. + ++Result; + } else if (Lo) { + // Handle the Hi bits and Lo bits. + Result += 2; + } else { + // Just the Hi bits. + ++Result; + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) + ++Result; + + // Add in the last bits as required. + if ((Remainder >> 16) & 0xFFFF) + ++Result; + if (Remainder & 0xFFFF) + ++Result; + + return Result; +} + +static uint64_t Rot64(uint64_t Imm, unsigned R) { + return (Imm << R) | (Imm >> (64 - R)); +} + +static unsigned SelectInt64Count(int64_t Imm) { + unsigned Count = SelectInt64CountDirect(Imm); + if (Count == 1) + return Count; + + for (unsigned r = 1; r < 63; ++r) { + uint64_t RImm = Rot64(Imm, r); + unsigned RCount = SelectInt64CountDirect(RImm) + 1; + Count = std::min(Count, RCount); + + // See comments in SelectInt64 for an explanation of the logic below. + unsigned LS = findLastSet(RImm); + if (LS != r-1) + continue; + + uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); + uint64_t RImmWithOnes = RImm | OnesMask; + + RCount = SelectInt64CountDirect(RImmWithOnes) + 1; + Count = std::min(Count, RCount); + } + + return Count; +} + +// Select a 64-bit constant. For cost-modeling purposes, SelectInt64Count +// (above) needs to be kept in sync with this function. +static SDNode *SelectInt64Direct(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) { + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. + unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros<uint64_t>(Imm); + int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt<32>(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + SDNode *Result; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + auto getI32Imm = [CurDAG, dl](unsigned Imm) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i32); + }; + + // Simple value. + if (isInt<16>(Imm)) { + // Just the Lo bits. + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); + } else if (Lo) { + // Handle the Hi bits. + unsigned OpC = Hi ? 
PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); + // And Lo bits. + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } else { + // Just the Hi bits. + Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) { + Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, + SDValue(Result, 0), + getI32Imm(Shift), + getI32Imm(63 - Shift)); + } + + // Add in the last bits as required. + if ((Hi = (Remainder >> 16) & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Hi)); + } + if ((Lo = Remainder & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } + + return Result; +} + +static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) { + unsigned Count = SelectInt64CountDirect(Imm); + if (Count == 1) + return SelectInt64Direct(CurDAG, dl, Imm); + + unsigned RMin = 0; + + int64_t MatImm; + unsigned MaskEnd; + + for (unsigned r = 1; r < 63; ++r) { + uint64_t RImm = Rot64(Imm, r); + unsigned RCount = SelectInt64CountDirect(RImm) + 1; + if (RCount < Count) { + Count = RCount; + RMin = r; + MatImm = RImm; + MaskEnd = 63; + } + + // If the immediate to generate has many trailing zeros, it might be + // worthwhile to generate a rotated value with too many leading ones + // (because that's free with li/lis's sign-extension semantics), and then + // mask them off after rotation. + + unsigned LS = findLastSet(RImm); + // We're adding (63-LS) higher-order ones, and we expect to mask them off + // after performing the inverse rotation by (64-r). So we need that: + // 63-LS == 64-r => LS == r-1 + if (LS != r-1) + continue; + + uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); + uint64_t RImmWithOnes = RImm | OnesMask; + + RCount = SelectInt64CountDirect(RImmWithOnes) + 1; + if (RCount < Count) { + Count = RCount; + RMin = r; + MatImm = RImmWithOnes; + MaskEnd = LS; + } + } + + if (!RMin) + return SelectInt64Direct(CurDAG, dl, Imm); + + auto getI32Imm = [CurDAG, dl](unsigned Imm) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i32); + }; + + SDValue Val = SDValue(SelectInt64Direct(CurDAG, dl, MatImm), 0); + return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val, + getI32Imm(64 - RMin), getI32Imm(MaskEnd)); +} + +// Select a 64-bit constant. +static SDNode *SelectInt64(SelectionDAG *CurDAG, SDNode *N) { + SDLoc dl(N); + + // Get 64 bit value. + int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue(); + return SelectInt64(CurDAG, dl, Imm); +} + +namespace { +class BitPermutationSelector { + struct ValueBit { + SDValue V; + + // The bit number in the value, using a convention where bit 0 is the + // lowest-order bit. 
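+ // For instance, for V = (rotl x, 8) on i32, result bit 8 comes from bit 0 + // of x, so its ValueBit stores x with Idx = 0.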
+ unsigned Idx; + + enum Kind { + ConstZero, + Variable + } K; + + ValueBit(SDValue V, unsigned I, Kind K = Variable) + : V(V), Idx(I), K(K) {} + ValueBit(Kind K = Variable) + : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {} + + bool isZero() const { + return K == ConstZero; + } + + bool hasValue() const { + return K == Variable; + } + + SDValue getValue() const { + assert(hasValue() && "Cannot get the value of a constant bit"); + return V; + } + + unsigned getValueBitIndex() const { + assert(hasValue() && "Cannot get the value bit index of a constant bit"); + return Idx; + } + }; + + // A bit group has the same underlying value and the same rotate factor. + struct BitGroup { + SDValue V; + unsigned RLAmt; + unsigned StartIdx, EndIdx; + + // This rotation amount assumes that the lower 32 bits of the quantity are + // replicated in the high 32 bits by the rotation operator (which is done + // by rlwinm and friends in 64-bit mode). + bool Repl32; + // Did converting to Repl32 == true change the rotation factor? If it did, + // it decreased it by 32. + bool Repl32CR; + // Was this group coalesced after setting Repl32 to true? + bool Repl32Coalesced; + + BitGroup(SDValue V, unsigned R, unsigned S, unsigned E) + : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false), + Repl32Coalesced(false) { + DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R << + " [" << S << ", " << E << "]\n"); + } + }; + + // Information on each (Value, RLAmt) pair (like the number of groups + // associated with each) used to choose the lowering method. + struct ValueRotInfo { + SDValue V; + unsigned RLAmt; + unsigned NumGroups; + unsigned FirstGroupStartIdx; + bool Repl32; + + ValueRotInfo() + : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX), + Repl32(false) {} + + // For sorting (in reverse order) by NumGroups, and then by + // FirstGroupStartIdx. + bool operator < (const ValueRotInfo &Other) const { + // We need to sort so that the non-Repl32 come first because, when we're + // doing masking, the Repl32 bit groups might be subsumed into the 64-bit + // masking operation. + if (Repl32 < Other.Repl32) + return true; + else if (Repl32 > Other.Repl32) + return false; + else if (NumGroups > Other.NumGroups) + return true; + else if (NumGroups < Other.NumGroups) + return false; + else if (FirstGroupStartIdx < Other.FirstGroupStartIdx) + return true; + return false; + } + }; + + // Return true if something interesting was deduced, return false if we're + // providing only a generic representation of V (or something else likewise + // uninteresting for instruction selection). + bool getValueBits(SDValue V, SmallVector<ValueBit, 64> &Bits) { + switch (V.getOpcode()) { + default: break; + case ISD::ROTL: + if (isa<ConstantSDNode>(V.getOperand(1))) { + unsigned RotAmt = V.getConstantOperandVal(1); + + SmallVector<ValueBit, 64> LHSBits(Bits.size()); + getValueBits(V.getOperand(0), LHSBits); + + for (unsigned i = 0; i < Bits.size(); ++i) + Bits[i] = LHSBits[i < RotAmt ? 
i + (Bits.size() - RotAmt) : i - RotAmt]; + + return true; + } + break; + case ISD::SHL: + if (isa<ConstantSDNode>(V.getOperand(1))) { + unsigned ShiftAmt = V.getConstantOperandVal(1); + + SmallVector<ValueBit, 64> LHSBits(Bits.size()); + getValueBits(V.getOperand(0), LHSBits); + + for (unsigned i = ShiftAmt; i < Bits.size(); ++i) + Bits[i] = LHSBits[i - ShiftAmt]; + + for (unsigned i = 0; i < ShiftAmt; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + + return true; + } + break; + case ISD::SRL: + if (isa<ConstantSDNode>(V.getOperand(1))) { + unsigned ShiftAmt = V.getConstantOperandVal(1); + + SmallVector<ValueBit, 64> LHSBits(Bits.size()); + getValueBits(V.getOperand(0), LHSBits); + + for (unsigned i = 0; i < Bits.size() - ShiftAmt; ++i) + Bits[i] = LHSBits[i + ShiftAmt]; + + for (unsigned i = Bits.size() - ShiftAmt; i < Bits.size(); ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + + return true; + } + break; + case ISD::AND: + if (isa<ConstantSDNode>(V.getOperand(1))) { + uint64_t Mask = V.getConstantOperandVal(1); + + SmallVector<ValueBit, 64> LHSBits(Bits.size()); + bool LHSTrivial = getValueBits(V.getOperand(0), LHSBits); + + for (unsigned i = 0; i < Bits.size(); ++i) + if (((Mask >> i) & 1) == 1) + Bits[i] = LHSBits[i]; + else + Bits[i] = ValueBit(ValueBit::ConstZero); + + // Mark this as interesting, only if the LHS was also interesting. This + // prevents the overall procedure from matching a single immediate 'and' + // (which is non-optimal because such an and might be folded with other + // things if we don't select it here). + return LHSTrivial; + } + break; + case ISD::OR: { + SmallVector<ValueBit, 64> LHSBits(Bits.size()), RHSBits(Bits.size()); + getValueBits(V.getOperand(0), LHSBits); + getValueBits(V.getOperand(1), RHSBits); + + bool AllDisjoint = true; + for (unsigned i = 0; i < Bits.size(); ++i) + if (LHSBits[i].isZero()) + Bits[i] = RHSBits[i]; + else if (RHSBits[i].isZero()) + Bits[i] = LHSBits[i]; + else { + AllDisjoint = false; + break; + } + + if (!AllDisjoint) + break; + + return true; + } + } + + for (unsigned i = 0; i < Bits.size(); ++i) + Bits[i] = ValueBit(V, i); + + return false; + } + + // For each value (except the constant ones), compute the left-rotate amount + // to get it from its original to final position. + void computeRotationAmounts() { + HasZeros = false; + RLAmt.resize(Bits.size()); + for (unsigned i = 0; i < Bits.size(); ++i) + if (Bits[i].hasValue()) { + unsigned VBI = Bits[i].getValueBitIndex(); + if (i >= VBI) + RLAmt[i] = i - VBI; + else + RLAmt[i] = Bits.size() - (VBI - i); + } else if (Bits[i].isZero()) { + HasZeros = true; + RLAmt[i] = UINT32_MAX; + } else { + llvm_unreachable("Unknown value bit type"); + } + } + + // Collect groups of consecutive bits with the same underlying value and + // rotation factor. If we're doing late masking, we ignore zeros, otherwise + // they break up groups. + void collectBitGroups(bool LateMask) { + BitGroups.clear(); + + unsigned LastRLAmt = RLAmt[0]; + SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue(); + unsigned LastGroupStartIdx = 0; + for (unsigned i = 1; i < Bits.size(); ++i) { + unsigned ThisRLAmt = RLAmt[i]; + SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue(); + if (LateMask && !ThisValue) { + ThisValue = LastValue; + ThisRLAmt = LastRLAmt; + // If we're doing late masking, then the first bit group always starts + // at zero (even if the first bits were zero). 
+ if (BitGroups.empty()) + LastGroupStartIdx = 0; + } + + // If this bit has the same underlying value and the same rotate factor as + // the last one, then they're part of the same group. + if (ThisRLAmt == LastRLAmt && ThisValue == LastValue) + continue; + + if (LastValue.getNode()) + BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx, + i-1)); + LastRLAmt = ThisRLAmt; + LastValue = ThisValue; + LastGroupStartIdx = i; + } + if (LastValue.getNode()) + BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx, + Bits.size()-1)); + + if (BitGroups.empty()) + return; + + // We might be able to combine the first and last groups. + if (BitGroups.size() > 1) { + // If the first and last groups are the same, then remove the first group + // in favor of the last group, making the ending index of the last group + // equal to the ending index of the to-be-removed first group. + if (BitGroups[0].StartIdx == 0 && + BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 && + BitGroups[0].V == BitGroups[BitGroups.size()-1].V && + BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) { + DEBUG(dbgs() << "\tcombining final bit group with initial one\n"); + BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx; + BitGroups.erase(BitGroups.begin()); + } + } + } + + // Take all (SDValue, RLAmt) pairs and sort them by the number of groups + // associated with each. If there is a degeneracy, pick the one that occurs + // first (in the final value). + void collectValueRotInfo() { + ValueRots.clear(); + + for (auto &BG : BitGroups) { + unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0); + ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)]; + VRI.V = BG.V; + VRI.RLAmt = BG.RLAmt; + VRI.Repl32 = BG.Repl32; + VRI.NumGroups += 1; + VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx); + } + + // Now that we've collected the various ValueRotInfo instances, we need to + // sort them. + ValueRotsVec.clear(); + for (auto &I : ValueRots) { + ValueRotsVec.push_back(I.second); + } + std::sort(ValueRotsVec.begin(), ValueRotsVec.end()); + } + + // In 64-bit mode, rlwinm and friends have a rotation operator that + // replicates the low-order 32 bits into the high-order 32-bits. The mask + // indices of these instructions can only be in the lower 32 bits, so they + // can only represent some 64-bit bit groups. However, when they can be used, + // the 32-bit replication can be used to represent, as a single bit group, + // otherwise separate bit groups. We'll convert to replicated-32-bit bit + // groups when possible. Returns true if any of the bit groups were + // converted. + void assignRepl32BitGroups() { + // If we have bits like this: + // + // Indices: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + // V bits: ... 7 6 5 4 3 2 1 0 31 30 29 28 27 26 25 24 + // Groups: | RLAmt = 8 | RLAmt = 40 | + // + // But, making use of a 32-bit operation that replicates the low-order 32 + // bits into the high-order 32 bits, this can be one bit group with a RLAmt + // of 8. 
+ + auto IsAllLow32 = [this](BitGroup & BG) { + if (BG.StartIdx <= BG.EndIdx) { + for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + } else { + for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + for (unsigned i = 0; i <= BG.EndIdx; ++i) { + if (!Bits[i].hasValue()) + continue; + if (Bits[i].getValueBitIndex() >= 32) + return false; + } + } + + return true; + }; + + for (auto &BG : BitGroups) { + if (BG.StartIdx < 32 && BG.EndIdx < 32) { + if (IsAllLow32(BG)) { + if (BG.RLAmt >= 32) { + BG.RLAmt -= 32; + BG.Repl32CR = true; + } + + BG.Repl32 = true; + + DEBUG(dbgs() << "\t32-bit replicated bit group for " << + BG.V.getNode() << " RLAmt = " << BG.RLAmt << + " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n"); + } + } + } + + // Now walk through the bit groups, consolidating where possible. + for (auto I = BitGroups.begin(); I != BitGroups.end();) { + // We might want to remove this bit group by merging it with the previous + // group (which might be the ending group). + auto IP = (I == BitGroups.begin()) ? + std::prev(BitGroups.end()) : std::prev(I); + if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt && + I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) { + + DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " << + I->V.getNode() << " RLAmt = " << I->RLAmt << + " [" << I->StartIdx << ", " << I->EndIdx << + "] with group with range [" << + IP->StartIdx << ", " << IP->EndIdx << "]\n"); + + IP->EndIdx = I->EndIdx; + IP->Repl32CR = IP->Repl32CR || I->Repl32CR; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + continue; + } else { + // There is a special case worth handling: If there is a single group + // covering the entire upper 32 bits, and it can be merged with both + // the next and previous groups (which might be the same group), then + // do so. If it is the same group (so there will be only one group in + // total), then we need to reverse the order of the range so that it + // covers the entire 64 bits. + if (I->StartIdx == 32 && I->EndIdx == 63) { + assert(std::next(I) == BitGroups.end() && + "bit group ends at index 63 but there is another?"); + auto IN = BitGroups.begin(); + + if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V && + (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt && + IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP && + IsAllLow32(*I)) { + + DEBUG(dbgs() << "\tcombining bit group for " << + I->V.getNode() << " RLAmt = " << I->RLAmt << + " [" << I->StartIdx << ", " << I->EndIdx << + "] with 32-bit replicated groups with ranges [" << + IP->StartIdx << ", " << IP->EndIdx << "] and [" << + IN->StartIdx << ", " << IN->EndIdx << "]\n"); + + if (IP == IN) { + // There is only one other group; change it to cover the whole + // range (backward, so that it can still be Repl32 but cover the + // whole 64-bit range). + IP->StartIdx = 31; + IP->EndIdx = 30; + IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + } else { + // There are two separate groups, one before this group and one + // after us (at the beginning). We're going to remove this group, + // but also the group at the very beginning. 
+ IP->EndIdx = IN->EndIdx; + IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32; + IP->Repl32Coalesced = true; + I = BitGroups.erase(I); + BitGroups.erase(BitGroups.begin()); + } + + // This must be the last group in the vector (and we might have + // just invalidated the iterator above), so break here. + break; + } + } + } + + ++I; + } + } + + SDValue getI32Imm(unsigned Imm, SDLoc dl) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i32); + } + + uint64_t getZerosMask() { + uint64_t Mask = 0; + for (unsigned i = 0; i < Bits.size(); ++i) { + if (Bits[i].hasValue()) + continue; + Mask |= (UINT64_C(1) << i); + } + + return ~Mask; + } + + // Depending on the number of groups for a particular value, it might be + // better to rotate, mask explicitly (using andi/andis), and then or the + // result. Select this part of the result first. + void SelectAndParts32(SDLoc dl, SDValue &Res, unsigned *InstCnt) { + if (BPermRewriterNoMasking) + return; + + for (ValueRotInfo &VRI : ValueRotsVec) { + unsigned Mask = 0; + for (unsigned i = 0; i < Bits.size(); ++i) { + if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V) + continue; + if (RLAmt[i] != VRI.RLAmt) + continue; + Mask |= (1u << i); + } + + // Compute the masks for andi/andis that would be necessary. + unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16; + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in mask for value bit groups"); + bool NeedsRotate = VRI.RLAmt != 0; + + // We're trying to minimize the number of instructions. If we have one + // group, using one of andi/andis can break even. If we have three + // groups, we can use both andi and andis and break even (to use both + // andi and andis we also need to or the results together). We need four + // groups if we also need to rotate. To use andi/andis we need to do more + // than break even because rotate-and-mask instructions tend to be easier + // to schedule. + + // FIXME: We've biased here against using andi/andis, which is right for + // POWER cores, but not optimal everywhere. For example, on the A2, + // andi/andis have single-cycle latency whereas the rotate-and-mask + // instructions take two cycles, and it would be better to bias toward + // andi/andis in break-even cases. 
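+ // For example, a value needing a rotate, with both halfword masks nonzero + // and a partial result already in Res, counts 1 (rlwinm) + 2 (andi. and + // andis.) + 1 (or of the two halves) + 1 (or into the existing Res) = 5 + // instructions.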
+ + unsigned NumAndInsts = (unsigned) NeedsRotate + + (unsigned) (ANDIMask != 0) + + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0) + + (unsigned) (bool) Res; + + DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << + " RL: " << VRI.RLAmt << ":" << + "\n\t\t\tisel using masking: " << NumAndInsts << + " using rotates: " << VRI.NumGroups << "\n"); + + if (NumAndInsts >= VRI.NumGroups) + continue; + + DEBUG(dbgs() << "\t\t\t\tusing masking\n"); + + if (InstCnt) *InstCnt += NumAndInsts; + + SDValue VRot; + if (VRI.RLAmt) { + SDValue Ops[] = + { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl), + getI32Imm(31, dl) }; + VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + Ops), 0); + } else { + VRot = VRI.V; + } + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, + VRot, getI32Imm(ANDIMask, dl)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, + VRot, getI32Imm(ANDISMask, dl)), 0); + + SDValue TotalVal; + if (!ANDIVal) + TotalVal = ANDISVal; + else if (!ANDISVal) + TotalVal = ANDIVal; + else + TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, + ANDIVal, ANDISVal), 0); + + if (!Res) + Res = TotalVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, + Res, TotalVal), 0); + + // Now, remove all groups with this underlying value and rotation + // factor. + eraseMatchingBitGroups([VRI](const BitGroup &BG) { + return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt; + }); + } + } + + // Instruction selection for the 32-bit case. + SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) { + SDLoc dl(N); + SDValue Res; + + if (InstCnt) *InstCnt = 0; + + // Take care of cases that should use andi/andis first. + SelectAndParts32(dl, Res, InstCnt); + + // If we've not yet selected a 'starting' instruction, and we have no zeros + // to fill in, select the (Value, RLAmt) with the highest priority (largest + // number of groups), and start with this rotated value. + if ((!HasZeros || LateMask) && !Res) { + ValueRotInfo &VRI = ValueRotsVec[0]; + if (VRI.RLAmt) { + if (InstCnt) *InstCnt += 1; + SDValue Ops[] = + { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl), + getI32Imm(31, dl) }; + Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), + 0); + } else { + Res = VRI.V; + } + + // Now, remove all groups with this underlying value and rotation factor. + eraseMatchingBitGroups([VRI](const BitGroup &BG) { + return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt; + }); + } + + if (InstCnt) *InstCnt += BitGroups.size(); + + // Insert the other groups (one at a time). 
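+ // For example, a group {V, RLAmt = 24, bits [8, 15]} is inserted as + // rlwimi Res, V, 24, 16, 23 (MB/ME count from the most significant bit).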
+ for (auto &BG : BitGroups) { + if (!Res) { + SDValue Ops[] = + { BG.V, getI32Imm(BG.RLAmt, dl), + getI32Imm(Bits.size() - BG.EndIdx - 1, dl), + getI32Imm(Bits.size() - BG.StartIdx - 1, dl) }; + Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + } else { + SDValue Ops[] = + { Res, BG.V, getI32Imm(BG.RLAmt, dl), + getI32Imm(Bits.size() - BG.EndIdx - 1, dl), + getI32Imm(Bits.size() - BG.StartIdx - 1, dl) }; + Res = SDValue(CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops), 0); + } + } + + if (LateMask) { + unsigned Mask = (unsigned) getZerosMask(); + + unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16; + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in zeros mask?"); + + if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) + + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, + Res, getI32Imm(ANDIMask, dl)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, + Res, getI32Imm(ANDISMask, dl)), 0); + + if (!ANDIVal) + Res = ANDISVal; + else if (!ANDISVal) + Res = ANDIVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, + ANDIVal, ANDISVal), 0); + } + + return Res.getNode(); + } + + unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32, + unsigned MaskStart, unsigned MaskEnd, + bool IsIns) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (Repl32) + return 1; + + if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) || + InstMaskEnd == 63 - RLAmt) + return 1; + + return 2; + } + + // For 64-bit values, not all combinations of rotates and masks are + // available. Produce one if it is available. + SDValue SelectRotMask64(SDValue V, SDLoc dl, unsigned RLAmt, bool Repl32, + unsigned MaskStart, unsigned MaskEnd, + unsigned *InstCnt = nullptr) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (InstCnt) *InstCnt += 1; + + if (Repl32) { + // This rotation amount assumes that the lower 32 bits of the quantity + // are replicated in the high 32 bits by the rotation operator (which is + // done by rlwinm and friends). + assert(InstMaskStart >= 32 && "Mask cannot start out of range"); + assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); + SDValue Ops[] = + { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), + getI32Imm(InstMaskEnd - 32, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64, + Ops), 0); + } + + if (InstMaskEnd == 63) { + SDValue Ops[] = + { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0); + } + + if (InstMaskStart == 0) { + SDValue Ops[] = + { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskEnd, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0); + } + + if (InstMaskEnd == 63 - RLAmt) { + SDValue Ops[] = + { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0); + } + + // We cannot do this with a single instruction, so we'll use two. 
The + // problem is that we're not free to choose both a rotation amount and mask + // start and end independently. We can choose an arbitrary mask start and + // end, but then the rotation amount is fixed. Rotation, however, can be + // inverted, and so by applying an "inverse" rotation first, we can get the + // desired result. + if (InstCnt) *InstCnt += 1; + + // The rotation mask for the second instruction must be MaskStart. + unsigned RLAmt2 = MaskStart; + // The first instruction must rotate V so that the overall rotation amount + // is RLAmt. + unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; + if (RLAmt1) + V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); + return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd); + } + + // For 64-bit values, not all combinations of rotates and masks are + // available. Produce a rotate-mask-and-insert if one is available. + SDValue SelectRotMaskIns64(SDValue Base, SDValue V, SDLoc dl, unsigned RLAmt, + bool Repl32, unsigned MaskStart, + unsigned MaskEnd, unsigned *InstCnt = nullptr) { + // In the notation used by the instructions, 'start' and 'end' are reversed + // because bits are counted from high to low order. + unsigned InstMaskStart = 64 - MaskEnd - 1, + InstMaskEnd = 64 - MaskStart - 1; + + if (InstCnt) *InstCnt += 1; + + if (Repl32) { + // This rotation amount assumes that the lower 32 bits of the quantity + // are replicated in the high 32 bits by the rotation operator (which is + // done by rlwinm and friends). + assert(InstMaskStart >= 32 && "Mask cannot start out of range"); + assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); + SDValue Ops[] = + { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), + getI32Imm(InstMaskEnd - 32, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64, + Ops), 0); + } + + if (InstMaskEnd == 63 - RLAmt) { + SDValue Ops[] = + { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0); + } + + // We cannot do this with a single instruction, so we'll use two. The + // problem is that we're not free to choose both a rotation amount and mask + // start and end independently. We can choose an arbitrary mask start and + // end, but then the rotation amount is fixed. Rotation, however, can be + // inverted, and so by applying an "inverse" rotation first, we can get the + // desired result. + if (InstCnt) *InstCnt += 1; + + // The rotation mask for the second instruction must be MaskStart. + unsigned RLAmt2 = MaskStart; + // The first instruction must rotate V so that the overall rotation amount + // is RLAmt. + unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; + if (RLAmt1) + V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); + return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd); + } + + void SelectAndParts64(SDLoc dl, SDValue &Res, unsigned *InstCnt) { + if (BPermRewriterNoMasking) + return; + + // The idea here is the same as in the 32-bit version, but with additional + // complications from the fact that Repl32 might be true. Because we + // aggressively convert bit groups to Repl32 form (which, for small + // rotation factors, involves no other change), and then coalesce, it might + // be the case that a single 64-bit masking operation could handle both + // some Repl32 groups and some non-Repl32 groups. If converting to Repl32 + // form allowed coalescing, then we must use a 32-bit rotation in order to + // completely capture the new combined bit group.
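+ // For instance, four groups of the same V with RLAmt = 0 at bits [0, 3], + // [8, 11], [16, 19] and [24, 27] give Mask = 0x0F0F0F0F; andi. plus andis. + // plus an or (three instructions) beats inserting the four groups one at + // a time.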
+ + for (ValueRotInfo &VRI : ValueRotsVec) { + uint64_t Mask = 0; + + // We need to add to the mask all bits from the associated bit groups. + // If Repl32 is false, we need to add bits from bit groups that have + // Repl32 true, but are trivially convertable to Repl32 false. Such a + // group is trivially convertable if it overlaps only with the lower 32 + // bits, and the group has not been coalesced. + auto MatchingBG = [VRI](const BitGroup &BG) { + if (VRI.V != BG.V) + return false; + + unsigned EffRLAmt = BG.RLAmt; + if (!VRI.Repl32 && BG.Repl32) { + if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx && + !BG.Repl32Coalesced) { + if (BG.Repl32CR) + EffRLAmt += 32; + } else { + return false; + } + } else if (VRI.Repl32 != BG.Repl32) { + return false; + } + + return VRI.RLAmt == EffRLAmt; + }; + + for (auto &BG : BitGroups) { + if (!MatchingBG(BG)) + continue; + + if (BG.StartIdx <= BG.EndIdx) { + for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) + Mask |= (UINT64_C(1) << i); + } else { + for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) + Mask |= (UINT64_C(1) << i); + for (unsigned i = 0; i <= BG.EndIdx; ++i) + Mask |= (UINT64_C(1) << i); + } + } + + // We can use the 32-bit andi/andis technique if the mask does not + // require any higher-order bits. This can save an instruction compared + // to always using the general 64-bit technique. + bool Use32BitInsts = isUInt<32>(Mask); + // Compute the masks for andi/andis that would be necessary. + unsigned ANDIMask = (Mask & UINT16_MAX), + ANDISMask = (Mask >> 16) & UINT16_MAX; + + bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)); + + unsigned NumAndInsts = (unsigned) NeedsRotate + + (unsigned) (bool) Res; + if (Use32BitInsts) + NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) + + (unsigned) (ANDIMask != 0 && ANDISMask != 0); + else + NumAndInsts += SelectInt64Count(Mask) + /* and */ 1; + + unsigned NumRLInsts = 0; + bool FirstBG = true; + for (auto &BG : BitGroups) { + if (!MatchingBG(BG)) + continue; + NumRLInsts += + SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx, + !FirstBG); + FirstBG = false; + } + + DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << + " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") << + "\n\t\t\tisel using masking: " << NumAndInsts << + " using rotates: " << NumRLInsts << "\n"); + + // When we'd use andi/andis, we bias toward using the rotates (andi only + // has a record form, and is cracked on POWER cores). However, when using + // general 64-bit constant formation, bias toward the constant form, + // because that exposes more opportunities for CSE. + if (NumAndInsts > NumRLInsts) + continue; + if (Use32BitInsts && NumAndInsts == NumRLInsts) + continue; + + DEBUG(dbgs() << "\t\t\t\tusing masking\n"); + + if (InstCnt) *InstCnt += NumAndInsts; + + SDValue VRot; + // We actually need to generate a rotation if we have a non-zero rotation + // factor or, in the Repl32 case, if we care about any of the + // higher-order replicated bits. In the latter case, we generate a mask + // backward so that it actually includes the entire 64 bits. + if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask))) + VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, + VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 
30 : 63); + else + VRot = VRI.V; + + SDValue TotalVal; + if (Use32BitInsts) { + assert((ANDIMask != 0 || ANDISMask != 0) && + "No set bits in mask when using 32-bit ands for 64-bit value"); + + SDValue ANDIVal, ANDISVal; + if (ANDIMask != 0) + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, + VRot, getI32Imm(ANDIMask, dl)), 0); + if (ANDISMask != 0) + ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, + VRot, getI32Imm(ANDISMask, dl)), 0); + + if (!ANDIVal) + TotalVal = ANDISVal; + else if (!ANDISVal) + TotalVal = ANDIVal; + else + TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + ANDIVal, ANDISVal), 0); + } else { + TotalVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0); + TotalVal = + SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, + VRot, TotalVal), 0); + } + + if (!Res) + Res = TotalVal; + else + Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + Res, TotalVal), 0); + + // Now, remove all groups with this underlying value and rotation + // factor. + eraseMatchingBitGroups(MatchingBG); + } + } + + // Instruction selection for the 64-bit case. + SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) { + SDLoc dl(N); + SDValue Res; + + if (InstCnt) *InstCnt = 0; + + // Take care of cases that should use andi/andis first. + SelectAndParts64(dl, Res, InstCnt); + + // If we've not yet selected a 'starting' instruction, and we have no zeros + // to fill in, select the (Value, RLAmt) with the highest priority (largest + // number of groups), and start with this rotated value. + if ((!HasZeros || LateMask) && !Res) { + // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32 + // groups will come first, and so the VRI representing the largest number + // of groups might not be first (it might be the first Repl32 groups). + unsigned MaxGroupsIdx = 0; + if (!ValueRotsVec[0].Repl32) { + for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i) + if (ValueRotsVec[i].Repl32) { + if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups) + MaxGroupsIdx = i; + break; + } + } + + ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx]; + bool NeedsRotate = false; + if (VRI.RLAmt) { + NeedsRotate = true; + } else if (VRI.Repl32) { + for (auto &BG : BitGroups) { + if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt || + BG.Repl32 != VRI.Repl32) + continue; + + // We don't need a rotate if the bit group is confined to the lower + // 32 bits. + if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx) + continue; + + NeedsRotate = true; + break; + } + } + + if (NeedsRotate) + Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, + VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63, + InstCnt); + else + Res = VRI.V; + + // Now, remove all groups with this underlying value and rotation factor. + if (Res) + eraseMatchingBitGroups([VRI](const BitGroup &BG) { + return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt && + BG.Repl32 == VRI.Repl32; + }); + } + + // Because 64-bit rotates are more flexible than inserts, we might have a + // preference regarding which one we do first (to save one instruction). + if (!Res) + for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) { + if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, + false) < + SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, + true)) { + if (I != BitGroups.begin()) { + BitGroup BG = *I; + BitGroups.erase(I); + BitGroups.insert(BitGroups.begin(), BG); + } + + break; + } + } + + // Insert the other groups (one at a time). 
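+ // For example, a group {V, RLAmt = 16, bits [16, 31]} can be inserted with + // a single rldimi (SH = 16, MB = 64 - 31 - 1 = 32, ME implicitly 63 - SH); + // a group whose MaskStart differs from its RLAmt needs the two-instruction + // inverse-rotation form above.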
+    for (auto &BG : BitGroups) {
+      if (!Res)
+        Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx,
+                              BG.EndIdx, InstCnt);
+      else
+        Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32,
+                                 BG.StartIdx, BG.EndIdx, InstCnt);
+    }
+
+    if (LateMask) {
+      uint64_t Mask = getZerosMask();
+
+      // We can use the 32-bit andi/andis technique if the mask does not
+      // require any higher-order bits. This can save an instruction compared
+      // to always using the general 64-bit technique.
+      bool Use32BitInsts = isUInt<32>(Mask);
+      // Compute the masks for andi/andis that would be necessary.
+      unsigned ANDIMask = (Mask & UINT16_MAX),
+               ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+      if (Use32BitInsts) {
+        assert((ANDIMask != 0 || ANDISMask != 0) &&
+               "No set bits in mask when using 32-bit ands for 64-bit value");
+
+        if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+                                 (unsigned) (ANDISMask != 0) +
+                                 (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+        SDValue ANDIVal, ANDISVal;
+        if (ANDIMask != 0)
+          ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+                              Res, getI32Imm(ANDIMask, dl)), 0);
+        if (ANDISMask != 0)
+          ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+                               Res, getI32Imm(ANDISMask, dl)), 0);
+
+        if (!ANDIVal)
+          Res = ANDISVal;
+        else if (!ANDISVal)
+          Res = ANDIVal;
+        else
+          Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                          ANDIVal, ANDISVal), 0);
+      } else {
+        if (InstCnt) *InstCnt += SelectInt64Count(Mask) + /* and */ 1;
+
+        SDValue MaskVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0);
+        Res =
+          SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                         Res, MaskVal), 0);
+      }
+    }
+
+    return Res.getNode();
+  }
+
+  SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) {
+    // Fill in BitGroups.
+    collectBitGroups(LateMask);
+    if (BitGroups.empty())
+      return nullptr;
+
+    // For 64-bit values, figure out when we can use 32-bit instructions.
+    if (Bits.size() == 64)
+      assignRepl32BitGroups();
+
+    // Fill in ValueRotsVec.
+    collectValueRotInfo();
+
+    if (Bits.size() == 32) {
+      return Select32(N, LateMask, InstCnt);
+    } else {
+      assert(Bits.size() == 64 && "Not 64 bits here?");
+      return Select64(N, LateMask, InstCnt);
+    }
+
+    return nullptr;
+  }
+
+  void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) {
+    BitGroups.erase(std::remove_if(BitGroups.begin(), BitGroups.end(), F),
+                    BitGroups.end());
+  }
+
+  SmallVector<ValueBit, 64> Bits;
+
+  bool HasZeros;
+  SmallVector<unsigned, 64> RLAmt;
+
+  SmallVector<BitGroup, 16> BitGroups;
+
+  DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots;
+  SmallVector<ValueRotInfo, 16> ValueRotsVec;
+
+  SelectionDAG *CurDAG;
+
+public:
+  BitPermutationSelector(SelectionDAG *DAG)
+    : CurDAG(DAG) {}
+
+  // Here we try to match complex bit permutations into a set of
+  // rotate-and-shift/shift/and/or instructions, using a set of heuristics
+  // known to produce optimal code for common cases (like i32 byte swapping).
+  SDNode *Select(SDNode *N) {
+    Bits.resize(N->getValueType(0).getSizeInBits());
+    if (!getValueBits(SDValue(N, 0), Bits))
+      return nullptr;
+
+    DEBUG(dbgs() << "Considering bit-permutation-based instruction"
+                    " selection for: ");
+    DEBUG(N->dump(CurDAG));
+
+    // Fill in RLAmt and set HasZeros.
+    computeRotationAmounts();
+
+    if (!HasZeros)
+      return Select(N, false);
+
+    // We currently have two techniques for handling results with zeros: early
+    // masking (the default) and late masking.
Late masking is sometimes more + // efficient, but because the structure of the bit groups is different, it + // is hard to tell without generating both and comparing the results. With + // late masking, we ignore zeros in the resulting value when inserting each + // set of bit groups, and then mask in the zeros at the end. With early + // masking, we only insert the non-zero parts of the result at every step. + + unsigned InstCnt, InstCntLateMask; + DEBUG(dbgs() << "\tEarly masking:\n"); + SDNode *RN = Select(N, false, &InstCnt); + DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n"); + + DEBUG(dbgs() << "\tLate masking:\n"); + SDNode *RNLM = Select(N, true, &InstCntLateMask); + DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask << + " instructions\n"); + + if (InstCnt <= InstCntLateMask) { + DEBUG(dbgs() << "\tUsing early-masking for isel\n"); + return RN; + } + + DEBUG(dbgs() << "\tUsing late-masking for isel\n"); + return RNLM; + } +}; +} // anonymous namespace + +SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) { + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return nullptr; + + if (!UseBitPermRewriter) + return nullptr; + + switch (N->getOpcode()) { + default: break; + case ISD::ROTL: + case ISD::SHL: + case ISD::SRL: + case ISD::AND: + case ISD::OR: { + BitPermutationSelector BPS(CurDAG); + return BPS.Select(N); + } + } + + return nullptr; +} + +/// SelectCC - Select a comparison of the specified values with the specified +/// condition code, returning the CR# of the expression. +SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDLoc dl) { + // Always select the LHS. + unsigned Opc; + + if (LHS.getValueType() == MVT::i32) { + unsigned Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt32Immediate(RHS, Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. + if (isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt<16>((int)Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpw cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmplwi cr0,r0,0x5678 + // beq cr0,L6 + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS, + getI32Imm(Imm >> 16, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor, + getI32Imm(Imm & 0xFFFF, dl)), 0); + } + Opc = PPC::CMPLW; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), 0); + Opc = PPC::CMPLW; + } else { + short SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm((int)SImm & 0xFFFF, + dl)), + 0); + Opc = PPC::CMPW; + } + } else if (LHS.getValueType() == MVT::i64) { + uint64_t Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt64Immediate(RHS.getNode(), Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. 
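+        // (Illustrative values, not from the code: an equality test of an
+        // i64 value against 1234 folds directly to cmpldi, and against -5
+        // to cmpdi, with no constant materialization needed.)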
+ if (isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpd cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmpldi cr0,r0,0x5678 + // beq cr0,L6 + if (isUInt<32>(Imm)) { + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, + getI64Imm(Imm >> 16, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, + getI64Imm(Imm & 0xFFFF, dl)), + 0); + } + } + Opc = PPC::CMPLD; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI64Imm(Imm & 0xFFFF, dl)), 0); + Opc = PPC::CMPLD; + } else { + short SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI64Imm(SImm & 0xFFFF, dl)), + 0); + Opc = PPC::CMPD; + } + } else if (LHS.getValueType() == MVT::f32) { + Opc = PPC::FCMPUS; + } else { + assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); + Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; + } + return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); +} + +static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { + switch (CC) { + case ISD::SETUEQ: + case ISD::SETONE: + case ISD::SETOLE: + case ISD::SETOGE: + llvm_unreachable("Should be lowered by legalize!"); + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOEQ: + case ISD::SETEQ: return PPC::PRED_EQ; + case ISD::SETUNE: + case ISD::SETNE: return PPC::PRED_NE; + case ISD::SETOLT: + case ISD::SETLT: return PPC::PRED_LT; + case ISD::SETULE: + case ISD::SETLE: return PPC::PRED_LE; + case ISD::SETOGT: + case ISD::SETGT: return PPC::PRED_GT; + case ISD::SETUGE: + case ISD::SETGE: return PPC::PRED_GE; + case ISD::SETO: return PPC::PRED_NU; + case ISD::SETUO: return PPC::PRED_UN; + // These two are invalid for floating point. Assume we have int. + case ISD::SETULT: return PPC::PRED_LT; + case ISD::SETUGT: return PPC::PRED_GT; + } +} + +/// getCRIdxForSetCC - Return the index of the condition register field +/// associated with the SetCC condition, and whether or not the field is +/// treated as inverted. That is, lt = 0; ge = 0 inverted. 
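+/// For example, SETLT reads CR field bit 0 (lt) directly, while SETGE reads
+/// the same bit with Invert = true, since ge is just !lt.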
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { + Invert = false; + switch (CC) { + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOLT: + case ISD::SETLT: return 0; // Bit #0 = SETOLT + case ISD::SETOGT: + case ISD::SETGT: return 1; // Bit #1 = SETOGT + case ISD::SETOEQ: + case ISD::SETEQ: return 2; // Bit #2 = SETOEQ + case ISD::SETUO: return 3; // Bit #3 = SETUO + case ISD::SETUGE: + case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE + case ISD::SETULE: + case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE + case ISD::SETUNE: + case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE + case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO + case ISD::SETUEQ: + case ISD::SETOGE: + case ISD::SETOLE: + case ISD::SETONE: + llvm_unreachable("Invalid branch code: should be expanded by legalize"); + // These are invalid for floating point. Assume integer. + case ISD::SETULT: return 0; + case ISD::SETUGT: return 1; + } +} + +// getVCmpInst: return the vector compare instruction for the specified +// vector type and condition code. Since this is for altivec specific code, +// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). +static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, + bool HasVSX, bool &Swap, bool &Negate) { + Swap = false; + Negate = false; + + if (VecVT.isFloatingPoint()) { + /* Handle some cases by swapping input operands. */ + switch (CC) { + case ISD::SETLE: CC = ISD::SETGE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break; + case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. */ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break; + case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break; + case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETOEQ: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPEQDP; + break; + case ISD::SETGT: + case ISD::SETOGT: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGTDP; + break; + case ISD::SETGE: + case ISD::SETOGE: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGEDP; + break; + default: + break; + } + llvm_unreachable("Invalid floating-point vector compare condition"); + } else { + /* Handle some cases by swapping input operands. */ + switch (CC) { + case ISD::SETGE: CC = ISD::SETLE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. 
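+       (For instance, integer SETNE has no compare instruction of its own: it
+       is rewritten here as SETEQ with Negate set, and the caller then inverts
+       the result with a vnor (or xxlnor) of the compare result with itself.)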
*/ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break; + case ISD::SETLE: CC = ISD::SETGT; Negate = true; break; + case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETUEQ: + if (VecVT == MVT::v16i8) + return PPC::VCMPEQUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPEQUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPEQUW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPEQUD; + break; + case ISD::SETGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTSB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTSH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTSW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPGTSD; + break; + case ISD::SETUGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTUW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPGTUD; + break; + default: + break; + } + llvm_unreachable("Invalid integer vector compare condition"); + } +} + +SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { + SDLoc dl(N); + unsigned Imm; + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); + bool isPPC64 = (PtrVT == MVT::i64); + + if (!PPCSubTarget->useCRBits() && + isInt32Immediate(N->getOperand(1), Imm)) { + // We can codegen setcc op, imm very efficiently compared to a brcond. + // Check for those cases here. + // setcc op, 0 + if (Imm == 0) { + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: { + Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); + SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl), + getI32Imm(31, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + case ISD::SETNE: { + if (isPPC64) break; + SDValue AD = + SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U, dl)), 0); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, + AD.getValue(1)); + } + case ISD::SETLT: { + SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + case ISD::SETGT: { + SDValue T = + SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); + T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); + SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + } + } else if (Imm == ~0U) { // setcc op, -1 + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(1, dl)), 0); + return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDValue(CurDAG->getMachineNode(PPC::LI, dl, + MVT::i32, + getI32Imm(0, dl)), + 0), Op.getValue(1)); + case ISD::SETNE: { + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0); + SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U, dl)); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), + Op, SDValue(AD, 1)); + } + case ISD::SETLT: { + SDValue AD = 
SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op, + getI32Imm(1, dl)), 0); + SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, + Op), 0); + SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + case ISD::SETGT: { + SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, + getI32Imm(1, dl)); + } + } + } + } + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Altivec Vector compare instructions do not set any CR register by default and + // vector compare operations return the same type as the operands. + if (LHS.getValueType().isVector()) { + if (PPCSubTarget->hasQPX()) + return nullptr; + + EVT VecVT = LHS.getValueType(); + bool Swap, Negate; + unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC, + PPCSubTarget->hasVSX(), Swap, Negate); + if (Swap) + std::swap(LHS, RHS); + + EVT ResVT = VecVT.changeVectorElementTypeToInteger(); + if (Negate) { + SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : + PPC::VNOR, + ResVT, VCmp, VCmp); + } + + return CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); + } + + if (PPCSubTarget->useCRBits()) + return nullptr; + + bool Inv; + unsigned Idx = getCRIdxForSetCC(CC, Inv); + SDValue CCReg = SelectCC(LHS, RHS, CC, dl); + SDValue IntCR; + + // Force the ccreg into CR7. + SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); + + SDValue InFlag(nullptr, 0); // Null incoming flag value. + CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, + InFlag).getValue(1); + + IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, + CCReg), 0); + + SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl), + getI32Imm(31, dl), getI32Imm(31, dl) }; + if (!Inv) + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + + // Get the specified bit. + SDValue Tmp = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl)); +} + +SDNode *PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); + return Result; +} + + +// Select - Convert the specified operand from a target-independent to a +// target-specific node if it hasn't already been changed. +SDNode *PPCDAGToDAGISel::Select(SDNode *N) { + SDLoc dl(N); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. + } + + // In case any misguided DAG-level optimizations form an ADD with a + // TargetConstant operand, crash here instead of miscompiling (by selecting + // an r+r add instead of some kind of r+i add). + if (N->getOpcode() == ISD::ADD && + N->getOperand(1).getOpcode() == ISD::TargetConstant) + llvm_unreachable("Invalid ADD with TargetConstant operand"); + + // Try matching complex bit permutations before doing anything else. 
+ if (SDNode *NN = SelectBitPermutation(N)) + return NN; + + switch (N->getOpcode()) { + default: break; + + case ISD::Constant: { + if (N->getValueType(0) == MVT::i64) + return SelectInt64(CurDAG, N); + break; + } + + case ISD::SETCC: { + SDNode *SN = SelectSETCC(N); + if (SN) + return SN; + break; + } + case PPCISD::GlobalBaseReg: + return getGlobalBaseReg(); + + case ISD::FrameIndex: + return getFrameIndex(N, N); + + case PPCISD::MFOCRF: { + SDValue InFlag = N->getOperand(1); + return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, + N->getOperand(0), InFlag); + } + + case PPCISD::READ_TIME_BASE: { + return CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32, + MVT::Other, N->getOperand(0)); + } + + case PPCISD::SRA_ADDZE: { + SDValue N0 = N->getOperand(0); + SDValue ShiftAmt = + CurDAG->getTargetConstant(*cast<ConstantSDNode>(N->getOperand(1))-> + getConstantIntValue(), dl, + N->getValueType(0)); + if (N->getValueType(0) == MVT::i64) { + SDNode *Op = + CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, MVT::Glue, + N0, ShiftAmt); + return CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64, + SDValue(Op, 0), SDValue(Op, 1)); + } else { + assert(N->getValueType(0) == MVT::i32 && + "Expecting i64 or i32 in PPCISD::SRA_ADDZE"); + SDNode *Op = + CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue, + N0, ShiftAmt); + return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDValue(Op, 0), SDValue(Op, 1)); + } + } + + case ISD::LOAD: { + // Handle preincrement loads. + LoadSDNode *LD = cast<LoadSDNode>(N); + EVT LoadedVT = LD->getMemoryVT(); + + // Normal loads are handled by code generated from the .td file. + if (LD->getAddressingMode() != ISD::PRE_INC) + break; + + SDValue Offset = LD->getOffset(); + if (Offset.getOpcode() == ISD::TargetConstant || + Offset.getOpcode() == ISD::TargetGlobalAddress) { + + unsigned Opcode; + bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; + if (LD->getValueType(0) != MVT::i64) { + // Handle PPC32 integer and normal FP loads. + assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::f64: Opcode = PPC::LFDU; break; + case MVT::f32: Opcode = PPC::LFSU; break; + case MVT::i32: Opcode = PPC::LWZU; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZU; break; + } + } else { + assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); + assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::i64: Opcode = PPC::LDU; break; + case MVT::i32: Opcode = PPC::LWZU8; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZU8; break; + } + } + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = { Offset, Base, Chain }; + return transferMemOperands( + N, CurDAG->getMachineNode( + Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, + Ops)); + } else { + unsigned Opcode; + bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; + if (LD->getValueType(0) != MVT::i64) { + // Handle PPC32 integer and normal FP loads. 
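+        // (For instance, a pre-increment f64 load with a register offset
+        // selects LFDUX below, and a sign-extending i16 one selects LHAUX.)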
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX + case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX + case MVT::f64: Opcode = PPC::LFDUX; break; + case MVT::f32: Opcode = PPC::LFSUX; break; + case MVT::i32: Opcode = PPC::LWZUX; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX; break; + } + } else { + assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); + assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) && + "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::i64: Opcode = PPC::LDUX; break; + case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX8; break; + } + } + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = { Base, Offset, Chain }; + return transferMemOperands( + N, CurDAG->getMachineNode( + Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, + Ops)); + } + } + + case ISD::AND: { + unsigned Imm, Imm2, SH, MB, ME; + uint64_t Imm64; + + // If this is an and of a value rotated between 0 and 31 bits and then and'd + // with a mask, emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm) && + isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) { + SDValue Val = N->getOperand(0).getOperand(0); + SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + // If this is just a masked value where the input is not handled above, and + // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm) && + isRunOfOnes(Imm, MB, ME) && + N->getOperand(0).getOpcode() != ISD::ROTL) { + SDValue Val = N->getOperand(0); + SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + // If this is a 64-bit zero-extension mask, emit rldicl. + if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && + isMask_64(Imm64)) { + SDValue Val = N->getOperand(0); + MB = 64 - countTrailingOnes(Imm64); + SH = 0; + + // If the operand is a logical right shift, we can fold it into this + // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) + // for n <= mb. The right shift is really a left rotate followed by a + // mask, and this mask is a more-restrictive sub-mask of the mask implied + // by the shift. + if (Val.getOpcode() == ISD::SRL && + isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) { + assert(Imm < 64 && "Illegal shift amount"); + Val = Val.getOperand(0); + SH = 64 - Imm; + } + + SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); + } + // AND X, 0 -> 0, not "rlwinm 32". + if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { + ReplaceUses(SDValue(N, 0), N->getOperand(1)); + return nullptr; + } + // ISD::OR doesn't get all the bitfield insertion fun. + // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a + // bitfield insert. 
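+    // For example (illustrative constants): with c1 = 0xFF00 and c2 = 0xFFFF,
+    // m = ~(c1 ^ c2) = 0xFFFFFF00 is a run of ones, and there is no bit with
+    // c1 = 1 but c2 = 0, so the pair can be emitted as a single rlwimi
+    // computing (c1 & m) | (x & ~m).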
+ if (isInt32Immediate(N->getOperand(1), Imm) && + N->getOperand(0).getOpcode() == ISD::OR && + isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) { + // The idea here is to check whether this is equivalent to: + // (c1 & m) | (x & ~m) + // where m is a run-of-ones mask. The logic here is that, for each bit in + // c1 and c2: + // - if both are 1, then the output will be 1. + // - if both are 0, then the output will be 0. + // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will + // come from x. + // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will + // be 0. + // If that last condition is never the case, then we can form m from the + // bits that are the same between c1 and c2. + unsigned MB, ME; + if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) { + SDValue Ops[] = { N->getOperand(0).getOperand(0), + N->getOperand(0).getOperand(1), + getI32Imm(0, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops); + } + } + + // Other cases are autogenerated. + break; + } + case ISD::OR: { + if (N->getValueType(0) == MVT::i32) + if (SDNode *I = SelectBitfieldInsert(N)) + return I; + + short Imm; + if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && + isIntS16Immediate(N->getOperand(1), Imm)) { + APInt LHSKnownZero, LHSKnownOne; + CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne); + + // If this is equivalent to an add, then we can fold it with the + // FrameIndex calculation. + if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) + return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); + } + + // Other cases are autogenerated. + break; + } + case ISD::ADD: { + short Imm; + if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && + isIntS16Immediate(N->getOperand(1), Imm)) + return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); + + break; + } + case ISD::SHL: { + unsigned Imm, SH, MB, ME; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) && + isRotateAndMask(N, Imm, true, SH, MB, ME)) { + SDValue Ops[] = { N->getOperand(0).getOperand(0), + getI32Imm(SH, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + + // Other cases are autogenerated. + break; + } + case ISD::SRL: { + unsigned Imm, SH, MB, ME; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) && + isRotateAndMask(N, Imm, true, SH, MB, ME)) { + SDValue Ops[] = { N->getOperand(0).getOperand(0), + getI32Imm(SH, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + + // Other cases are autogenerated. + break; + } + // FIXME: Remove this once the ANDI glue bug is fixed: + case PPCISD::ANDIo_1_EQ_BIT: + case PPCISD::ANDIo_1_GT_BIT: { + if (!ANDIGlueBug) + break; + + EVT InVT = N->getOperand(0).getValueType(); + assert((InVT == MVT::i64 || InVT == MVT::i32) && + "Invalid input type for ANDIo_1_EQ_BIT"); + + unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo; + SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue, + N->getOperand(0), + CurDAG->getTargetConstant(1, dl, InVT)), + 0); + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue SRIdxVal = + CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ? 
+ PPC::sub_eq : PPC::sub_gt, dl, MVT::i32); + + return CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, + CR0Reg, SRIdxVal, + SDValue(AndI.getNode(), 1) /* glue */); + } + case ISD::SELECT_CC: { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); + bool isPPC64 = (PtrVT == MVT::i64); + + // If this is a select of i1 operands, we'll pattern match it. + if (PPCSubTarget->useCRBits() && + N->getOperand(0).getValueType() == MVT::i1) + break; + + // Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc + if (!isPPC64) + if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1))) + if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3))) + if (N1C->isNullValue() && N3C->isNullValue() && + N2C->getZExtValue() == 1ULL && CC == ISD::SETNE && + // FIXME: Implement this optzn for PPC64. + N->getValueType(0) == MVT::i32) { + SDNode *Tmp = + CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + N->getOperand(0), getI32Imm(~0U, dl)); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, + SDValue(Tmp, 0), N->getOperand(0), + SDValue(Tmp, 1)); + } + + SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl); + + if (N->getValueType(0) == MVT::i1) { + // An i1 select is: (c & t) | (!c & f). + bool Inv; + unsigned Idx = getCRIdxForSetCC(CC, Inv); + + unsigned SRI; + switch (Idx) { + default: llvm_unreachable("Invalid CC index"); + case 0: SRI = PPC::sub_lt; break; + case 1: SRI = PPC::sub_gt; break; + case 2: SRI = PPC::sub_eq; break; + case 3: SRI = PPC::sub_un; break; + } + + SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg); + + SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1, + CCBit, CCBit), 0); + SDValue C = Inv ? NotCCBit : CCBit, + NotC = Inv ? 
CCBit : NotCCBit; + + SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1, + C, N->getOperand(2)), 0); + SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1, + NotC, N->getOperand(3)), 0); + + return CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF); + } + + unsigned BROpc = getPredicateForSetCC(CC); + + unsigned SelectCCOp; + if (N->getValueType(0) == MVT::i32) + SelectCCOp = PPC::SELECT_CC_I4; + else if (N->getValueType(0) == MVT::i64) + SelectCCOp = PPC::SELECT_CC_I8; + else if (N->getValueType(0) == MVT::f32) + if (PPCSubTarget->hasP8Vector()) + SelectCCOp = PPC::SELECT_CC_VSSRC; + else + SelectCCOp = PPC::SELECT_CC_F4; + else if (N->getValueType(0) == MVT::f64) + if (PPCSubTarget->hasVSX()) + SelectCCOp = PPC::SELECT_CC_VSFRC; + else + SelectCCOp = PPC::SELECT_CC_F8; + else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64) + SelectCCOp = PPC::SELECT_CC_QFRC; + else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32) + SelectCCOp = PPC::SELECT_CC_QSRC; + else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1) + SelectCCOp = PPC::SELECT_CC_QBRC; + else if (N->getValueType(0) == MVT::v2f64 || + N->getValueType(0) == MVT::v2i64) + SelectCCOp = PPC::SELECT_CC_VSRC; + else + SelectCCOp = PPC::SELECT_CC_VRRC; + + SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3), + getI32Imm(BROpc, dl) }; + return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops); + } + case ISD::VSELECT: + if (PPCSubTarget->hasVSX()) { + SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) }; + return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops); + } + + break; + case ISD::VECTOR_SHUFFLE: + if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || + N->getValueType(0) == MVT::v2i64)) { + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); + + SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1), + Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); + unsigned DM[2]; + + for (int i = 0; i < 2; ++i) + if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2) + DM[i] = 0; + else + DM[i] = 1; + + if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 && + Op1.getOpcode() == ISD::SCALAR_TO_VECTOR && + isa<LoadSDNode>(Op1.getOperand(0))) { + LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0)); + SDValue Base, Offset; + + if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() && + (LD->getMemoryVT() == MVT::f64 || + LD->getMemoryVT() == MVT::i64) && + SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { + SDValue Chain = LD->getChain(); + SDValue Ops[] = { Base, Offset, Chain }; + return CurDAG->SelectNodeTo(N, PPC::LXVDSX, + N->getValueType(0), Ops); + } + } + + // For little endian, we must swap the input operands and adjust + // the mask elements (reverse and invert them). + if (PPCSubTarget->isLittleEndian()) { + std::swap(Op1, Op2); + unsigned tmp = DM[0]; + DM[0] = 1 - DM[1]; + DM[1] = 1 - tmp; + } + + SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), dl, + MVT::i32); + SDValue Ops[] = { Op1, Op2, DMV }; + return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops); + } + + break; + case PPCISD::BDNZ: + case PPCISD::BDZ: { + bool IsPPC64 = PPCSubTarget->isPPC64(); + SDValue Ops[] = { N->getOperand(1), N->getOperand(0) }; + return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ? + (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (IsPPC64 ? PPC::BDZ8 : PPC::BDZ), + MVT::Other, Ops); + } + case PPCISD::COND_BRANCH: { + // Op #0 is the Chain. 
+ // Op #1 is the PPC::PRED_* number. + // Op #2 is the CR# + // Op #3 is the Dest MBB + // Op #4 is the Flag. + // Prevent PPC::PRED_* from being selected into LI. + unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3)); + + SDValue Pred = getI32Imm(PCC, dl); + SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), + N->getOperand(0), N->getOperand(4) }; + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); + } + case ISD::BR_CC: { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + unsigned PCC = getPredicateForSetCC(CC); + + if (N->getOperand(2).getValueType() == MVT::i1) { + unsigned Opc; + bool Swap; + switch (PCC) { + default: llvm_unreachable("Unexpected Boolean-operand predicate"); + case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true; break; + case PPC::PRED_LE: Opc = PPC::CRORC; Swap = true; break; + case PPC::PRED_EQ: Opc = PPC::CREQV; Swap = false; break; + case PPC::PRED_GE: Opc = PPC::CRORC; Swap = false; break; + case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break; + case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break; + } + + SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1, + N->getOperand(Swap ? 3 : 2), + N->getOperand(Swap ? 2 : 3)), 0); + return CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other, + BitComp, N->getOperand(4), N->getOperand(0)); + } + + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4)); + + SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); + SDValue Ops[] = { getI32Imm(PCC, dl), CondCode, + N->getOperand(4), N->getOperand(0) }; + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); + } + case ISD::BRIND: { + // FIXME: Should custom lower this. + SDValue Chain = N->getOperand(0); + SDValue Target = N->getOperand(1); + unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8; + unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8; + Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target, + Chain), 0); + return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); + } + case PPCISD::TOC_ENTRY: { + assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) && + "Only supported for 64-bit ABI and 32-bit SVR4"); + if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) { + SDValue GA = N->getOperand(0); + return transferMemOperands(N, CurDAG->getMachineNode(PPC::LWZtoc, dl, + MVT::i32, GA, N->getOperand(1))); + } + + // For medium and large code model, we generate two instructions as + // described below. Otherwise we allow SelectCodeCommon to handle this, + // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. + CodeModel::Model CModel = TM.getCodeModel(); + if (CModel != CodeModel::Medium && CModel != CodeModel::Large) + break; + + // The first source operand is a TargetGlobalAddress or a TargetJumpTable. 
+ // If it must be toc-referenced according to PPCSubTarget, we generate: + // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>)) + // Otherwise we generate: + // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>) + SDValue GA = N->getOperand(0); + SDValue TOCbase = N->getOperand(1); + SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, + TOCbase, GA); + + if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) || + CModel == CodeModel::Large) + return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl, + MVT::i64, GA, SDValue(Tmp, 0))); + + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { + const GlobalValue *GV = G->getGlobal(); + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { + return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl, + MVT::i64, GA, SDValue(Tmp, 0))); + } + } + + return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, + SDValue(Tmp, 0), GA); + } + case PPCISD::PPC32_PICGOT: { + // Generate a PIC-safe GOT reference. + assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && + "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); + return CurDAG->SelectNodeTo( + N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(CurDAG->getDataLayout()), + MVT::i32); + } + case PPCISD::VADD_SPLAT: { + // This expands into one of three sequences, depending on whether + // the first operand is odd or even, positive or negative. + assert(isa<ConstantSDNode>(N->getOperand(0)) && + isa<ConstantSDNode>(N->getOperand(1)) && + "Invalid operand on VADD_SPLAT!"); + + int Elt = N->getConstantOperandVal(0); + int EltSize = N->getConstantOperandVal(1); + unsigned Opc1, Opc2, Opc3; + EVT VT; + + if (EltSize == 1) { + Opc1 = PPC::VSPLTISB; + Opc2 = PPC::VADDUBM; + Opc3 = PPC::VSUBUBM; + VT = MVT::v16i8; + } else if (EltSize == 2) { + Opc1 = PPC::VSPLTISH; + Opc2 = PPC::VADDUHM; + Opc3 = PPC::VSUBUHM; + VT = MVT::v8i16; + } else { + assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!"); + Opc1 = PPC::VSPLTISW; + Opc2 = PPC::VADDUWM; + Opc3 = PPC::VSUBUWM; + VT = MVT::v4i32; + } + + if ((Elt & 1) == 0) { + // Elt is even, in the range [-32,-18] + [16,30]. + // + // Convert: VADD_SPLAT elt, size + // Into: tmp = VSPLTIS[BHW] elt + // VADDU[BHW]M tmp, tmp + // Where: [BHW] = B for size = 1, H for size = 2, W for size = 4 + SDValue EltVal = getI32Imm(Elt >> 1, dl); + SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + SDValue TmpVal = SDValue(Tmp, 0); + return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal); + + } else if (Elt > 0) { + // Elt is odd and positive, in the range [17,31]. + // + // Convert: VADD_SPLAT elt, size + // Into: tmp1 = VSPLTIS[BHW] elt-16 + // tmp2 = VSPLTIS[BHW] -16 + // VSUBU[BHW]M tmp1, tmp2 + SDValue EltVal = getI32Imm(Elt - 16, dl); + SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + EltVal = getI32Imm(-16, dl); + SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0), + SDValue(Tmp2, 0)); + + } else { + // Elt is odd and negative, in the range [-31,-17]. 
+ // + // Convert: VADD_SPLAT elt, size + // Into: tmp1 = VSPLTIS[BHW] elt+16 + // tmp2 = VSPLTIS[BHW] -16 + // VADDU[BHW]M tmp1, tmp2 + SDValue EltVal = getI32Imm(Elt + 16, dl); + SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + EltVal = getI32Imm(-16, dl); + SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0), + SDValue(Tmp2, 0)); + } + } + } + + return SelectCode(N); +} + +// If the target supports the cmpb instruction, do the idiom recognition here. +// We don't do this as a DAG combine because we don't want to do it as nodes +// are being combined (because we might miss part of the eventual idiom). We +// don't want to do it during instruction selection because we want to reuse +// the logic for lowering the masking operations already part of the +// instruction selector. +SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { + SDLoc dl(N); + + assert(N->getOpcode() == ISD::OR && + "Only OR nodes are supported for CMPB"); + + SDValue Res; + if (!PPCSubTarget->hasCMPB()) + return Res; + + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return Res; + + EVT VT = N->getValueType(0); + + SDValue RHS, LHS; + bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + uint64_t Mask = 0, Alt = 0; + + auto IsByteSelectCC = [this](SDValue O, unsigned &b, + uint64_t &Mask, uint64_t &Alt, + SDValue &LHS, SDValue &RHS) { + if (O.getOpcode() != ISD::SELECT_CC) + return false; + ISD::CondCode CC = cast<CondCodeSDNode>(O.getOperand(4))->get(); + + if (!isa<ConstantSDNode>(O.getOperand(2)) || + !isa<ConstantSDNode>(O.getOperand(3))) + return false; + + uint64_t PM = O.getConstantOperandVal(2); + uint64_t PAlt = O.getConstantOperandVal(3); + for (b = 0; b < 8; ++b) { + uint64_t Mask = UINT64_C(0xFF) << (8*b); + if (PM && (PM & Mask) == PM && (PAlt & Mask) == PAlt) + break; + } + + if (b == 8) + return false; + Mask |= PM; + Alt |= PAlt; + + if (!isa<ConstantSDNode>(O.getOperand(1)) || + O.getConstantOperandVal(1) != 0) { + SDValue Op0 = O.getOperand(0), Op1 = O.getOperand(1); + if (Op0.getOpcode() == ISD::TRUNCATE) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::TRUNCATE) + Op1 = Op1.getOperand(0); + + if (Op0.getOpcode() == ISD::SRL && Op1.getOpcode() == ISD::SRL && + Op0.getOperand(1) == Op1.getOperand(1) && CC == ISD::SETEQ && + isa<ConstantSDNode>(Op0.getOperand(1))) { + + unsigned Bits = Op0.getValueType().getSizeInBits(); + if (b != Bits/8-1) + return false; + if (Op0.getConstantOperandVal(1) != Bits-8) + return false; + + LHS = Op0.getOperand(0); + RHS = Op1.getOperand(0); + return true; + } + + // When we have small integers (i16 to be specific), the form present + // post-legalization uses SETULT in the SELECT_CC for the + // higher-order byte, depending on the fact that the + // even-higher-order bytes are known to all be zero, for example: + // select_cc (xor $lhs, $rhs), 256, 65280, 0, setult + // (so when the second byte is the same, because all higher-order + // bits from bytes 3 and 4 are known to be zero, the result of the + // xor can be at most 255) + if (Op0.getOpcode() == ISD::XOR && CC == ISD::SETULT && + isa<ConstantSDNode>(O.getOperand(1))) { + + uint64_t ULim = O.getConstantOperandVal(1); + if (ULim != (UINT64_C(1) << b*8)) + return false; + + // Now we need to make sure that the upper bytes are known to be + // zero. 
+        unsigned Bits = Op0.getValueType().getSizeInBits();
+        if (!CurDAG->MaskedValueIsZero(Op0,
+              APInt::getHighBitsSet(Bits, Bits - (b+1)*8)))
+          return false;
+
+        LHS = Op0.getOperand(0);
+        RHS = Op0.getOperand(1);
+        return true;
+      }
+
+      return false;
+    }
+
+    if (CC != ISD::SETEQ)
+      return false;
+
+    SDValue Op = O.getOperand(0);
+    if (Op.getOpcode() == ISD::AND) {
+      if (!isa<ConstantSDNode>(Op.getOperand(1)))
+        return false;
+      if (Op.getConstantOperandVal(1) != (UINT64_C(0xFF) << (8*b)))
+        return false;
+
+      SDValue XOR = Op.getOperand(0);
+      if (XOR.getOpcode() == ISD::TRUNCATE)
+        XOR = XOR.getOperand(0);
+      if (XOR.getOpcode() != ISD::XOR)
+        return false;
+
+      LHS = XOR.getOperand(0);
+      RHS = XOR.getOperand(1);
+      return true;
+    } else if (Op.getOpcode() == ISD::SRL) {
+      if (!isa<ConstantSDNode>(Op.getOperand(1)))
+        return false;
+      unsigned Bits = Op.getValueType().getSizeInBits();
+      if (b != Bits/8-1)
+        return false;
+      if (Op.getConstantOperandVal(1) != Bits-8)
+        return false;
+
+      SDValue XOR = Op.getOperand(0);
+      if (XOR.getOpcode() == ISD::TRUNCATE)
+        XOR = XOR.getOperand(0);
+      if (XOR.getOpcode() != ISD::XOR)
+        return false;
+
+      LHS = XOR.getOperand(0);
+      RHS = XOR.getOperand(1);
+      return true;
+    }
+
+    return false;
+  };
+
+  SmallVector<SDValue, 8> Queue(1, SDValue(N, 0));
+  while (!Queue.empty()) {
+    SDValue V = Queue.pop_back_val();
+
+    for (const SDValue &O : V.getNode()->ops()) {
+      unsigned b;
+      uint64_t M = 0, A = 0;
+      SDValue OLHS, ORHS;
+      if (O.getOpcode() == ISD::OR) {
+        Queue.push_back(O);
+      } else if (IsByteSelectCC(O, b, M, A, OLHS, ORHS)) {
+        if (!LHS) {
+          LHS = OLHS;
+          RHS = ORHS;
+          BytesFound[b] = true;
+          Mask |= M;
+          Alt |= A;
+        } else if ((LHS == ORHS && RHS == OLHS) ||
+                   (RHS == ORHS && LHS == OLHS)) {
+          BytesFound[b] = true;
+          Mask |= M;
+          Alt |= A;
+        } else {
+          return Res;
+        }
+      } else {
+        return Res;
+      }
+    }
+  }
+
+  unsigned LastB = 0, BCnt = 0;
+  for (unsigned i = 0; i < 8; ++i)
+    if (BytesFound[LastB]) {
+      ++BCnt;
+      LastB = i;
+    }
+
+  if (!LastB || BCnt < 2)
+    return Res;
+
+  // Because we'll be zero-extending the output anyway if we don't have a
+  // specific value for each input byte (via the Mask), we can 'anyext' the
+  // inputs.
+  if (LHS.getValueType() != VT) {
+    LHS = CurDAG->getAnyExtOrTrunc(LHS, dl, VT);
+    RHS = CurDAG->getAnyExtOrTrunc(RHS, dl, VT);
+  }
+
+  Res = CurDAG->getNode(PPCISD::CMPB, dl, VT, LHS, RHS);
+
+  bool NonTrivialMask = ((int64_t) Mask) != INT64_C(-1);
+  if (NonTrivialMask && !Alt) {
+    // Res = Mask & CMPB
+    Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
+                          CurDAG->getConstant(Mask, dl, VT));
+  } else if (Alt) {
+    // Res = (CMPB & Mask) | (~CMPB & Alt)
+    // Which, as suggested here:
+    // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
+    // can be written as:
+    // Res = Alt ^ ((Alt ^ Mask) & CMPB)
+    // useful because the (Alt ^ Mask) can be pre-computed.
+    Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
+                          CurDAG->getConstant(Mask ^ Alt, dl, VT));
+    Res = CurDAG->getNode(ISD::XOR, dl, VT, Res,
+                          CurDAG->getConstant(Alt, dl, VT));
+  }
+
+  return Res;
+}
+
+// When CR bit registers are enabled, an extension of an i1 variable to an i32
+// or i64 value is lowered in terms of a SELECT_I[48] operation, and thus
+// involves constant materialization of a 0 or a 1 or both. If the result of
+// the extension is then operated upon by some operator that can be constant
+// folded with a constant 0 or 1, and that constant can be materialized using
+// only one instruction (like a zero or one), then we should fold in those
+// operations with the select.
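+// For example (made-up values): or (zext i1 %c to i32), 2 folds to
+// select %c, 3, 2; both constants fit in a signed 16-bit immediate, and the
+// new select may then fold again with its own user.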
+void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) { + if (!PPCSubTarget->useCRBits()) + return; + + if (N->getOpcode() != ISD::ZERO_EXTEND && + N->getOpcode() != ISD::SIGN_EXTEND && + N->getOpcode() != ISD::ANY_EXTEND) + return; + + if (N->getOperand(0).getValueType() != MVT::i1) + return; + + if (!N->hasOneUse()) + return; + + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Cond = N->getOperand(0); + SDValue ConstTrue = + CurDAG->getConstant(N->getOpcode() == ISD::SIGN_EXTEND ? -1 : 1, dl, VT); + SDValue ConstFalse = CurDAG->getConstant(0, dl, VT); + + do { + SDNode *User = *N->use_begin(); + if (User->getNumOperands() != 2) + break; + + auto TryFold = [this, N, User, dl](SDValue Val) { + SDValue UserO0 = User->getOperand(0), UserO1 = User->getOperand(1); + SDValue O0 = UserO0.getNode() == N ? Val : UserO0; + SDValue O1 = UserO1.getNode() == N ? Val : UserO1; + + return CurDAG->FoldConstantArithmetic(User->getOpcode(), dl, + User->getValueType(0), + O0.getNode(), O1.getNode()); + }; + + SDValue TrueRes = TryFold(ConstTrue); + if (!TrueRes) + break; + SDValue FalseRes = TryFold(ConstFalse); + if (!FalseRes) + break; + + // For us to materialize these using one instruction, we must be able to + // represent them as signed 16-bit integers. + uint64_t True = cast<ConstantSDNode>(TrueRes)->getZExtValue(), + False = cast<ConstantSDNode>(FalseRes)->getZExtValue(); + if (!isInt<16>(True) || !isInt<16>(False)) + break; + + // We can replace User with a new SELECT node, and try again to see if we + // can fold the select with its user. + Res = CurDAG->getSelect(dl, User->getValueType(0), Cond, TrueRes, FalseRes); + N = User; + ConstTrue = TrueRes; + ConstFalse = FalseRes; + } while (N->hasOneUse()); +} + +void PPCDAGToDAGISel::PreprocessISelDAG() { + SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); + ++Position; + + bool MadeChange = false; + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = &*--Position; + if (N->use_empty()) + continue; + + SDValue Res; + switch (N->getOpcode()) { + default: break; + case ISD::OR: + Res = combineToCMPB(N); + break; + } + + if (!Res) + foldBoolExts(Res, N); + + if (Res) { + DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: "); + DEBUG(N->dump(CurDAG)); + DEBUG(dbgs() << "\nNew: "); + DEBUG(Res.getNode()->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + MadeChange = true; + } + } + + if (MadeChange) + CurDAG->RemoveDeadNodes(); +} + +/// PostprocessISelDAG - Perform some late peephole optimizations +/// on the DAG representation. +void PPCDAGToDAGISel::PostprocessISelDAG() { + + // Skip peepholes at -O0. + if (TM.getOptLevel() == CodeGenOpt::None) + return; + + PeepholePPC64(); + PeepholeCROps(); + PeepholePPC64ZExt(); +} + +// Check if all users of this node will become isel where the second operand +// is the constant zero. If this is so, and if we can negate the condition, +// then we can flip the true and false operands. This will allow the zero to +// be folded with the isel so that we don't need to materialize a register +// containing zero. +bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { + // If we're not using isel, then this does not matter. 
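+  // (Background note, not from this file: isel treats an RA field of 0 as the
+  // literal value 0, so moving the zero into the true-operand slot, with the
+  // condition negated, lets it be encoded without a separate li.)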
+ if (!PPCSubTarget->hasISEL()) + return false; + + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (!User->isMachineOpcode()) + return false; + if (User->getMachineOpcode() != PPC::SELECT_I4 && + User->getMachineOpcode() != PPC::SELECT_I8) + return false; + + SDNode *Op2 = User->getOperand(2).getNode(); + if (!Op2->isMachineOpcode()) + return false; + + if (Op2->getMachineOpcode() != PPC::LI && + Op2->getMachineOpcode() != PPC::LI8) + return false; + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0)); + if (!C) + return false; + + if (!C->isNullValue()) + return false; + } + + return true; +} + +void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) { + SmallVector<SDNode *, 4> ToReplace; + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + assert((User->getMachineOpcode() == PPC::SELECT_I4 || + User->getMachineOpcode() == PPC::SELECT_I8) && + "Must have all select users"); + ToReplace.push_back(User); + } + + for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(), + UE = ToReplace.end(); UI != UE; ++UI) { + SDNode *User = *UI; + SDNode *ResNode = + CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User), + User->getValueType(0), User->getOperand(0), + User->getOperand(2), + User->getOperand(1)); + + DEBUG(dbgs() << "CR Peephole replacing:\nOld: "); + DEBUG(User->dump(CurDAG)); + DEBUG(dbgs() << "\nNew: "); + DEBUG(ResNode->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + ReplaceUses(User, ResNode); + } +} + +void PPCDAGToDAGISel::PeepholeCROps() { + bool IsModified; + do { + IsModified = false; + for (SDNode &Node : CurDAG->allnodes()) { + MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node); + if (!MachineNode || MachineNode->use_empty()) + continue; + SDNode *ResNode = MachineNode; + + bool Op1Set = false, Op1Unset = false, + Op1Not = false, + Op2Set = false, Op2Unset = false, + Op2Not = false; + + unsigned Opcode = MachineNode->getMachineOpcode(); + switch (Opcode) { + default: break; + case PPC::CRAND: + case PPC::CRNAND: + case PPC::CROR: + case PPC::CRXOR: + case PPC::CRNOR: + case PPC::CREQV: + case PPC::CRANDC: + case PPC::CRORC: { + SDValue Op = MachineNode->getOperand(1); + if (Op.isMachineOpcode()) { + if (Op.getMachineOpcode() == PPC::CRSET) + Op2Set = true; + else if (Op.getMachineOpcode() == PPC::CRUNSET) + Op2Unset = true; + else if (Op.getMachineOpcode() == PPC::CRNOR && + Op.getOperand(0) == Op.getOperand(1)) + Op2Not = true; + } + } // fallthrough + case PPC::BC: + case PPC::BCn: + case PPC::SELECT_I4: + case PPC::SELECT_I8: + case PPC::SELECT_F4: + case PPC::SELECT_F8: + case PPC::SELECT_QFRC: + case PPC::SELECT_QSRC: + case PPC::SELECT_QBRC: + case PPC::SELECT_VRRC: + case PPC::SELECT_VSFRC: + case PPC::SELECT_VSSRC: + case PPC::SELECT_VSRC: { + SDValue Op = MachineNode->getOperand(0); + if (Op.isMachineOpcode()) { + if (Op.getMachineOpcode() == PPC::CRSET) + Op1Set = true; + else if (Op.getMachineOpcode() == PPC::CRUNSET) + Op1Unset = true; + else if (Op.getMachineOpcode() == PPC::CRNOR && + Op.getOperand(0) == Op.getOperand(1)) + Op1Not = true; + } + } + break; + } + + bool SelectSwap = false; + switch (Opcode) { + default: break; + case PPC::CRAND: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // x & x = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Set) + // 1 & y = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Set) + // x & 1 = x + ResNode = 
MachineNode->getOperand(0).getNode(); + else if (Op1Unset || Op2Unset) + // x & 0 = 0 & y = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Not) + // ~x & y = andc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(0). + getOperand(0)); + else if (Op2Not) + // x & ~y = andc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRNAND: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // nand(x, x) -> nor(x, x) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Set) + // nand(1, y) -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Set) + // nand(x, 1) -> nor(x, x) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Unset || Op2Unset) + // nand(x, 0) = nand(0, y) = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Not) + // nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // nand(x, ~y) = ~x | y = orc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1). + getOperand(0), + MachineNode->getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CROR: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // x | x = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Set || Op2Set) + // x | 1 = 1 | y = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Unset) + // 0 | y = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Unset) + // x | 0 = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Not) + // ~x | y = orc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(0). + getOperand(0)); + else if (Op2Not) + // x | ~y = orc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). 
+ getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRXOR: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // xor(x, x) = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Set) + // xor(1, y) -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Set) + // xor(x, 1) -> nor(x, x) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Unset) + // xor(0, y) = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Unset) + // xor(x, 0) = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Not) + // xor(~x, y) = eqv(x, y) + ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // xor(x, ~y) = eqv(x, y) + ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRNOR: + if (Op1Set || Op2Set) + // nor(1, y) -> 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Unset) + // nor(0, y) = ~y -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Unset) + // nor(x, 0) = ~x + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Not) + // nor(~x, y) = andc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // nor(x, ~y) = andc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1). 
+ getOperand(0), + MachineNode->getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CREQV: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // eqv(x, x) = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Set) + // eqv(1, y) = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Set) + // eqv(x, 1) = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Unset) + // eqv(0, y) = ~y -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Unset) + // eqv(x, 0) = ~x + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Not) + // eqv(~x, y) = xor(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // eqv(x, ~y) = xor(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRANDC: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // andc(x, x) = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Set) + // andc(1, y) = ~y + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op1Unset || Op2Set) + // andc(0, y) = andc(x, 1) = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op2Unset) + // andc(x, 0) = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Not) + // andc(~x, y) = ~(x | y) = nor(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // andc(x, ~y) = x & y + ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). 
+                                           getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0)),
+          SelectSwap = true;
+        break;
+      case PPC::CRORC:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // orc(x, x) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set || Op2Unset)
+          // orc(1, y) = orc(x, 0) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op2Set)
+          // orc(x, 1) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Unset)
+          // orc(0, y) = ~y
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op1Not)
+          // orc(~x, y) = ~(x & y) = nand(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // orc(x, ~y) = x | y
+          ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode))
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0)),
+          SelectSwap = true;
+        break;
+      case PPC::SELECT_I4:
+      case PPC::SELECT_I8:
+      case PPC::SELECT_F4:
+      case PPC::SELECT_F8:
+      case PPC::SELECT_QFRC:
+      case PPC::SELECT_QSRC:
+      case PPC::SELECT_QBRC:
+      case PPC::SELECT_VRRC:
+      case PPC::SELECT_VSFRC:
+      case PPC::SELECT_VSSRC:
+      case PPC::SELECT_VSRC:
+        if (Op1Set)
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op1Unset)
+          ResNode = MachineNode->getOperand(2).getNode();
+        else if (Op1Not)
+          ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(),
+                                           SDLoc(MachineNode),
+                                           MachineNode->getValueType(0),
+                                           MachineNode->getOperand(0).
+                                             getOperand(0),
+                                           MachineNode->getOperand(2),
+                                           MachineNode->getOperand(1));
+        break;
+      case PPC::BC:
+      case PPC::BCn:
+        if (Op1Not)
+          ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn :
+                                                               PPC::BC,
+                                           SDLoc(MachineNode),
+                                           MVT::Other,
+                                           MachineNode->getOperand(0).
+                                             getOperand(0),
+                                           MachineNode->getOperand(1),
+                                           MachineNode->getOperand(2));
+        // FIXME: Handle Op1Set, Op1Unset here too.
+        break;
+      }
+
+      // If we're inverting this node because it is used only by selects that
+      // we'd like to swap, then swap the selects before the node replacement.
+      if (SelectSwap)
+        SwapAllSelectUsers(MachineNode);
+
+      if (ResNode != MachineNode) {
+        DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+        DEBUG(MachineNode->dump(CurDAG));
+        DEBUG(dbgs() << "\nNew: ");
+        DEBUG(ResNode->dump(CurDAG));
+        DEBUG(dbgs() << "\n");
+
+        ReplaceUses(MachineNode, ResNode);
+        IsModified = true;
+      }
+    }
+    if (IsModified)
+      CurDAG->RemoveDeadNodes();
+  } while (IsModified);
+}
+
+// Gather the set of 32-bit operations that are known to have their
+// higher-order 32 bits zero, where ToPromote contains all such operations.
+static bool PeepholePPC64ZExtGather(SDValue Op32,
+                                    SmallPtrSetImpl<SDNode *> &ToPromote) {
+  if (!Op32.isMachineOpcode())
+    return false;
+
+  // First, check for the "frontier" instructions (those that will clear the
+  // higher-order 32 bits).
+
+  // For RLWINM and RLWNM, we need to make sure that the mask does not wrap
+  // around. If it does not, then these instructions will clear the
+  // higher-order bits.
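+  // (Illustrative: MB=16, ME=23 gives the non-wrapping 32-bit mask
+  // 0x0000FF00, so the upper 32 bits of the 64-bit result are known zero;
+  // a wrapping mask such as MB=24, ME=7 extends into the upper 32 bits and
+  // provides no such guarantee.)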
+  if ((Op32.getMachineOpcode() == PPC::RLWINM ||
+       Op32.getMachineOpcode() == PPC::RLWNM) &&
+      Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // SLW and SRW always clear the higher-order bits.
+  if (Op32.getMachineOpcode() == PPC::SLW ||
+      Op32.getMachineOpcode() == PPC::SRW) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // For LI and LIS, we need the immediate to be positive (so that it is not
+  // sign extended).
+  if (Op32.getMachineOpcode() == PPC::LI ||
+      Op32.getMachineOpcode() == PPC::LIS) {
+    if (!isUInt<15>(Op32.getConstantOperandVal(0)))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // LHBRX and LWBRX always clear the higher-order bits.
+  if (Op32.getMachineOpcode() == PPC::LHBRX ||
+      Op32.getMachineOpcode() == PPC::LWBRX) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // CNTLZW always produces a 64-bit value in [0,32], and so is zero extended.
+  if (Op32.getMachineOpcode() == PPC::CNTLZW) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // Next, check for those instructions we can look through.
+
+  // If the mask does not wrap around, the higher-order bits are taken
+  // directly from the first operand.
+  if (Op32.getMachineOpcode() == PPC::RLWIMI &&
+      Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // For OR, the higher-order bits are zero if that is true for both operands.
+  // For SELECT_I4, the same is true (but the relevant operand numbers are
+  // shifted by 1).
+  if (Op32.getMachineOpcode() == PPC::OR ||
+      Op32.getMachineOpcode() == PPC::SELECT_I4) {
+    unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1))
+      return false;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // For ORI and ORIS, we need the higher-order bits of the first operand to be
+  // zero, and the constant must also be positive (so that it is not sign
+  // extended).
+  if (Op32.getMachineOpcode() == PPC::ORI ||
+      Op32.getMachineOpcode() == PPC::ORIS) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+      return false;
+    if (!isUInt<15>(Op32.getConstantOperandVal(1)))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // The higher-order bits of AND are zero if that is true for at least one of
+  // the operands.
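+  // (For example, in (AND (LI 1), x) the LI subtree alone proves the result
+  // is zero extended, so only that operand's nodes need to be promoted; the
+  // code below therefore accepts the node if either recursive gather
+  // succeeds.)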
+ if (Op32.getMachineOpcode() == PPC::AND) { + SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2; + bool Op0OK = + PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1); + bool Op1OK = + PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2); + if (!Op0OK && !Op1OK) + return false; + + ToPromote.insert(Op32.getNode()); + + if (Op0OK) + ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); + + if (Op1OK) + ToPromote.insert(ToPromote2.begin(), ToPromote2.end()); + + return true; + } + + // For ANDI and ANDIS, the higher-order bits are zero if either that is true + // of the first operand, or if the second operand is positive (so that it is + // not sign extended). + if (Op32.getMachineOpcode() == PPC::ANDIo || + Op32.getMachineOpcode() == PPC::ANDISo) { + SmallPtrSet<SDNode *, 16> ToPromote1; + bool Op0OK = + PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1); + bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1)); + if (!Op0OK && !Op1OK) + return false; + + ToPromote.insert(Op32.getNode()); + + if (Op0OK) + ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); + + return true; + } + + return false; +} + +void PPCDAGToDAGISel::PeepholePPC64ZExt() { + if (!PPCSubTarget->isPPC64()) + return; + + // When we zero-extend from i32 to i64, we use a pattern like this: + // def : Pat<(i64 (zext i32:$in)), + // (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32), + // 0, 32)>; + // There are several 32-bit shift/rotate instructions, however, that will + // clear the higher-order bits of their output, rendering the RLDICL + // unnecessary. When that happens, we remove it here, and redefine the + // relevant 32-bit operation to be a 64-bit operation. + + SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); + ++Position; + + bool MadeChange = false; + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = &*--Position; + // Skip dead nodes and any non-machine opcodes. + if (N->use_empty() || !N->isMachineOpcode()) + continue; + + if (N->getMachineOpcode() != PPC::RLDICL) + continue; + + if (N->getConstantOperandVal(1) != 0 || + N->getConstantOperandVal(2) != 32) + continue; + + SDValue ISR = N->getOperand(0); + if (!ISR.isMachineOpcode() || + ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG) + continue; + + if (!ISR.hasOneUse()) + continue; + + if (ISR.getConstantOperandVal(2) != PPC::sub_32) + continue; + + SDValue IDef = ISR.getOperand(0); + if (!IDef.isMachineOpcode() || + IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF) + continue; + + // We now know that we're looking at a canonical i32 -> i64 zext. See if we + // can get rid of it. + + SDValue Op32 = ISR->getOperand(1); + if (!Op32.isMachineOpcode()) + continue; + + // There are some 32-bit instructions that always clear the high-order 32 + // bits, there are also some instructions (like AND) that we can look + // through. + SmallPtrSet<SDNode *, 16> ToPromote; + if (!PeepholePPC64ZExtGather(Op32, ToPromote)) + continue; + + // If the ToPromote set contains nodes that have uses outside of the set + // (except for the original INSERT_SUBREG), then abort the transformation. 
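+    // (The nodes in the set are about to be morphed to produce i64 values,
+    // so any i32 user outside the set would be left with an operand of the
+    // wrong type.)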
+    bool OutsideUse = false;
+    for (SDNode *PN : ToPromote) {
+      for (SDNode *UN : PN->uses()) {
+        if (!ToPromote.count(UN) && UN != ISR.getNode()) {
+          OutsideUse = true;
+          break;
+        }
+      }
+
+      if (OutsideUse)
+        break;
+    }
+    if (OutsideUse)
+      continue;
+
+    MadeChange = true;
+
+    // We now know that this zero extension can be removed by promoting the
+    // nodes in ToPromote to 64-bit operations, where for operations in the
+    // frontier of the set, we need to insert INSERT_SUBREGs for their
+    // operands.
+    for (SDNode *PN : ToPromote) {
+      unsigned NewOpcode;
+      switch (PN->getMachineOpcode()) {
+      default:
+        llvm_unreachable("Don't know the 64-bit variant of this instruction");
+      case PPC::RLWINM:    NewOpcode = PPC::RLWINM8; break;
+      case PPC::RLWNM:     NewOpcode = PPC::RLWNM8; break;
+      case PPC::SLW:       NewOpcode = PPC::SLW8; break;
+      case PPC::SRW:       NewOpcode = PPC::SRW8; break;
+      case PPC::LI:        NewOpcode = PPC::LI8; break;
+      case PPC::LIS:       NewOpcode = PPC::LIS8; break;
+      case PPC::LHBRX:     NewOpcode = PPC::LHBRX8; break;
+      case PPC::LWBRX:     NewOpcode = PPC::LWBRX8; break;
+      case PPC::CNTLZW:    NewOpcode = PPC::CNTLZW8; break;
+      case PPC::RLWIMI:    NewOpcode = PPC::RLWIMI8; break;
+      case PPC::OR:        NewOpcode = PPC::OR8; break;
+      case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
+      case PPC::ORI:       NewOpcode = PPC::ORI8; break;
+      case PPC::ORIS:      NewOpcode = PPC::ORIS8; break;
+      case PPC::AND:       NewOpcode = PPC::AND8; break;
+      case PPC::ANDIo:     NewOpcode = PPC::ANDIo8; break;
+      case PPC::ANDISo:    NewOpcode = PPC::ANDISo8; break;
+      }
+
+      // Note: During the replacement process, the nodes will be in an
+      // inconsistent state (some instructions will have operands with values
+      // of the wrong type). Once done, however, everything should be right
+      // again.
+
+      SmallVector<SDValue, 4> Ops;
+      for (const SDValue &V : PN->ops()) {
+        if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
+            !isa<ConstantSDNode>(V)) {
+          SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
+          SDNode *ReplOp =
+            CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
+                                   ISR.getNode()->getVTList(), ReplOpOps);
+          Ops.push_back(SDValue(ReplOp, 0));
+        } else {
+          Ops.push_back(V);
+        }
+      }
+
+      // Because all to-be-promoted nodes only have users that are other
+      // promoted nodes (or the original INSERT_SUBREG), we can safely replace
+      // the i32 result value type with i64.
+
+      SmallVector<EVT, 2> NewVTs;
+      SDVTList VTs = PN->getVTList();
+      for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
+        if (VTs.VTs[i] == MVT::i32)
+          NewVTs.push_back(MVT::i64);
+        else
+          NewVTs.push_back(VTs.VTs[i]);
+
+      DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
+      DEBUG(PN->dump(CurDAG));
+
+      CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
+
+      DEBUG(dbgs() << "\nNew: ");
+      DEBUG(PN->dump(CurDAG));
+      DEBUG(dbgs() << "\n");
+    }
+
+    // Now we replace the original zero extend and its associated INSERT_SUBREG
+    // with the value feeding the INSERT_SUBREG (which has now been promoted to
+    // return an i64).
+
+    DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
+    DEBUG(N->dump(CurDAG));
+    DEBUG(dbgs() << "\nNew: ");
+    DEBUG(Op32.getNode()->dump(CurDAG));
+    DEBUG(dbgs() << "\n");
+
+    ReplaceUses(N, Op32.getNode());
+  }
+
+  if (MadeChange)
+    CurDAG->RemoveDeadNodes();
+}
+
+void PPCDAGToDAGISel::PeepholePPC64() {
+  // These optimizations are currently supported only for 64-bit SVR4.
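+  // The main transformation folds the immediate of a feeding add-immediate
+  // into the displacement field of a D-form memory access; e.g. it can turn
+  // (LD 0, (ADDI8 %x, 8)) into (LD 8, %x).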
+  if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
+    return;
+
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    // Skip dead nodes and any non-machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    unsigned FirstOp;
+    unsigned StorageOpcode = N->getMachineOpcode();
+
+    switch (StorageOpcode) {
+    default: continue;
+
+    case PPC::LBZ:
+    case PPC::LBZ8:
+    case PPC::LD:
+    case PPC::LFD:
+    case PPC::LFS:
+    case PPC::LHA:
+    case PPC::LHA8:
+    case PPC::LHZ:
+    case PPC::LHZ8:
+    case PPC::LWA:
+    case PPC::LWZ:
+    case PPC::LWZ8:
+      FirstOp = 0;
+      break;
+
+    case PPC::STB:
+    case PPC::STB8:
+    case PPC::STD:
+    case PPC::STFD:
+    case PPC::STFS:
+    case PPC::STH:
+    case PPC::STH8:
+    case PPC::STW:
+    case PPC::STW8:
+      FirstOp = 1;
+      break;
+    }
+
+    // If this is a load or store with a zero offset, or with an offset that
+    // stays within the alignment of the feeding global address, we may be
+    // able to fold an add-immediate into the memory operation.
+    // The check against alignment is below, as it can't occur until we check
+    // the arguments to N.
+    if (!isa<ConstantSDNode>(N->getOperand(FirstOp)))
+      continue;
+
+    SDValue Base = N->getOperand(FirstOp + 1);
+    if (!Base.isMachineOpcode())
+      continue;
+
+    // On targets with fusion, we don't want this to fire and remove a fusion
+    // opportunity, unless a) it results in another fusion opportunity or
+    // b) we are optimizing for size.
+    if (PPCSubTarget->hasFusion() &&
+        (!MF->getFunction()->optForSize() && !Base.hasOneUse()))
+      continue;
+
+    unsigned Flags = 0;
+    bool ReplaceFlags = true;
+
+    // When the feeding operation is an add-immediate of some sort,
+    // determine whether we need to add relocation information to the
+    // target flags on the immediate operand when we fold it into the
+    // load instruction.
+    //
+    // For something like ADDItocL, the relocation information is
+    // inferred from the opcode; when we process it in the AsmPrinter,
+    // we add the necessary relocation there. A load, though, can receive
+    // relocation from various flavors of ADDIxxx, so we need to carry
+    // the relocation information in the target flags.
+    switch (Base.getMachineOpcode()) {
+    default: continue;
+
+    case PPC::ADDI8:
+    case PPC::ADDI:
+      // In some cases (such as TLS) the relocation information
+      // is already in place on the operand, so copying the operand
+      // is sufficient.
+      ReplaceFlags = false;
+      // For these cases, the immediate may not be divisible by 4, in
+      // which case the fold is illegal for DS-form instructions. (The
+      // other cases provide aligned addresses and are always safe.)
+      if ((StorageOpcode == PPC::LWA ||
+           StorageOpcode == PPC::LD ||
+           StorageOpcode == PPC::STD) &&
+          (!isa<ConstantSDNode>(Base.getOperand(1)) ||
+           Base.getConstantOperandVal(1) % 4 != 0))
+        continue;
+      break;
+    case PPC::ADDIdtprelL:
+      Flags = PPCII::MO_DTPREL_LO;
+      break;
+    case PPC::ADDItlsldL:
+      Flags = PPCII::MO_TLSLD_LO;
+      break;
+    case PPC::ADDItocL:
+      Flags = PPCII::MO_TOC_LO;
+      break;
+    }
+
+    SDValue ImmOpnd = Base.getOperand(1);
+    int MaxDisplacement = 0;
+    if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+      const GlobalValue *GV = GA->getGlobal();
+      MaxDisplacement = GV->getAlignment() - 1;
+    }
+
+    int Offset = N->getConstantOperandVal(FirstOp);
+    if (Offset < 0 || Offset > MaxDisplacement)
+      continue;
+
+    // We found an opportunity. Reverse the operands from the add
+    // immediate and substitute them into the load or store.
If + // needed, update the target flags for the immediate operand to + // reflect the necessary relocation information. + DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: "); + DEBUG(Base->dump(CurDAG)); + DEBUG(dbgs() << "\nN: "); + DEBUG(N->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + // If the relocation information isn't already present on the + // immediate operand, add it now. + if (ReplaceFlags) { + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { + SDLoc dl(GA); + const GlobalValue *GV = GA->getGlobal(); + // We can't perform this optimization for data whose alignment + // is insufficient for the instruction encoding. + if (GV->getAlignment() < 4 && + (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD || + StorageOpcode == PPC::LWA || (Offset % 4) != 0)) { + DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n"); + continue; + } + ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags); + } else if (ConstantPoolSDNode *CP = + dyn_cast<ConstantPoolSDNode>(ImmOpnd)) { + const Constant *C = CP->getConstVal(); + ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, + CP->getAlignment(), + Offset, Flags); + } + } + + if (FirstOp == 1) // Store + (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd, + Base.getOperand(0), N->getOperand(3)); + else // Load + (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0), + N->getOperand(2)); + + // The add-immediate may now be dead, in which case remove it. + if (Base.getNode()->use_empty()) + CurDAG->RemoveDeadNode(Base.getNode()); + } +} + + +/// createPPCISelDag - This pass converts a legalized DAG into a +/// PowerPC-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { + return new PPCDAGToDAGISel(TM); +} + +static void initializePassOnce(PassRegistry &Registry) { + const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection"; + PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID, + nullptr, false, false); + Registry.registerPass(*PI, true); +} + +void llvm::initializePPCDAGToDAGISelPass(PassRegistry &Registry) { + CALL_ONCE_INITIALIZATION(initializePassOnce); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp new file mode 100644 index 0000000..af9ad07 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -0,0 +1,11590 @@ +//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPCISelLowering class. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPerfectShuffle.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
+cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+
+// FIXME: Remove this once the bug has been fixed!
+extern cl::opt<bool> ANDIGlueBug;
+
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
+                                     const PPCSubtarget &STI)
+    : TargetLowering(TM), Subtarget(STI) {
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
+  // arguments are at least 4/8 bytes aligned.
+  bool isPPC64 = Subtarget.isPPC64();
+  setMinStackArgumentAlignment(isPPC64 ? 8 : 4);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+  if (!Subtarget.useSoftFloat()) {
+    addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+    addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+  }
+
+  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+  }
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PowerPC has pre-inc loads and stores.
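+  // (e.g. lwzu/stwu, which also write the updated effective address back to
+  // the base register; these are what the PRE_INC indexed actions below
+  // select.)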
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); + + if (Subtarget.useCRBits()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + if (isPPC64 || Subtarget.hasFPCVT()) { + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); + AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); + AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + } else { + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); + } + + // PowerPC does not support direct load / store of condition registers + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::i1, Custom); + + // FIXME: Remove this once the ANDI glue bug is fixed: + if (ANDIGlueBug) + setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); + + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setTruncStoreAction(VT, MVT::i1, Expand); + } + + addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass); + } + + // This is used in the ppcf128->int sequence. Note it has different semantics + // from FP_ROUND: that rounds to nearest, this rounds to zero. + setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom); + + // We do not currently implement these libm ops for PowerPC. + setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); + setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand); + setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); + setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); + setOperationAction(ISD::FREM, MVT::ppcf128, Expand); + + // PowerPC has no SREM/UREM instructions + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. 
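+  // With these marked Expand, SREM/UREM are instead lowered via the identity
+  // a % b == a - (a / b) * b, i.e. a divide, a multiply and a subtract.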
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + + // We don't support sin/cos/sqrt/fmod/pow + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FREM , MVT::f64, Expand); + setOperationAction(ISD::FPOW , MVT::f64, Expand); + setOperationAction(ISD::FMA , MVT::f64, Legal); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + setOperationAction(ISD::FREM , MVT::f32, Expand); + setOperationAction(ISD::FPOW , MVT::f32, Expand); + setOperationAction(ISD::FMA , MVT::f32, Legal); + + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); + + // If we're enabling GP optimizations, use hardware square root + if (!Subtarget.hasFSQRT() && + !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && + Subtarget.hasFRE())) + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + + if (!Subtarget.hasFSQRT() && + !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && + Subtarget.hasFRES())) + setOperationAction(ISD::FSQRT, MVT::f32, Expand); + + if (Subtarget.hasFCPSGN()) { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); + } else { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + } + + if (Subtarget.hasFPRND()) { + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FROUND, MVT::f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FROUND, MVT::f32, Legal); + } + + // PowerPC does not have BSWAP, CTPOP or CTTZ + setOperationAction(ISD::BSWAP, MVT::i32 , Expand); + setOperationAction(ISD::CTTZ , MVT::i32 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i64 , Expand); + setOperationAction(ISD::CTTZ , MVT::i64 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + + if (Subtarget.hasPOPCNTD()) { + setOperationAction(ISD::CTPOP, MVT::i32 , Legal); + setOperationAction(ISD::CTPOP, MVT::i64 , Legal); + } else { + setOperationAction(ISD::CTPOP, MVT::i32 , Expand); + setOperationAction(ISD::CTPOP, MVT::i64 , Expand); + } + + // PowerPC does not have ROTR + setOperationAction(ISD::ROTR, MVT::i32 , Expand); + setOperationAction(ISD::ROTR, MVT::i64 , Expand); + + if (!Subtarget.useCRBits()) { + // PowerPC does not have Select + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::i64, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + } + + // PowerPC wants to turn select_cc of FP into fsel when possible. 
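+  // (fsel computes FRT = (FRA >= 0.0) ? FRC : FRB, so, for instance,
+  // (select_cc a, 0.0, x, y, setge) can become fsel(a, x, y) when the
+  // comparison is known safe with respect to NaNs and -0.0.)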
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // PowerPC wants to optimize integer setcc a bit.
+  if (!Subtarget.useCRBits())
+    setOperationAction(ISD::SETCC, MVT::i32, Custom);
+
+  // PowerPC does not have BRCOND, which requires SetCC.
+  if (!Subtarget.useCRBits())
+    setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  // PowerPC does not have [U|S]INT_TO_FP.
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  if (Subtarget.hasDirectMove()) {
+    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
+    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
+    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
+    setOperationAction(ISD::BITCAST, MVT::f64, Legal);
+  } else {
+    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+  }
+
+  // We cannot sextinreg(i1). Expand to shifts.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuation, user-level threading, and so on. As a result, no
+  // other SjLj exception interfaces are implemented; please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  // We want to legalize GlobalAddress and ConstantPool nodes into the
+  // appropriate instructions to materialize the address.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+  // TRAP is legal.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
+  if (Subtarget.isSVR4ABI()) {
+    if (isPPC64) {
+      // VAARG always uses double-word chunks, so promote anything smaller.
+ setOperationAction(ISD::VAARG, MVT::i1, Promote); + AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i8, Promote); + AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i16, Promote); + AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i32, Promote); + AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + } else { + // VAARG is custom lowered with the 32-bit SVR4 ABI. + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::i64, Custom); + } + } else + setOperationAction(ISD::VAARG, MVT::Other, Expand); + + if (Subtarget.isSVR4ABI() && !isPPC64) + // VACOPY is custom lowered with the 32-bit SVR4 ABI. + setOperationAction(ISD::VACOPY , MVT::Other, Custom); + else + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + + // Use the default implementation. + setOperationAction(ISD::VAEND , MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); + + // We want to custom lower some of our intrinsics. + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // To handle counter-based loop conditions. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); + + // Comparisons that require checking two conditions. + setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + + if (Subtarget.has64BitSupport()) { + // They also have instructions for converting between i64 and fp. + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); + // This is just the low 32 bits of a (signed) fp->i64 conversion. + // We cannot do this with Promote because i64 is not a legal type. + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + + if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + } else { + // PowerPC does not have FP_TO_UINT on 32-bit implementations. + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + } + + // With the instructions enabled under FPCVT, we can do everything. 
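+  // (FPCVT supplies the unsigned and single-precision members of the
+  // fcfid/fctid/fctiw conversion families, which is what makes the remaining
+  // Custom lowerings below possible.)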
+ if (Subtarget.hasFPCVT()) { + if (Subtarget.has64BitSupport()) { + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + } + + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + } + + if (Subtarget.use64BitRegs()) { + // 64-bit PowerPC implementations can support i64 types directly + addRegisterClass(MVT::i64, &PPC::G8RCRegClass); + // BUILD_PAIR can't be handled natively, and should be expanded to shl/or + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); + // 64-bit PowerPC wants to expand i128 shifts itself. + setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); + } else { + // 32-bit PowerPC wants to expand i64 shifts itself. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + } + + if (Subtarget.hasAltivec()) { + // First set operation action for all vector types to expand. Then we + // will selectively turn on ones that can be effectively codegen'd. + for (MVT VT : MVT::vector_valuetypes()) { + // add/sub are legal for all supported vector VT's. + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); + + // Vector instructions introduced in P8 + if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { + setOperationAction(ISD::CTPOP, VT, Legal); + setOperationAction(ISD::CTLZ, VT, Legal); + } + else { + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + // We promote all shuffles to v16i8. + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote); + AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8); + + // We promote all non-typed operations to v4i32. + setOperationAction(ISD::AND , VT, Promote); + AddPromotedToType (ISD::AND , VT, MVT::v4i32); + setOperationAction(ISD::OR , VT, Promote); + AddPromotedToType (ISD::OR , VT, MVT::v4i32); + setOperationAction(ISD::XOR , VT, Promote); + AddPromotedToType (ISD::XOR , VT, MVT::v4i32); + setOperationAction(ISD::LOAD , VT, Promote); + AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); + setOperationAction(ISD::SELECT_CC, VT, Promote); + AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType (ISD::STORE, VT, MVT::v4i32); + + // No other operations are legal. 
+ setOperationAction(ISD::MUL , VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::BUILD_VECTOR, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } + } + + // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle + // with merges, splats, etc. + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); + + setOperationAction(ISD::AND , MVT::v4i32, Legal); + setOperationAction(ISD::OR , MVT::v4i32, Legal); + setOperationAction(ISD::XOR , MVT::v4i32, Legal); + setOperationAction(ISD::LOAD , MVT::v4i32, Legal); + setOperationAction(ISD::SELECT, MVT::v4i32, + Subtarget.useCRBits() ? 
Legal : Expand); + setOperationAction(ISD::STORE , MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); + addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); + + setOperationAction(ISD::MUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + + if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + } + + if (Subtarget.hasP8Altivec()) + setOperationAction(ISD::MUL, MVT::v4i32, Legal); + else + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + + setOperationAction(ISD::MUL, MVT::v8i16, Custom); + setOperationAction(ISD::MUL, MVT::v16i8, Custom); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + + // Altivec does not contain unordered floating-point compare instructions + setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); + + if (Subtarget.hasVSX()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + if (Subtarget.hasP8Vector()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + } + if (Subtarget.hasDirectMove()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + } + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); + setOperationAction(ISD::FROUND, MVT::v2f64, Legal); + + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::MUL, MVT::v2f64, Legal); + setOperationAction(ISD::FMA, MVT::v2f64, Legal); + + setOperationAction(ISD::FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); + + setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); + setOperationAction(ISD::VSELECT, MVT::v8i16, Legal); + 
setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); + setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); + + // Share the Altivec comparison restrictions. + setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); + setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); + setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); + + setOperationAction(ISD::LOAD, MVT::v2f64, Legal); + setOperationAction(ISD::STORE, MVT::v2f64, Legal); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal); + + if (Subtarget.hasP8Vector()) + addRegisterClass(MVT::f32, &PPC::VSSRCRegClass); + + addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); + + addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass); + addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); + addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); + + if (Subtarget.hasP8Altivec()) { + setOperationAction(ISD::SHL, MVT::v2i64, Legal); + setOperationAction(ISD::SRA, MVT::v2i64, Legal); + setOperationAction(ISD::SRL, MVT::v2i64, Legal); + + setOperationAction(ISD::SETCC, MVT::v2i64, Legal); + } + else { + setOperationAction(ISD::SHL, MVT::v2i64, Expand); + setOperationAction(ISD::SRA, MVT::v2i64, Expand); + setOperationAction(ISD::SRL, MVT::v2i64, Expand); + + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + + // VSX v2i64 only supports non-arithmetic operations. + setOperationAction(ISD::ADD, MVT::v2i64, Expand); + setOperationAction(ISD::SUB, MVT::v2i64, Expand); + } + + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); + + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + + // Vector operation legalization checks the result type of + // SIGN_EXTEND_INREG, overall legalization checks the inner type. 
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + + addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); + } + + if (Subtarget.hasP8Altivec()) { + addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass); + addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass); + } + } + + if (Subtarget.hasQPX()) { + setOperationAction(ISD::FADD, MVT::v4f64, Legal); + setOperationAction(ISD::FSUB, MVT::v4f64, Legal); + setOperationAction(ISD::FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::FREM, MVT::v4f64, Expand); + + setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal); + setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); + + setOperationAction(ISD::LOAD , MVT::v4f64, Custom); + setOperationAction(ISD::STORE , MVT::v4f64, Custom); + + setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); + + if (!Subtarget.useCRBits()) + setOperationAction(ISD::SELECT, MVT::v4f64, Expand); + setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); + setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); + + setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); + setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); + + setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); + setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); + + setOperationAction(ISD::FNEG , MVT::v4f64, Legal); + setOperationAction(ISD::FABS , MVT::v4f64, Legal); + setOperationAction(ISD::FSIN , MVT::v4f64, Expand); + setOperationAction(ISD::FCOS , MVT::v4f64, Expand); + setOperationAction(ISD::FPOWI , MVT::v4f64, Expand); + setOperationAction(ISD::FPOW , MVT::v4f64, Expand); + setOperationAction(ISD::FLOG , MVT::v4f64, Expand); + setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); + setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); + setOperationAction(ISD::FEXP , MVT::v4f64, Expand); + setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); + + setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); + + setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); + + addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); + + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FREM, MVT::v4f32, Expand); + + setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); + setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); + + setOperationAction(ISD::LOAD , MVT::v4f32, Custom); + setOperationAction(ISD::STORE , MVT::v4f32, Custom); + + if (!Subtarget.useCRBits()) + setOperationAction(ISD::SELECT, MVT::v4f32, Expand); + setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); 
+ setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + + setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); + setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); + + setOperationAction(ISD::FNEG , MVT::v4f32, Legal); + setOperationAction(ISD::FABS , MVT::v4f32, Legal); + setOperationAction(ISD::FSIN , MVT::v4f32, Expand); + setOperationAction(ISD::FCOS , MVT::v4f32, Expand); + setOperationAction(ISD::FPOWI , MVT::v4f32, Expand); + setOperationAction(ISD::FPOW , MVT::v4f32, Expand); + setOperationAction(ISD::FLOG , MVT::v4f32, Expand); + setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); + setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); + setOperationAction(ISD::FEXP , MVT::v4f32, Expand); + setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); + + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + + setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); + + addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); + + setOperationAction(ISD::AND , MVT::v4i1, Legal); + setOperationAction(ISD::OR , MVT::v4i1, Legal); + setOperationAction(ISD::XOR , MVT::v4i1, Legal); + + if (!Subtarget.useCRBits()) + setOperationAction(ISD::SELECT, MVT::v4i1, Expand); + setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); + + setOperationAction(ISD::LOAD , MVT::v4i1, Custom); + setOperationAction(ISD::STORE , MVT::v4i1, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); + setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + + addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); + + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); + setOperationAction(ISD::FROUND, MVT::v4f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); + + // These need to set FE_INEXACT, and so cannot be vectorized here. 
+ setOperationAction(ISD::FRINT, MVT::v4f64, Expand); + setOperationAction(ISD::FRINT, MVT::v4f32, Expand); + + if (TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + } else { + setOperationAction(ISD::FDIV, MVT::v4f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); + + setOperationAction(ISD::FDIV, MVT::v4f32, Expand); + setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); + } + } + + if (Subtarget.has64BitSupport()) + setOperationAction(ISD::PREFETCH, MVT::Other, Legal); + + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom); + + if (!isPPC64) { + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + } + + setBooleanContents(ZeroOrOneBooleanContent); + + if (Subtarget.hasAltivec()) { + // Altivec instructions set fields to all zeros or all ones. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + } + + if (!isPPC64) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + + setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); + + // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::SINT_TO_FP); + if (Subtarget.hasFPCVT()) + setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::BR_CC); + if (Subtarget.useCRBits()) + setTargetDAGCombine(ISD::BRCOND); + setTargetDAGCombine(ISD::BSWAP); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); + + if (Subtarget.useCRBits()) { + setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::SELECT_CC); + } + + // Use reciprocal estimates. + if (TM.Options.UnsafeFPMath) { + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::FSQRT); + } + + // Darwin long double math library functions have $LDBL128 appended. + if (Subtarget.isDarwin()) { + setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); + setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); + setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); + setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128"); + setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128"); + setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128"); + setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128"); + setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128"); + setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128"); + setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128"); + } + + // With 32 condition bits, we don't need to sink (and duplicate) compares + // aggressively in CodeGenPrep. 
+ if (Subtarget.useCRBits()) { + setHasMultipleConditionRegisters(); + setJumpIsExpensive(); + } + + setMinFunctionAlignment(2); + if (Subtarget.isDarwin()) + setPrefFunctionAlignment(4); + + switch (Subtarget.getDarwinDirective()) { + default: break; + case PPC::DIR_970: + case PPC::DIR_A2: + case PPC::DIR_E500mc: + case PPC::DIR_E5500: + case PPC::DIR_PWR4: + case PPC::DIR_PWR5: + case PPC::DIR_PWR5X: + case PPC::DIR_PWR6: + case PPC::DIR_PWR6X: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: + setPrefFunctionAlignment(4); + setPrefLoopAlignment(4); + break; + } + + setInsertFencesForAtomic(true); + + if (Subtarget.enableMachineScheduler()) + setSchedulingPreference(Sched::Source); + else + setSchedulingPreference(Sched::Hybrid); + + computeRegisterProperties(STI.getRegisterInfo()); + + // The Freescale cores do better with aggressive inlining of memcpy and + // friends. GCC uses the same threshold of 128 bytes (= 32 word stores). + if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || + Subtarget.getDarwinDirective() == PPC::DIR_E5500) { + MaxStoresPerMemset = 32; + MaxStoresPerMemsetOptSize = 16; + MaxStoresPerMemcpy = 32; + MaxStoresPerMemcpyOptSize = 8; + MaxStoresPerMemmove = 32; + MaxStoresPerMemmoveOptSize = 8; + } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) { + // The A2 also benefits from (very) aggressive inlining of memcpy and + // friends. The overhead of the function call, even when warm, can be + // over one hundred cycles. + MaxStoresPerMemset = 128; + MaxStoresPerMemcpy = 128; + MaxStoresPerMemmove = 128; + } +} + +/// getMaxByValAlign - Helper for getByValTypeAlignment to determine +/// the desired ByVal argument alignment. +static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, + unsigned MaxMaxAlign) { + if (MaxAlign == MaxMaxAlign) + return; + if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { + if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256) + MaxAlign = 32; + else if (VTy->getBitWidth() >= 128 && MaxAlign < 16) + MaxAlign = 16; + } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + unsigned EltAlign = 0; + getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + } else if (StructType *STy = dyn_cast<StructType>(Ty)) { + for (auto *EltTy : STy->elements()) { + unsigned EltAlign = 0; + getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + if (MaxAlign == MaxMaxAlign) + break; + } + } +} + +/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate +/// function arguments in the caller parameter area. +unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const { + // Darwin passes everything on a 4-byte boundary. + if (Subtarget.isDarwin()) + return 4; + + // 16-byte and wider vectors are passed on a 16-byte boundary. + // Everything else is passed on an 8-byte boundary on PPC64 and a 4-byte + // boundary on PPC32. + unsigned Align = Subtarget.isPPC64() ? 8 : 4; + if (Subtarget.hasAltivec() || Subtarget.hasQPX()) + getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ?
32 : 16); + return Align; +} + +bool PPCTargetLowering::useSoftFloat() const { + return Subtarget.useSoftFloat(); +} + +const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((PPCISD::NodeType)Opcode) { + case PPCISD::FIRST_NUMBER: break; + case PPCISD::FSEL: return "PPCISD::FSEL"; + case PPCISD::FCFID: return "PPCISD::FCFID"; + case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; + case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; + case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; + case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; + case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; + case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; + case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; + case PPCISD::FRE: return "PPCISD::FRE"; + case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; + case PPCISD::STFIWX: return "PPCISD::STFIWX"; + case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; + case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; + case PPCISD::VPERM: return "PPCISD::VPERM"; + case PPCISD::CMPB: return "PPCISD::CMPB"; + case PPCISD::Hi: return "PPCISD::Hi"; + case PPCISD::Lo: return "PPCISD::Lo"; + case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; + case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; + case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; + case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; + case PPCISD::SRL: return "PPCISD::SRL"; + case PPCISD::SRA: return "PPCISD::SRA"; + case PPCISD::SHL: return "PPCISD::SHL"; + case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; + case PPCISD::CALL: return "PPCISD::CALL"; + case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; + case PPCISD::MTCTR: return "PPCISD::MTCTR"; + case PPCISD::BCTRL: return "PPCISD::BCTRL"; + case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; + case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; + case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; + case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; + case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; + case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; + case PPCISD::MFVSR: return "PPCISD::MFVSR"; + case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; + case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; + case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT"; + case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT"; + case PPCISD::VCMP: return "PPCISD::VCMP"; + case PPCISD::VCMPo: return "PPCISD::VCMPo"; + case PPCISD::LBRX: return "PPCISD::LBRX"; + case PPCISD::STBRX: return "PPCISD::STBRX"; + case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; + case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; + case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; + case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; + case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; + case PPCISD::BDNZ: return "PPCISD::BDNZ"; + case PPCISD::BDZ: return "PPCISD::BDZ"; + case PPCISD::MFFS: return "PPCISD::MFFS"; + case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; + case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; + case PPCISD::CR6SET: return "PPCISD::CR6SET"; + case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; + case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; + case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; + case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; + case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; + case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; + case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; + case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; + case 
PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; + case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; + case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; + case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; + case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; + case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; + case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; + case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; + case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; + case PPCISD::SC: return "PPCISD::SC"; + case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; + case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; + case PPCISD::RFEBB: return "PPCISD::RFEBB"; + case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; + case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; + case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; + case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; + case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; + case PPCISD::QBFLT: return "PPCISD::QBFLT"; + case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; + } + return nullptr; +} + +EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, + EVT VT) const { + if (!VT.isVector()) + return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; + + if (Subtarget.hasQPX()) + return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); + + return VT.changeVectorElementTypeToInteger(); +} + +bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const { + assert(VT.isFloatingPoint() && "Non-floating-point FMA?"); + return true; +} + +//===----------------------------------------------------------------------===// +// Node matching predicates, for use by the tblgen matching code. +//===----------------------------------------------------------------------===// + +/// isFloatingPointZero - Return true if this is 0.0 or -0.0. +static bool isFloatingPointZero(SDValue Op) { + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) + return CFP->getValueAPF().isZero(); + else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { + // Maybe this has already been legalized into the constant pool? + if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1))) + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) + return CFP->getValueAPF().isZero(); + } + return false; +} + +/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return +/// true if Op is undef or if it matches the specified value. +static bool isConstantOrUndef(int Op, int Val) { + return Op < 0 || Op == Val; +} + +/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a +/// VPKUHUM instruction. +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operations with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 
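+/// For example, with ShuffleKind 0 (big-endian, two different inputs) the +/// mask must select the odd-numbered bytes of the concatenated inputs: +/// <1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31>.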
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { + bool IsLE = DAG.getDataLayout().isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; + for (unsigned i = 0; i != 16; ++i) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) + return false; + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; ++i) + if (!isConstantOrUndef(N->getMaskElt(i), i*2)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 1; + for (unsigned i = 0; i != 8; ++i) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) + return false; + } + return true; +} + +/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a +/// VPKUWUM instruction. +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operations with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { + bool IsLE = DAG.getDataLayout().isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; + for (unsigned i = 0; i != 16; i += 2) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) + return false; + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; i += 2) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 0 : 2; + for (unsigned i = 0; i != 8; i += 2) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) + return false; + } + return true; +} + +/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a +/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the +/// current subtarget. +/// +/// The ShuffleKind distinguishes between big-endian operations with +/// two different inputs (0), either-endian operations with two identical +/// inputs (1), and little-endian operations with two different inputs (2). +/// For the latter, the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { + const PPCSubtarget& Subtarget = + static_cast<const PPCSubtarget&>(DAG.getSubtarget()); + if (!Subtarget.hasP8Vector()) + return false; + + bool IsLE = DAG.getDataLayout().isLittleEndian(); + if (ShuffleKind == 0) { + if (IsLE) + return false; + for (unsigned i = 0; i != 16; i += 4) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) || + !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) || + !isConstantOrUndef(N->getMaskElt(i+3), i*2+7)) + return false; + } else if (ShuffleKind == 2) { + if (!IsLE) + return false; + for (unsigned i = 0; i != 16; i += 4) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) || + !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) || + !isConstantOrUndef(N->getMaskElt(i+3), i*2+3)) + return false; + } else if (ShuffleKind == 1) { + unsigned j = IsLE ? 
0 : 4; + for (unsigned i = 0; i != 8; i += 4) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || + !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) || + !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) || + !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) || + !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3)) + return false; + } + return true; +} + +/// isVMerge - Common function, used to match vmrg* shuffles. +/// +static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, + unsigned LHSStart, unsigned RHSStart) { + if (N->getValueType(0) != MVT::v16i8) + return false; + assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) && + "Unsupported merge size!"); + + for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units + for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit + if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j), + LHSStart+j+i*UnitSize) || + !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j), + RHSStart+j+i*UnitSize)) + return false; + } + return true; +} + +/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for +/// a VMRGL* instruction with the specified unit size (1, 2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getDataLayout().isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 2) // swapped + return isVMerge(N, UnitSize, 0, 16); + else + return false; + } else { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 0) // normal + return isVMerge(N, UnitSize, 8, 24); + else + return false; + } +} + +/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for +/// a VMRGH* instruction with the specified unit size (1, 2 or 4 bytes). +/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getDataLayout().isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 2) // swapped + return isVMerge(N, UnitSize, 8, 24); + else + return false; + } else { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 0) // normal + return isVMerge(N, UnitSize, 0, 16); + else + return false; + } +} + +/** + * \brief Common function used to match vmrgew and vmrgow shuffles + * + * The indexOffset determines whether to look for even or odd words in + * the shuffle mask. This is based on the endianness of the target + * machine.
+ * - Little Endian: + * - Use offset of 0 to check for odd elements + * - Use offset of 4 to check for even elements + * - Big Endian: + * - Use offset of 0 to check for even elements + * - Use offset of 4 to check for odd elements + * A detailed description of the vector element ordering for little endian and + * big endian can be found in "Targeting your applications - what little endian + * and big endian IBM XL C/C++ compiler differences mean to you" at + * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html + * + * The mask to the shuffle vector instruction specifies the indices of the + * elements from the two input vectors to place in the result. The elements are + * numbered in array-access order, starting with the first vector. These vectors + * are always of type v16i8, thus each vector will contain 16 elements of 8 + * bits each. More info on the shuffle vector instruction can be found in the + * Language Reference: + * http://llvm.org/docs/LangRef.html#shufflevector-instruction + * + * The RHSStartValue indicates whether the same input vectors are used (unary) + * or two different input vectors are used, based on the following: + * - If the instruction uses the same vector for both inputs, the range of the + * indices will be 0 to 15. In this case, the RHSStart value passed should + * be 0. + * - If the instruction has two different vectors then the range of the + * indices will be 0 to 31. In this case, the RHSStart value passed should + * be 16 (indices 0-15 specify elements in the first vector while indices 16 + * to 31 specify elements in the second vector). + * + * \param[in] N The shuffle vector SD Node to analyze + * \param[in] IndexOffset Specifies whether to look for even or odd elements + * \param[in] RHSStartValue Specifies the starting index for the right-hand + * input vector to the shuffle_vector instruction + * \return true iff this shuffle vector represents an even or odd word merge + */ +static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, + unsigned RHSStartValue) { + if (N->getValueType(0) != MVT::v16i8) + return false; + + for (unsigned i = 0; i < 2; ++i) + for (unsigned j = 0; j < 4; ++j) + if (!isConstantOrUndef(N->getMaskElt(i*4+j), + i*RHSStartValue+j+IndexOffset) || + !isConstantOrUndef(N->getMaskElt(i*4+j+8), + i*RHSStartValue+j+IndexOffset+8)) + return false; + return true; +} + +/** + * \brief Determine if the specified shuffle mask is suitable for the vmrgew or + * vmrgow instructions. + * + * \param[in] N The shuffle vector SD Node to analyze + * \param[in] CheckEven Check for an even merge (true) or an odd merge (false) + * \param[in] ShuffleKind Identify the type of merge: + * - 0 = big-endian merge with two different inputs; + * - 1 = either-endian merge with two identical inputs; + * - 2 = little-endian merge with two different inputs (inputs are swapped for + * little-endian merges). + * \param[in] DAG The current SelectionDAG + * \return true iff this shuffle mask represents an even or odd word merge + */ +bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getDataLayout().isLittleEndian()) { + unsigned indexOffset = CheckEven ? 4 : 0; + if (ShuffleKind == 1) // Unary + return isVMerge(N, indexOffset, 0); + else if (ShuffleKind == 2) // swapped + return isVMerge(N, indexOffset, 16); + else + return false; + } + else { + unsigned indexOffset = CheckEven ?
0 : 4; + if (ShuffleKind == 1) // Unary + return isVMerge(N, indexOffset, 0); + else if (ShuffleKind == 0) // Normal + return isVMerge(N, indexOffset, 16); + else + return false; + } + return false; +} + +/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift +/// amount, otherwise return -1. +/// The ShuffleKind distinguishes between big-endian operations with two +/// different inputs (0), either-endian operations with two identical inputs +/// (1), and little-endian operations with two different inputs (2). For the +/// latter, the input operands are swapped (see PPCInstrAltivec.td). +int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { + if (N->getValueType(0) != MVT::v16i8) + return -1; + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + + // Find the first non-undef value in the shuffle mask. + unsigned i; + for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) + /*search*/; + + if (i == 16) return -1; // all undef. + + // Otherwise, check to see if the rest of the elements are consecutively + // numbered from this value. + unsigned ShiftAmt = SVOp->getMaskElt(i); + if (ShiftAmt < i) return -1; + + ShiftAmt -= i; + bool isLE = DAG.getDataLayout().isLittleEndian(); + + if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) + return -1; + } else if (ShuffleKind == 1) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) + return -1; + } else + return -1; + + if (isLE) + ShiftAmt = 16 - ShiftAmt; + + return ShiftAmt; +} + +/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a splat of a single element that is suitable for input to +/// VSPLTB/VSPLTH/VSPLTW. +bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { + assert(N->getValueType(0) == MVT::v16i8 && + (EltSize == 1 || EltSize == 2 || EltSize == 4)); + + // The consecutive indices need to specify an element, not part of two + // different elements. So abandon ship early if this isn't the case. + if (N->getMaskElt(0) % EltSize != 0) + return false; + + // This is a splat operation if each element of the permute is the same, and + // if the value doesn't reference the second vector. + unsigned ElementBase = N->getMaskElt(0); + + // FIXME: Handle UNDEF elements too! + if (ElementBase >= 16) + return false; + + // Check that the indices are consecutive, in the case of a multi-byte element + // splatted with a v16i8 mask. + for (unsigned i = 1; i != EltSize; ++i) + if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) + return false; + + for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { + if (N->getMaskElt(i) < 0) continue; + for (unsigned j = 0; j != EltSize; ++j) + if (N->getMaskElt(i+j) != N->getMaskElt(j)) + return false; + } + return true; +} + +/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the +/// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 
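+/// For example, with EltSize == 4 and a mask whose first element is 8, this +/// returns 8/4 == 2 on big-endian targets and (16/4 - 1) - 2 == 1 on +/// little-endian targets, where the element numbering is reversed.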
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + assert(isSplatShuffleMask(SVOp, EltSize)); + if (DAG.getDataLayout().isLittleEndian()) + return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); + else + return SVOp->getMaskElt(0) / EltSize; +} + +/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed +/// by using a vspltis[bhw] instruction of the specified element size, return +/// the constant being splatted. The ByteSize field indicates the number of +/// bytes of each element [124] -> [bhw]. +SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { + SDValue OpVal(nullptr, 0); + + // If ByteSize of the splat is bigger than the element size of the + // build_vector, then we have a case where we are checking for a splat where + // multiple elements of the buildvector are folded together into a single + // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8). + unsigned EltSize = 16/N->getNumOperands(); + if (EltSize < ByteSize) { + unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. + SDValue UniquedVals[4]; + assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); + + // See if all of the elements in the buildvector agree across each chunk. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + // If the element isn't a constant, bail fully out. + if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue(); + + if (!UniquedVals[i&(Multiple-1)].getNode()) + UniquedVals[i&(Multiple-1)] = N->getOperand(i); + else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) + return SDValue(); // no match. + } + + // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains + // either constant or undef values that are identical for each chunk. See + // if these chunks can form into a larger vspltis*. + + // Check to see if all of the leading entries are either 0 or -1. If + // neither, then this won't fit into the immediate field. + bool LeadingZero = true; + bool LeadingOnes = true; + for (unsigned i = 0; i != Multiple-1; ++i) { + if (!UniquedVals[i].getNode()) continue; // Must have been undefs. + + LeadingZero &= isNullConstant(UniquedVals[i]); + LeadingOnes &= isAllOnesConstant(UniquedVals[i]); + } + // Finally, check the least significant entry. + if (LeadingZero) { + if (!UniquedVals[Multiple-1].getNode()) + return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef + int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue(); + if (Val < 16) // 0,0,0,4 -> vspltisw(4) + return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); + } + if (LeadingOnes) { + if (!UniquedVals[Multiple-1].getNode()) + return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef + int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue(); + if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) + return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); + } + + return SDValue(); + } + + // Check to see if this buildvec has a single non-undef value in its elements. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (!OpVal.getNode()) + OpVal = N->getOperand(i); + else if (OpVal != N->getOperand(i)) + return SDValue(); + } + + if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
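+ + // OpVal now holds the unique non-undef element of the build vector. Extract + // its bit pattern so we can check below whether it replicates into a 5-bit + // signed splat immediate.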
+ unsigned ValSizeInBytes = EltSize; + uint64_t Value = 0; + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) { + Value = CN->getZExtValue(); + } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) { + assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); + Value = FloatToBits(CN->getValueAPF().convertToFloat()); + } + + // If the splat value is larger than the element value, then we can never do + // this splat. The only case that we could fit the replicated bits into our + // immediate field for would be zero, and we prefer to use vxor for it. + if (ValSizeInBytes < ByteSize) return SDValue(); + + // If the element value is larger than the splat value, check if it consists + // of a repeated bit pattern of size ByteSize. + if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) + return SDValue(); + + // Properly sign extend the value. + int MaskVal = SignExtend32(Value, ByteSize * 8); + + // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. + if (MaskVal == 0) return SDValue(); + + // Finally, if this value fits in a 5-bit sext field, return it. + if (SignExtend32<5>(MaskVal) == MaskVal) + return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32); + return SDValue(); +} + +/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift +/// amount, otherwise return -1. +int PPC::isQVALIGNIShuffleMask(SDNode *N) { + EVT VT = N->getValueType(0); + if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) + return -1; + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + + // Find the first non-undef value in the shuffle mask. + unsigned i; + for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) + /*search*/; + + if (i == 4) return -1; // all undef. + + // Otherwise, check to see if the rest of the elements are consecutively + // numbered from this value. + unsigned ShiftAmt = SVOp->getMaskElt(i); + if (ShiftAmt < i) return -1; + ShiftAmt -= i; + + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 4; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) + return -1; + + return ShiftAmt; +} + +//===----------------------------------------------------------------------===// +// Addressing Mode Selection +//===----------------------------------------------------------------------===// + +/// isIntS16Immediate - This method tests to see if the node is either a 32-bit +/// or 64-bit immediate, and if the value can be accurately represented as a +/// sign extension from a 16-bit value. If so, this returns true and the +/// immediate. +static bool isIntS16Immediate(SDNode *N, short &Imm) { + if (!isa<ConstantSDNode>(N)) + return false; + + Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); + if (N->getValueType(0) == MVT::i32) + return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); + else + return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); +} +static bool isIntS16Immediate(SDValue Op, short &Imm) { + return isIntS16Immediate(Op.getNode(), Imm); +} + +/// SelectAddressRegReg - Given the specified address, check to see if it +/// can be represented as an indexed [r+r] operation. Returns false if it +/// can be more efficiently represented with [r+imm].
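+/// For example, (add %r, 42) is rejected here because 42 fits in a signed +/// 16-bit immediate, making [r+imm] the better form, while (add %r1, %r2) is +/// accepted as [r+r].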
+bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, + SDValue &Index, + SelectionDAG &DAG) const { + short imm = 0; + if (N.getOpcode() == ISD::ADD) { + if (isIntS16Immediate(N.getOperand(1), imm)) + return false; // r+i + if (N.getOperand(1).getOpcode() == PPCISD::Lo) + return false; // r+i + + Base = N.getOperand(0); + Index = N.getOperand(1); + return true; + } else if (N.getOpcode() == ISD::OR) { + if (isIntS16Immediate(N.getOperand(1), imm)) + return false; // r+i; fold into [r+imm] if we can. + + // If this is an or of disjoint bitfields, we can codegen this as an add + // (for better address arithmetic) if the LHS and RHS of the OR are provably + // disjoint. + APInt LHSKnownZero, LHSKnownOne; + APInt RHSKnownZero, RHSKnownOne; + DAG.computeKnownBits(N.getOperand(0), + LHSKnownZero, LHSKnownOne); + + if (LHSKnownZero.getBoolValue()) { + DAG.computeKnownBits(N.getOperand(1), + RHSKnownZero, RHSKnownOne); + // If all of the bits are known zero on the LHS or RHS, the add won't + // carry. + if (~(LHSKnownZero | RHSKnownZero) == 0) { + Base = N.getOperand(0); + Index = N.getOperand(1); + return true; + } + } + } + + return false; +} + +// If we happen to be doing an i64 load or store into a stack slot that has +// less than a 4-byte alignment, then the frame-index elimination may need to +// use an indexed load or store instruction (because the offset may not be a +// multiple of 4). The extra register needed to hold the offset comes from the +// register scavenger, and it is possible that the scavenger will need to use +// an emergency spill slot. As a result, we need to make sure that a spill slot +// is allocated when doing an i64 load/store into a less-than-4-byte-aligned +// stack slot. +static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { + // FIXME: This does not handle the LWA case. + if (VT != MVT::i64) + return; + + // NOTE: We'll exclude negative FIs here, which come from argument + // lowering, because there are no known test cases triggering this problem + // using packed structures (or similar). We can remove this exclusion if + // we find such a test case. The reason why this is so test-case driven is + // because this entire 'fixup' is only to prevent crashes (from the + // register scavenger) on not-really-valid inputs. For example, if we have: + // %a = alloca i1 + // %b = bitcast i1* %a to i64* + // store i64 0, i64* %b + // then the store should really be marked as 'align 1', but is not. If it + // were marked as 'align 1' then the indexed form would have been + // instruction-selected initially, and the problem this 'fixup' is preventing + // won't happen regardless. + if (FrameIdx < 0) + return; + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + if (Align >= 4) + return; + + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setHasNonRISpills(); +} + +/// Returns true if the address N can be represented by a base register plus +/// a signed 16-bit displacement [r+imm], and if it is not better +/// represented as reg+reg. If Aligned is true, only accept displacements +/// suitable for STD and friends, i.e. multiples of 4. +bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, + SDValue &Base, + SelectionDAG &DAG, + bool Aligned) const { + // FIXME: dl should come from parent load or store, not from address + SDLoc dl(N); + // If this can be more profitably realized as r+r, fail.
+ if (SelectAddressRegReg(N, Disp, Base, DAG)) + return false; + + if (N.getOpcode() == ISD::ADD) { + short imm = 0; + if (isIntS16Immediate(N.getOperand(1), imm) && + (!Aligned || (imm & 3) == 0)) { + Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } + return true; // [r+i] + } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { + // Match LOAD (ADD (X, Lo(G))). + assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() + && "Cannot handle constant offsets yet!"); + Disp = N.getOperand(1).getOperand(0); // The global address. + assert(Disp.getOpcode() == ISD::TargetGlobalAddress || + Disp.getOpcode() == ISD::TargetGlobalTLSAddress || + Disp.getOpcode() == ISD::TargetConstantPool || + Disp.getOpcode() == ISD::TargetJumpTable); + Base = N.getOperand(0); + return true; // [&g+r] + } + } else if (N.getOpcode() == ISD::OR) { + short imm = 0; + if (isIntS16Immediate(N.getOperand(1), imm) && + (!Aligned || (imm & 3) == 0)) { + // If this is an or of disjoint bitfields, we can codegen this as an add + // (for better address arithmetic) if the LHS and RHS of the OR are + // provably disjoint. + APInt LHSKnownZero, LHSKnownOne; + DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); + + if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { + // If all of the bits are known zero on the LHS or RHS, the add won't + // carry. + if (FrameIndexSDNode *FI = + dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } + Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); + return true; + } + } + } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { + // Loading from a constant address. + + // If this address fits entirely in a 16-bit sext immediate field, codegen + // this as "d, 0". + short Imm; + if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + CN->getValueType(0)); + return true; + } + + // Handle 32-bit sext immediates with LIS + addr mode. + if ((CN->getValueType(0) == MVT::i32 || + (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && + (!Aligned || (CN->getZExtValue() & 3) == 0)) { + int Addr = (int)CN->getZExtValue(); + + // Otherwise, break this down into an LIS + disp. + Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32); + + Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, + MVT::i32); + unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8; + Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0); + return true; + } + } + + Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else + Base = N; + return true; // [r+0] +} + +/// SelectAddressRegRegOnly - Given the specified address, force it to be +/// represented as an indexed [r+r] operation.
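+/// (Used, for example, when an instruction only has an indexed form, as for +/// the QPX pre-increment loads and stores in getPreIndexedAddressParts +/// below.)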
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, + SDValue &Index, + SelectionDAG &DAG) const { + // Check to see if we can easily represent this as an [r+r] address. This + // will fail if it thinks that the address is more profitably represented as + // reg+imm, e.g. where imm = 0. + if (SelectAddressRegReg(N, Base, Index, DAG)) + return true; + + // If the operand is an addition, always emit this as [r+r], since this is + // better (for code size, and execution, as the memop does the add for free) + // than emitting an explicit add. + if (N.getOpcode() == ISD::ADD) { + Base = N.getOperand(0); + Index = N.getOperand(1); + return true; + } + + // Otherwise, do it the hard way, using R0 as the base register. + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + N.getValueType()); + Index = N; + return true; +} + +/// getPreIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if the node's address +/// can be legally represented as pre-indexed load / store address. +bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + if (DisablePPCPreinc) return false; + + bool isLoad = true; + SDValue Ptr; + EVT VT; + unsigned Alignment; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + Ptr = LD->getBasePtr(); + VT = LD->getMemoryVT(); + Alignment = LD->getAlignment(); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + Ptr = ST->getBasePtr(); + VT = ST->getMemoryVT(); + Alignment = ST->getAlignment(); + isLoad = false; + } else + return false; + + // PowerPC doesn't have preinc load/store instructions for vectors (except + // for QPX, which does have preinc r+r forms). + if (VT.isVector()) { + if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { + return false; + } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { + AM = ISD::PRE_INC; + return true; + } + } + + if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { + + // Common code will reject creating a pre-inc form if the base pointer + // is a frame index, or if N is a store and the base pointer is either + // the same as or a predecessor of the value being stored. Check for + // those situations here, and try with swapped Base/Offset instead. + bool Swap = false; + + if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base)) + Swap = true; + else if (!isLoad) { + SDValue Val = cast<StoreSDNode>(N)->getValue(); + if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) + Swap = true; + } + + if (Swap) + std::swap(Base, Offset); + + AM = ISD::PRE_INC; + return true; + } + + // LDU/STU can only handle immediates that are a multiple of 4. + if (VT != MVT::i64) { + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + return false; + } else { + // LDU/STU need an address with at least 4-byte alignment. + if (Alignment < 4) + return false; + + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + return false; + } + + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of + // sext i32 to i64 when addr mode is r+i. 
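+ // (The reg+reg pre-increment form was already accepted above, so only the + // immediate-offset form needs to be rejected here.)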
+ if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && + LD->getExtensionType() == ISD::SEXTLOAD && + isa<ConstantSDNode>(Offset)) + return false; + } + + AM = ISD::PRE_INC; + return true; +} + +//===----------------------------------------------------------------------===// +// LowerOperation implementation +//===----------------------------------------------------------------------===// + +/// GetLabelAccessInfo - Return true if we should reference labels using a +/// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags. +static bool GetLabelAccessInfo(const TargetMachine &TM, + const PPCSubtarget &Subtarget, + unsigned &HiOpFlags, unsigned &LoOpFlags, + const GlobalValue *GV = nullptr) { + HiOpFlags = PPCII::MO_HA; + LoOpFlags = PPCII::MO_LO; + + // Don't use the pic base if not in PIC relocation model. + bool isPIC = TM.getRelocationModel() == Reloc::PIC_; + + if (isPIC) { + HiOpFlags |= PPCII::MO_PIC_FLAG; + LoOpFlags |= PPCII::MO_PIC_FLAG; + } + + // If this is a reference to a global value that requires a non-lazy-ptr, make + // sure that instruction lowering adds it. + if (GV && Subtarget.hasLazyResolverStub(GV)) { + HiOpFlags |= PPCII::MO_NLP_FLAG; + LoOpFlags |= PPCII::MO_NLP_FLAG; + + if (GV->hasHiddenVisibility()) { + HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; + LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; + } + } + + return isPIC; +} + +static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, + SelectionDAG &DAG) { + SDLoc DL(HiPart); + EVT PtrVT = HiPart.getValueType(); + SDValue Zero = DAG.getConstant(0, DL, PtrVT); + + SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); + SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); + + // With PIC, the first instruction is actually "GR+hi(&G)". + if (isPIC) + Hi = DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi); + + // Generate non-pic code that has direct accesses to the constant pool. + // The address of the global is just (hi(&g)+lo(&g)). + return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); +} + +static void setUsesTOCBasePtr(MachineFunction &MF) { + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setUsesTOCBasePtr(); +} + +static void setUsesTOCBasePtr(SelectionDAG &DAG) { + setUsesTOCBasePtr(DAG.getMachineFunction()); +} + +static SDValue getTOCEntry(SelectionDAG &DAG, SDLoc dl, bool Is64Bit, + SDValue GA) { + EVT VT = Is64Bit ? MVT::i64 : MVT::i32; + SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : + DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); + + SDValue Ops[] = { GA, Reg }; + return DAG.getMemIntrinsicNode( + PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, + false, 0); +} + +SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = Op.getValueType(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + const Constant *C = CP->getConstVal(); + + // 64-bit SVR4 ABI code is always position-independent. + // The actual address of the GlobalValue is stored in the TOC. 
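+ // (getTOCEntry above materializes such addresses as a load from the TOC, + // relative to the TOC pointer in X2.)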
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); + return getTOCEntry(DAG, SDLoc(CP), true, GA); + } + + unsigned MOHiFlag, MOLoFlag; + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), + PPCII::MO_PIC_FLAG); + return getTOCEntry(DAG, SDLoc(CP), false, GA); + } + + SDValue CPIHi = + DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); + SDValue CPILo = + DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); + return LowerLabelRef(CPIHi, CPILo, isPIC, DAG); +} + +SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + EVT PtrVT = Op.getValueType(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + + // 64-bit SVR4 ABI code is always position-independent. + // The actual address of the GlobalValue is stored in the TOC. + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); + return getTOCEntry(DAG, SDLoc(JT), true, GA); + } + + unsigned MOHiFlag, MOLoFlag; + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + PPCII::MO_PIC_FLAG); + return getTOCEntry(DAG, SDLoc(GA), false, GA); + } + + SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); + SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); + return LowerLabelRef(JTIHi, JTILo, isPIC, DAG); +} + +SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = Op.getValueType(); + BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op); + const BlockAddress *BA = BASDN->getBlockAddress(); + + // 64-bit SVR4 ABI code is always position-independent. + // The actual BlockAddress is stored in the TOC. + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); + SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); + return getTOCEntry(DAG, SDLoc(BASDN), true, GA); + } + + unsigned MOHiFlag, MOLoFlag; + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag); + SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); + SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); + return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG); +} + +SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + + // FIXME: TLS addresses currently use medium model code sequences, + // which is the most useful form. Eventually support for small and + // large models could be added if users need it, at the cost of + // additional complexity. 
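+ // The medium-model sequences below build addresses from high-adjusted and + // low parts relative to the GOT/TOC pointer (see the ADDIS_* and ADDI_* TLS + // nodes).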
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + + SDLoc dl(GA); + const GlobalValue *GV = GA->getGlobal(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + bool is64bit = Subtarget.isPPC64(); + const Module *M = DAG.getMachineFunction().getFunction()->getParent(); + PICLevel::Level picLevel = M->getPICLevel(); + + TLSModel::Model Model = getTargetMachine().getTLSModel(GV); + + if (Model == TLSModel::LocalExec) { + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL_HA); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL_LO); + SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, + is64bit ? MVT::i64 : MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); + return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); + } + + if (Model == TLSModel::InitialExec) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TLS); + SDValue GOTPtr; + if (is64bit) { + setUsesTOCBasePtr(DAG); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, + PtrVT, GOTReg, TGA); + } else + GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); + SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, + PtrVT, TGA, GOTPtr); + return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); + } + + if (Model == TLSModel::GeneralDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTPtr; + if (is64bit) { + setUsesTOCBasePtr(DAG); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + if (picLevel == PICLevel::Small) + GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); + else + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } + return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, + GOTPtr, TGA, TGA); + } + + if (Model == TLSModel::LocalDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTPtr; + if (is64bit) { + setUsesTOCBasePtr(DAG); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + if (picLevel == PICLevel::Small) + GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); + else + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } + SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, + PtrVT, GOTPtr, TGA, TGA); + SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, + PtrVT, TLSAddr, TGA); + return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); + } + + llvm_unreachable("Unknown TLS model!"); +} + +SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = Op.getValueType(); + GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); + SDLoc DL(GSDN); + const GlobalValue *GV = GSDN->getGlobal(); + + // 64-bit SVR4 ABI code is always position-independent. + // The actual address of the GlobalValue is stored in the TOC. 
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + setUsesTOCBasePtr(DAG); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); + return getTOCEntry(DAG, DL, true, GA); + } + + unsigned MOHiFlag, MOLoFlag; + bool isPIC = + GetLabelAccessInfo(DAG.getTarget(), Subtarget, MOHiFlag, MOLoFlag, GV); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, + GSDN->getOffset(), + PPCII::MO_PIC_FLAG); + return getTOCEntry(DAG, DL, false, GA); + } + + SDValue GAHi = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); + SDValue GALo = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag); + + SDValue Ptr = LowerLabelRef(GAHi, GALo, isPIC, DAG); + + // If the global reference is actually to a non-lazy-pointer, we have to do an + // extra load to get the address of the global. + if (MOHiFlag & PPCII::MO_NLP_FLAG) + Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo(), + false, false, false, 0); + return Ptr; +} + +SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + SDLoc dl(Op); + + if (Op.getValueType() == MVT::v2i64) { + // When the operands themselves are v2i64 values, we need to do something + // special because VSX has no underlying comparison operations for these. + if (Op.getOperand(0).getValueType() == MVT::v2i64) { + // Equality can be handled by casting to the legal type for Altivec + // comparisons; everything else needs to be expanded. + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getSetCC(dl, MVT::v4i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), + CC)); + } + + return SDValue(); + } + + // We handle most of these in the usual way. + return Op; + } + + // If we're comparing for equality to zero, expose the fact that this is + // implemented as a ctlz/srl pair on ppc, so that the dag combiner can + // fold the new nodes. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (C->isNullValue() && CC == ISD::SETEQ) { + EVT VT = Op.getOperand(0).getValueType(); + SDValue Zext = Op.getOperand(0); + if (VT.bitsLT(MVT::i32)) { + VT = MVT::i32; + Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0)); + } + unsigned Log2b = Log2_32(VT.getSizeInBits()); + SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext); + SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz, + DAG.getConstant(Log2b, dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc); + } + // Leave comparisons against 0 and -1 alone for now, since they're usually + // optimized. FIXME: revisit this when we can custom lower all setcc + // optimizations. + if (C->isAllOnesValue() || C->isNullValue()) + return SDValue(); + } + + // If we have an integer seteq/setne, turn it into a compare against zero + // by xor'ing the rhs with the lhs, which is faster than setting a + // condition register, reading it back out, and masking the correct bit. The + // normal approach here uses sub to do this instead of xor. Using xor exposes + // the result to other bit-twiddling opportunities.
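+ // For example, (seteq %a, %b) becomes (seteq (xor %a, %b), 0), and the + // comparison against zero can then reuse the ctlz/srl expansion above.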
+ EVT LHSVT = Op.getOperand(0).getValueType(); + if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + EVT VT = Op.getValueType(); + SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC); + } + return SDValue(); +} + +SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const { + SDNode *Node = Op.getNode(); + EVT VT = Node->getValueType(0); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + SDValue InChain = Node->getOperand(0); + SDValue VAListPtr = Node->getOperand(1); + const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); + SDLoc dl(Node); + + assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); + + // gpr_index + SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, + VAListPtr, MachinePointerInfo(SV), MVT::i8, + false, false, false, 0); + InChain = GprIndex.getValue(1); + + if (VT == MVT::i64) { + // Check if GprIndex is odd (i64 args must start at an even GPR index) + SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, + DAG.getConstant(1, dl, MVT::i32)); + SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd, + DAG.getConstant(0, dl, MVT::i32), ISD::SETNE); + SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, + DAG.getConstant(1, dl, MVT::i32)); + // Align GprIndex to be even if it isn't + GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne, + GprIndex); + } + + // fpr index is 1 byte after gpr + SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, + DAG.getConstant(1, dl, MVT::i32)); + + // fpr + SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, + FprPtr, MachinePointerInfo(SV), MVT::i8, + false, false, false, 0); + InChain = FprIndex.getValue(1); + + SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, + DAG.getConstant(8, dl, MVT::i32)); + + SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, + DAG.getConstant(4, dl, MVT::i32)); + + // areas + SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, + MachinePointerInfo(), false, false, + false, 0); + InChain = OverflowArea.getValue(1); + + SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, + MachinePointerInfo(), false, false, + false, 0); + InChain = RegSaveArea.getValue(1); + + // select overflow_area if index >= 8 + SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, + DAG.getConstant(8, dl, MVT::i32), ISD::SETLT); + + // adjustment constant gpr_index * 4/8 + SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32, + VT.isInteger() ? GprIndex : FprIndex, + DAG.getConstant(VT.isInteger() ? 4 : 8, dl, + MVT::i32)); + + // OurReg = RegSaveArea + RegConstant + SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, + RegConstant); + + // Floating types are 32 bytes into RegSaveArea + if (VT.isFloatingPoint()) + OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, + DAG.getConstant(32, dl, MVT::i32)); + + // increase {f,g}pr_index by 1 (or 2 if VT is i64) + SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, + VT.isInteger() ? GprIndex : FprIndex, + DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl, + MVT::i32)); + + InChain = DAG.getTruncStore(InChain, dl, IndexPlus1, + VT.isInteger() ?
VAListPtr : FprPtr, + MachinePointerInfo(SV), + MVT::i8, false, false, 0); + + // determine if we should load from reg_save_area or overflow_area + SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea); + + // increase overflow_area by 4/8 if gpr/fpr > 8 + SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, + DAG.getConstant(VT.isInteger() ? 4 : 8, + dl, MVT::i32)); + + OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea, + OverflowAreaPlusN); + + InChain = DAG.getTruncStore(InChain, dl, OverflowArea, + OverflowAreaPtr, + MachinePointerInfo(), + MVT::i32, false, false, 0); + + return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), + false, false, false, 0); +} + +SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const { + assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only"); + + // We have to copy the entire va_list struct: + // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte + return DAG.getMemcpy(Op.getOperand(0), Op, + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true, + false, MachinePointerInfo(), MachinePointerInfo()); +} + +SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + return Op.getOperand(0); +} + +SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Trmp = Op.getOperand(1); // trampoline + SDValue FPtr = Op.getOperand(2); // nested function + SDValue Nest = Op.getOperand(3); // 'nest' parameter value + SDLoc dl(Op); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + bool isPPC64 = (PtrVT == MVT::i64); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + + Entry.Ty = IntPtrTy; + Entry.Node = Trmp; Args.push_back(Entry); + + // TrampSize == (isPPC64 ? 48 : 40); + Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl, + isPPC64 ? MVT::i64 : MVT::i32); + Args.push_back(Entry); + + Entry.Node = FPtr; Args.push_back(Entry); + Entry.Node = Nest; Args.push_back(Entry); + + // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__trampoline_setup", PtrVT), + std::move(Args), 0); + + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + return CallResult.second; +} + +SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const { + MachineFunction &MF = DAG.getMachineFunction(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + + SDLoc dl(Op); + + if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), + MachinePointerInfo(SV), + false, false, 0); + } + + // For the 32-bit SVR4 ABI we follow the layout of the va_list struct. + // We suppose the given va_list is already allocated. 
+ // + // typedef struct { + // char gpr; /* index into the array of 8 GPRs + // * stored in the register save area + // * gpr=0 corresponds to r3, + // * gpr=1 to r4, etc. + // */ + // char fpr; /* index into the array of 8 FPRs + // * stored in the register save area + // * fpr=0 corresponds to f1, + // * fpr=1 to f2, etc. + // */ + // char *overflow_arg_area; + // /* location on stack that holds + // * the next overflow argument + // */ + // char *reg_save_area; + // /* where r3:r10 and f1:f8 (if saved) + // * are stored + // */ + // } va_list[1]; + + SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); + SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + + SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), + PtrVT); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), + PtrVT); + + uint64_t FrameOffset = PtrVT.getSizeInBits()/8; + SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT); + + uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1; + SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT); + + uint64_t FPROffset = 1; + SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT); + + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + + // Store first byte : number of int regs + SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, + Op.getOperand(1), + MachinePointerInfo(SV), + MVT::i8, false, false, 0); + uint64_t nextOffset = FPROffset; + SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1), + ConstFPROffset); + + // Store second byte : number of float regs + SDValue secondStore = + DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, + MachinePointerInfo(SV, nextOffset), MVT::i8, + false, false, 0); + nextOffset += StackOffset; + nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset); + + // Store second word : arguments given on stack + SDValue thirdStore = + DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, + MachinePointerInfo(SV, nextOffset), + false, false, 0); + nextOffset += FrameOffset; + nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset); + + // Store third word : arguments given in registers + return DAG.getStore(thirdStore, dl, FR, nextPtr, + MachinePointerInfo(SV, nextOffset), + false, false, 0); + +} + +#include "PPCGenCallingConv.inc" + +// Function whose sole purpose is to kill compiler warnings +// stemming from unused functions included from PPCGenCallingConv.inc. +CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { + return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; +} + +bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + return true; +} + +bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg ArgRegs[] = { + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10, + }; + const unsigned NumArgRegs = array_lengthof(ArgRegs); + + unsigned RegNum = State.getFirstUnallocated(ArgRegs); + + // Skip one register if the first unallocated register has an even register + // number and there are still argument registers available which have not been + // allocated yet. 
RegNum is actually an index into ArgRegs, which means we + // need to skip a register if RegNum is odd. + if (RegNum != NumArgRegs && RegNum % 2 == 1) { + State.AllocateReg(ArgRegs[RegNum]); + } + + // Always return false here, as this function only makes sure that the first + // unallocated register has an odd register number and does not actually + // allocate a register for the current argument. + return false; +} + +bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg ArgRegs[] = { + PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8 + }; + + const unsigned NumArgRegs = array_lengthof(ArgRegs); + + unsigned RegNum = State.getFirstUnallocated(ArgRegs); + + // If there is only one Floating-point register left we need to put both f64 + // values of a split ppc_fp128 value on the stack. + if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { + State.AllocateReg(ArgRegs[RegNum]); + } + + // Always return false here, as this function only makes sure that the two f64 + // values a ppc_fp128 value is split into are both passed in registers or both + // passed on the stack and does not actually allocate a register for the + // current argument. + return false; +} + +/// FPR - The set of FP registers that should be allocated for arguments, +/// on Darwin. +static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, + PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, + PPC::F11, PPC::F12, PPC::F13}; + +/// QFPR - The set of QPX registers that should be allocated for arguments. +static const MCPhysReg QFPR[] = { + PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, + PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; + +/// CalculateStackSlotSize - Calculates the size reserved for this argument on +/// the stack. +static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, + unsigned PtrByteSize) { + unsigned ArgSize = ArgVT.getStoreSize(); + if (Flags.isByVal()) + ArgSize = Flags.getByValSize(); + + // Round up to multiples of the pointer size, except for array members, + // which are always packed. + if (!Flags.isInConsecutiveRegs()) + ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + + return ArgSize; +} + +/// CalculateStackSlotAlignment - Calculates the alignment of this argument +/// on the stack. +static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize) { + unsigned Align = PtrByteSize; + + // Altivec parameters are padded to a 16 byte boundary. + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || + ArgVT == MVT::v1i128) + Align = 16; + // QPX vector types stored in double-precision are padded to a 32 byte + // boundary. + else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) + Align = 32; + + // ByVal parameters are aligned as requested. + if (Flags.isByVal()) { + unsigned BVAlign = Flags.getByValAlign(); + if (BVAlign > PtrByteSize) { + if (BVAlign % PtrByteSize != 0) + llvm_unreachable( + "ByVal alignment is not a multiple of the pointer size"); + + Align = BVAlign; + } + } + + // Array members are always packed to their original alignment. 
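+  // For instance (illustrative): an f32 member of a homogeneous float
+  // aggregate gets a 4-byte slot here rather than a full doubleword, while a
+  // ppc_fp128 member stays at the 8-byte alignment of its f64 components.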
+ if (Flags.isInConsecutiveRegs()) { + // If the array member was split into multiple registers, the first + // needs to be aligned to the size of the full type. (Except for + // ppcf128, which is only aligned as its f64 components.) + if (Flags.isSplit() && OrigVT != MVT::ppcf128) + Align = OrigVT.getStoreSize(); + else + Align = ArgVT.getStoreSize(); + } + + return Align; +} + +/// CalculateStackSlotUsed - Return whether this argument will use its +/// stack slot (instead of being passed in registers). ArgOffset, +/// AvailableFPRs, and AvailableVRs must hold the current argument +/// position, and will be updated to account for this argument. +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, + unsigned LinkageSize, + unsigned ParamAreaSize, + unsigned &ArgOffset, + unsigned &AvailableFPRs, + unsigned &AvailableVRs, bool HasQPX) { + bool UseMemory = false; + + // Respect alignment of argument on the stack. + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + // If there's no space left in the argument save area, we must + // use memory (this check also catches zero-sized arguments). + if (ArgOffset >= LinkageSize + ParamAreaSize) + UseMemory = true; + + // Allocate argument on the stack. + ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // If we overran the argument save area, we must use memory + // (this check catches arguments passed partially in memory) + if (ArgOffset > LinkageSize + ParamAreaSize) + UseMemory = true; + + // However, if the argument is actually passed in an FPR or a VR, + // we don't use memory after all. + if (!Flags.isByVal()) { + if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || + // QPX registers overlap with the scalar FP registers. + (HasQPX && (ArgVT == MVT::v4f32 || + ArgVT == MVT::v4f64 || + ArgVT == MVT::v4i1))) + if (AvailableFPRs > 0) { + --AvailableFPRs; + return false; + } + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || + ArgVT == MVT::v1i128) + if (AvailableVRs > 0) { + --AvailableVRs; + return false; + } + } + + return UseMemory; +} + +/// EnsureStackAlignment - Round stack frame size up from NumBytes to +/// ensure minimum alignment required for target. 
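+/// For example, with a 16-byte target stack alignment, NumBytes = 100 is
+/// rounded up to 112: (100 + 15) & ~15.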
+static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering, + unsigned NumBytes) { + unsigned TargetAlign = Lowering->getStackAlignment(); + unsigned AlignMask = TargetAlign - 1; + NumBytes = (NumBytes + AlignMask) & ~AlignMask; + return NumBytes; +} + +SDValue +PPCTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) + const { + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) + return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, + dl, DAG, InVals); + else + return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, + dl, DAG, InVals); + } else { + return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, + dl, DAG, InVals); + } +} + +SDValue +PPCTargetLowering::LowerFormalArguments_32SVR4( + SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + // 32-bit SVR4 ABI Stack Frame Layout: + // +-----------------------------------+ + // +--> | Back chain | + // | +-----------------------------------+ + // | | Floating-point register save area | + // | +-----------------------------------+ + // | | General register save area | + // | +-----------------------------------+ + // | | CR save word | + // | +-----------------------------------+ + // | | VRSAVE save word | + // | +-----------------------------------+ + // | | Alignment padding | + // | +-----------------------------------+ + // | | Vector register save area | + // | +-----------------------------------+ + // | | Local variable space | + // | +-----------------------------------+ + // | | Parameter list area | + // | +-----------------------------------+ + // | | LR save word | + // | +-----------------------------------+ + // SP--> +--- | Back chain | + // +-----------------------------------+ + // + // Specifications: + // System V Application Binary Interface PowerPC Processor Supplement + // AltiVec Technology Programming Interface Manual + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + // Potential tail calls could cause overwriting of argument stack slots. + bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && + (CallConv == CallingConv::Fast)); + unsigned PtrByteSize = 4; + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + // Reserve space for the linkage area on the stack. + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + CCInfo.AllocateStack(LinkageSize, PtrByteSize); + + CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + // Arguments stored in registers. 
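+    // (E.g. the first i32 argument is normally assigned R3 by CC_PPC32_SVR4
+    // and shows up here as a register location.)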
+ if (VA.isRegLoc()) { + const TargetRegisterClass *RC; + EVT ValVT = VA.getValVT(); + + switch (ValVT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("ValVT not supported by formal arguments Lowering"); + case MVT::i1: + case MVT::i32: + RC = &PPC::GPRCRegClass; + break; + case MVT::f32: + if (Subtarget.hasP8Vector()) + RC = &PPC::VSSRCRegClass; + else + RC = &PPC::F4RCRegClass; + break; + case MVT::f64: + if (Subtarget.hasVSX()) + RC = &PPC::VSFRCRegClass; + else + RC = &PPC::F8RCRegClass; + break; + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + RC = &PPC::VRRCRegClass; + break; + case MVT::v4f32: + RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; + break; + case MVT::v2f64: + case MVT::v2i64: + RC = &PPC::VSHRCRegClass; + break; + case MVT::v4f64: + RC = &PPC::QFRCRegClass; + break; + case MVT::v4i1: + RC = &PPC::QBRCRegClass; + break; + } + + // Transform the arguments stored in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, + ValVT == MVT::i1 ? MVT::i32 : ValVT); + + if (ValVT == MVT::i1) + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + + InVals.push_back(ArgValue); + } else { + // Argument stored in memory. + assert(VA.isMemLoc()); + + unsigned ArgSize = VA.getLocVT().getStoreSize(); + int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), + isImmutable); + + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo(), + false, false, false, 0)); + } + } + + // Assign locations to all of the incoming aggregate by value arguments. + // Aggregates passed by value are stored in the local variable space of the + // caller's stack frame, right above the parameter list area. + SmallVector<CCValAssign, 16> ByValArgLocs; + CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), + ByValArgLocs, *DAG.getContext()); + + // Reserve stack space for the allocations in CCInfo. + CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); + + CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); + + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); + MinReservedArea = std::max(MinReservedArea, LinkageSize); + + // Set the size that is at least reserved in caller of this function. Tail + // call optimized function's reserved stack space needs to be aligned so that + // taking the difference between two stack areas will result in an aligned + // stack. + MinReservedArea = + EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); + + SmallVector<SDValue, 8> MemOps; + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. 
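+  // With all eight GPRs and all eight FPRs live, the register save area
+  // allocated below is 8*4 + 8*8 = 96 bytes.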
+  if (isVarArg) {
+    static const MCPhysReg GPArgRegs[] = {
+      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+    };
+    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
+
+    static const MCPhysReg FPArgRegs[] = {
+      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+      PPC::F8
+    };
+    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+
+    if (Subtarget.useSoftFloat())
+      NumFPArgRegs = 0;
+
+    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
+    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));
+
+    // Make room for NumGPArgRegs and NumFPArgRegs.
+    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
+                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
+
+    FuncInfo->setVarArgsStackOffset(
+      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+                             CCInfo.getNextStackOffset(), true));
+
+    FuncInfo->setVarArgsFrameIndex(MFI->CreateStackObject(Depth, 8, false));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // The fixed integer arguments of a variadic function are stored to the
+    // VarArgsFrameIndex on the stack so that they may be loaded by
+    // dereferencing the result of va_next.
+    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+
+    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
+    // is set.
+    // The double arguments are stored to the VarArgsFrameIndex
+    // on the stack.
+    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by eight for the next argument to store
+      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
+                                       PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+// value to MVT::i64 and then truncate to the correct register size.
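+// For instance, a sign-extended i32 received in X3 becomes
+// (truncate (AssertSext x3, i32)); the Assert node lets later DAG combines
+// rely on the extension having been performed by the caller.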
+SDValue +PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, + SelectionDAG &DAG, SDValue ArgVal, + SDLoc dl) const { + if (Flags.isSExt()) + ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, + DAG.getValueType(ObjectVT)); + else if (Flags.isZExt()) + ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, + DAG.getValueType(ObjectVT)); + + return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); +} + +SDValue +PPCTargetLowering::LowerFormalArguments_64SVR4( + SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + // TODO: add description of PPC stack frame format, or at least some docs. + // + bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isLittleEndian = Subtarget.isLittleEndian(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + + assert(!(CallConv == CallingConv::Fast && isVarArg) && + "fastcc not supported on varargs functions"); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + // Potential tail calls could cause overwriting of argument stack slots. + bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && + (CallConv == CallingConv::Fast)); + unsigned PtrByteSize = 8; + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + + static const MCPhysReg GPR[] = { + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10, + }; + static const MCPhysReg VR[] = { + PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, + PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 + }; + static const MCPhysReg VSRH[] = { + PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, + PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 + }; + + const unsigned Num_GPR_Regs = array_lengthof(GPR); + const unsigned Num_FPR_Regs = 13; + const unsigned Num_VR_Regs = array_lengthof(VR); + const unsigned Num_QFPR_Regs = Num_FPR_Regs; + + // Do a first pass over the arguments to determine whether the ABI + // guarantees that our caller has allocated the parameter save area + // on its stack frame. In the ELFv1 ABI, this is always the case; + // in the ELFv2 ABI, it is true if this is a vararg function or if + // any parameter is located in a stack slot. + + bool HasParameterArea = !isELFv2ABI || isVarArg; + unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; + unsigned NumBytes = LinkageSize; + unsigned AvailableFPRs = Num_FPR_Regs; + unsigned AvailableVRs = Num_VR_Regs; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (Ins[i].Flags.isNest()) + continue; + + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs, + Subtarget.hasQPX())) + HasParameterArea = true; + } + + // Add DAG nodes to load the arguments or copy them out of registers. On + // entry to a function on PPC, the arguments start after the linkage area, + // although the first ones are often in registers. 
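+  // (Illustrative: under ELFv2, a function whose only parameter is an i64
+  // passed in a GPR leaves HasParameterArea false above, so no parameter
+  // save area is assumed on the caller's frame.)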
+ + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + unsigned &QFPR_idx = FPR_idx; + SmallVector<SDValue, 8> MemOps; + Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { + SDValue ArgVal; + bool needsLoad = false; + EVT ObjectVT = Ins[ArgNo].VT; + EVT OrigVT = Ins[ArgNo].ArgVT; + unsigned ObjSize = ObjectVT.getStoreSize(); + unsigned ArgSize = ObjSize; + ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + if (Ins[ArgNo].isOrigArg()) { + std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[ArgNo].getOrigArgIndex(); + } + // We re-align the argument offset for each argument, except when using the + // fast calling convention, when we need to make sure we do that only when + // we'll actually use a stack slot. + unsigned CurArgOffset, Align; + auto ComputeArgOffset = [&]() { + /* Respect alignment of argument on the stack. */ + Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + CurArgOffset = ArgOffset; + }; + + if (CallConv != CallingConv::Fast) { + ComputeArgOffset(); + + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, Num_GPR_Regs); + } + + // FIXME the codegen can be much improved in some cases. + // We do not have to keep everything in memory. + if (Flags.isByVal()) { + assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); + + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + + // ObjSize is the true size, ArgSize rounded up to multiple of registers. + ObjSize = Flags.getByValSize(); + ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // Empty aggregate parameters do not take up registers. Examples: + // struct { } a; + // union { } b; + // int c[0]; + // etc. However, we have to provide a place-holder in InVals, so + // pretend we have an 8-byte item at the current address for that + // purpose. + if (!ObjSize) { + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(FIN); + continue; + } + + // Create a stack object covering all stack doublewords occupied + // by the argument. If the argument is (fully or partially) on + // the stack, or if the argument is fully in registers but the + // caller has allocated the parameter save anyway, we can refer + // directly to the caller's stack frame. Otherwise, create a + // local copy in our own frame. + int FI; + if (HasParameterArea || + ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) + FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false, true); + else + FI = MFI->CreateStackObject(ArgSize, Align, false); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + + // Handle aggregates smaller than 8 bytes. + if (ObjSize < PtrByteSize) { + // The value of the object is its address, which differs from the + // address of the enclosing doubleword on big-endian systems. 
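+      // E.g. a 2-byte aggregate is right-justified in its doubleword on
+      // big-endian targets, so its address is the slot address plus
+      // 8 - 2 = 6; on little-endian targets it is the slot address itself.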
+ SDValue Arg = FIN; + if (!isLittleEndian) { + SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT); + Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); + } + InVals.push_back(Arg); + + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store; + + if (ObjSize==1 || ObjSize==2 || ObjSize==4) { + EVT ObjType = (ObjSize == 1 ? MVT::i8 : + (ObjSize == 2 ? MVT::i16 : MVT::i32)); + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, + MachinePointerInfo(&*FuncArg), ObjType, + false, false, 0); + } else { + // For sizes that don't fit a truncating store (3, 5, 6, 7), + // store the whole register as-is to the parameter save area + // slot. + Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg), false, false, 0); + } + + MemOps.push_back(Store); + } + // Whether we copied from a register or not, advance the offset + // into the parameter save area by a full doubleword. + ArgOffset += PtrByteSize; + continue; + } + + // The value of the object is its address, which is the address of + // its first stack doubleword. + InVals.push_back(FIN); + + // Store whatever pieces of the object are in registers to memory. + for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { + if (GPR_idx == Num_GPR_Regs) + break; + + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Addr = FIN; + if (j) { + SDValue Off = DAG.getConstant(j, dl, PtrVT); + Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); + } + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(&*FuncArg, j), false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; + } + ArgOffset += ArgSize; + continue; + } + + switch (ObjectVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unhandled argument type!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + if (Flags.isNest()) { + // The 'nest' parameter, if any, is passed in R11. + unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); + + break; + } + + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) + // PPC64 passes i8, i16, and i32 values in i64 registers. Promote + // value to MVT::i64 and then truncate to the correct register size. + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); + } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + + needsLoad = true; + ArgSize = PtrByteSize; + } + if (CallConv != CallingConv::Fast || needsLoad) + ArgOffset += 8; + break; + + case MVT::f32: + case MVT::f64: + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. + if (FPR_idx != Num_FPR_Regs) { + unsigned VReg; + + if (ObjectVT == MVT::f32) + VReg = MF.addLiveIn(FPR[FPR_idx], + Subtarget.hasP8Vector() + ? 
&PPC::VSSRCRegClass + : &PPC::F4RCRegClass); + else + VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() + ? &PPC::VSFRCRegClass + : &PPC::F8RCRegClass); + + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++FPR_idx; + } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { + // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 + // once we support fp <-> gpr moves. + + // This can only ever happen in the presence of f32 array types, + // since otherwise we never run out of FPRs before running out + // of GPRs. + unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::f32) { + if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) + ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, dl, MVT::i32)); + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); + } + + ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); + } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + + needsLoad = true; + } + + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + if (CallConv != CallingConv::Fast || needsLoad) { + ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; + ArgOffset += ArgSize; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + } + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2f64: + case MVT::v2i64: + case MVT::v1i128: + if (!Subtarget.hasQPX()) { + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? + MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : + MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++VR_idx; + } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + + needsLoad = true; + } + if (CallConv != CallingConv::Fast || needsLoad) + ArgOffset += 16; + break; + } // not QPX + + assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && + "Invalid QPX parameter type"); + /* fall through */ + + case MVT::v4f64: + case MVT::v4i1: + // QPX vectors are treated like their scalar floating-point subregisters + // (except that they're larger). + unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32; + if (QFPR_idx != Num_QFPR_Regs) { + const TargetRegisterClass *RC; + switch (ObjectVT.getSimpleVT().SimpleTy) { + case MVT::v4f64: RC = &PPC::QFRCRegClass; break; + case MVT::v4f32: RC = &PPC::QSRCRegClass; break; + default: RC = &PPC::QBRCRegClass; break; + } + + unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++QFPR_idx; + } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + needsLoad = true; + } + if (CallConv != CallingConv::Fast || needsLoad) + ArgOffset += Sz; + break; + } + + // We need to load the argument to a virtual register if we determined + // above that we ran out of physical registers of the appropriate type. 
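+    // (E.g. the fourteenth f64 argument finds no FPR left, since
+    // Num_FPR_Regs is 13, and is instead loaded below from its stack slot.)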
+    if (needsLoad) {
+      if (ObjSize < ArgSize && !isLittleEndian)
+        CurArgOffset += ArgSize - ObjSize;
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Area that is at least reserved in the caller of this function.
+  unsigned MinReservedArea;
+  if (HasParameterArea)
+    MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+  else
+    MinReservedArea = LinkageSize;
+
+  // Set the size that is at least reserved in caller of this function. Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrByteSize, Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by dereferencing
+    // the result of va_next.
+    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_Darwin(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      SDLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+  bool isPPC64 = PtrVT == MVT::i64;
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  unsigned ArgOffset = LinkageSize;
+  // Area that is at least reserved in caller of this function.
+  unsigned MinReservedArea = ArgOffset;
+
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10, + }; + static const MCPhysReg VR[] = { + PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, + PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 + }; + + const unsigned Num_GPR_Regs = array_lengthof(GPR_32); + const unsigned Num_FPR_Regs = 13; + const unsigned Num_VR_Regs = array_lengthof( VR); + + unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; + + // In 32-bit non-varargs functions, the stack space for vectors is after the + // stack space for non-vectors. We do not use this space unless we have + // too many vectors to fit in registers, something that only occurs in + // constructed examples:), but we have to walk the arglist to figure + // that out...for the pathological case, compute VecArgOffset as the + // start of the vector parameter area. Computing VecArgOffset is the + // entire point of the following loop. + unsigned VecArgOffset = ArgOffset; + if (!isVarArg && !isPPC64) { + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; + ++ArgNo) { + EVT ObjectVT = Ins[ArgNo].VT; + ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + + if (Flags.isByVal()) { + // ObjSize is the true size, ArgSize rounded up to multiple of regs. + unsigned ObjSize = Flags.getByValSize(); + unsigned ArgSize = + ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + VecArgOffset += ArgSize; + continue; + } + + switch(ObjectVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unhandled argument type!"); + case MVT::i1: + case MVT::i32: + case MVT::f32: + VecArgOffset += 4; + break; + case MVT::i64: // PPC64 + case MVT::f64: + // FIXME: We are guaranteed to be !isPPC64 at this point. + // Does MVT::i64 apply? + VecArgOffset += 8; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + // Nothing to do, we're only looking at Nonvector args here. + break; + } + } + } + // We've found where the vector parameter area in memory is. Skip the + // first 12 parameters; these don't use that memory. + VecArgOffset = ((VecArgOffset+15)/16)*16; + VecArgOffset += 12*16; + + // Add DAG nodes to load the arguments or copy them out of registers. On + // entry to a function on PPC, the arguments start after the linkage area, + // although the first ones are often in registers. + + SmallVector<SDValue, 8> MemOps; + unsigned nAltivecParamsAtEnd = 0; + Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { + SDValue ArgVal; + bool needsLoad = false; + EVT ObjectVT = Ins[ArgNo].VT; + unsigned ObjSize = ObjectVT.getSizeInBits()/8; + unsigned ArgSize = ObjSize; + ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + if (Ins[ArgNo].isOrigArg()) { + std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[ArgNo].getOrigArgIndex(); + } + unsigned CurArgOffset = ArgOffset; + + // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. + if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || + ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { + if (isVarArg || isPPC64) { + MinReservedArea = ((MinReservedArea+15)/16)*16; + MinReservedArea += CalculateStackSlotSize(ObjectVT, + Flags, + PtrByteSize); + } else nAltivecParamsAtEnd++; + } else + // Calculate min reserved area. + MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, + Flags, + PtrByteSize); + + // FIXME the codegen can be much improved in some cases. 
+ // We do not have to keep everything in memory. + if (Flags.isByVal()) { + assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); + + // ObjSize is the true size, ArgSize rounded up to multiple of registers. + ObjSize = Flags.getByValSize(); + ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // Objects of size 1 and 2 are right justified, everything else is + // left justified. This means the memory address is adjusted forwards. + if (ObjSize==1 || ObjSize==2) { + CurArgOffset = CurArgOffset + (4 - ObjSize); + } + // The value of the object is its address. + int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(FIN); + if (ObjSize==1 || ObjSize==2) { + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg; + if (isPPC64) + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + else + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; + SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg), + ObjType, false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; + } + + ArgOffset += PtrByteSize; + + continue; + } + for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { + // Store whatever pieces of the object are in registers + // to memory. ArgOffset will be the address of the beginning + // of the object. + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg; + if (isPPC64) + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + else + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg, j), false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; + ArgOffset += PtrByteSize; + } else { + ArgOffset += ArgSize - (ArgOffset-CurArgOffset); + break; + } + } + continue; + } + + switch (ObjectVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unhandled argument type!"); + case MVT::i1: + case MVT::i32: + if (!isPPC64) { + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + + if (ObjectVT == MVT::i1) + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); + + ++GPR_idx; + } else { + needsLoad = true; + ArgSize = PtrByteSize; + } + // All int arguments reserve stack space in the Darwin ABI. + ArgOffset += PtrByteSize; + break; + } + // FALLTHROUGH + case MVT::i64: // PPC64 + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) + // PPC64 passes i8, i16, and i32 values in i64 registers. Promote + // value to MVT::i64 and then truncate to the correct register size. + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); + + ++GPR_idx; + } else { + needsLoad = true; + ArgSize = PtrByteSize; + } + // All int arguments reserve stack space in the Darwin ABI. + ArgOffset += 8; + break; + + case MVT::f32: + case MVT::f64: + // Every 4 bytes of argument space consumes one of the GPRs available for + // argument passing. 
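+      // For example, an f64 argument to a 32-bit Darwin function shadows two
+      // GPRs: it is passed in an FPR, but a GPR pair (say R3/R4) is skipped
+      // for subsequent integer arguments.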
+      if (GPR_idx != Num_GPR_Regs) {
+        ++GPR_idx;
+        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+          ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+      }
+
+      // All FP arguments reserve stack space in the Darwin ABI.
+      ArgOffset += isPPC64 ? 8 : ObjSize;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space,
+      // except in varargs functions.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        if (isVarArg) {
+          while ((ArgOffset % 16) != 0) {
+            ArgOffset += PtrByteSize;
+            if (GPR_idx != Num_GPR_Regs)
+              GPR_idx++;
+          }
+          ArgOffset += 16;
+          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+        }
+        ++VR_idx;
+      } else {
+        if (!isVarArg && !isPPC64) {
+          // Vectors go after all the nonvectors.
+          CurArgOffset = VecArgOffset;
+          VecArgOffset += 16;
+        } else {
+          // Vectors are aligned.
+          ArgOffset = ((ArgOffset+15)/16)*16;
+          CurArgOffset = ArgOffset;
+          ArgOffset += 16;
+        }
+        needsLoad = true;
+      }
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined above
+    // that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      CurArgOffset + (ArgSize - ObjSize),
+                                      isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Allow for Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    MinReservedArea = ((MinReservedArea+15)/16)*16;
+    MinReservedArea += 16*nAltivecParamsAtEnd;
+  }
+
+  // Area that is at least reserved in the caller of this function.
+  MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
+
+  // Set the size that is at least reserved in caller of this function. Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+                             Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by dereferencing
+    // the result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg;
+
+      if (isPPC64)
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      else
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
+/// adjusted to accommodate the arguments for the tailcall.
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
+                                   unsigned ParamSize) {
+
+  if (!isTailCall) return 0;
+
+  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  unsigned CallerMinReservedArea = FI->getMinReservedArea();
+  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
+  // Remember only if the new adjustment is bigger.
+  if (SPDiff < FI->getTailCallSPDelta())
+    FI->setTailCallSPDelta(SPDiff);
+
+  return SPDiff;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Variable argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+    // Functions containing by val parameters are not supported.
+    for (unsigned i = 0; i != Ins.size(); i++) {
+       ISD::ArgFlagsTy Flags = Ins[i].Flags;
+       if (Flags.isByVal()) return false;
+    }
+
+    // Non-PIC/GOT tail calls are supported.
+    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+      return true;
+
+    // At the moment we can only do local tail calls (in same module, hidden
+    // or protected) if we are generating PIC.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+      return G->getGlobal()->hasHiddenVisibility()
+          || G->getGlobal()->hasProtectedVisibility();
+  }
+
+  return false;
+}
+
+/// isBLACompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return nullptr;
+
+  int Addr = C->getZExtValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      SignExtend32<26>(Addr) != Addr)
+    return nullptr;  // Top 6 bits have to be sext of immediate.
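+  // The returned operand encodes a word address; e.g. an absolute address of
+  // 0x1000 is returned below as the immediate 0x400 (0x1000 >> 2).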
+
+  return DAG.getConstant((int)C->getZExtValue() >> 2, SDLoc(Op),
+                         DAG.getTargetLoweringInfo().getPointerTy(
+                             DAG.getDataLayout())).getNode();
+}
+
+namespace {
+
+struct TailCallArgumentInfo {
+  SDValue Arg;
+  SDValue FrameIdxOp;
+  int FrameIdx;
+
+  TailCallArgumentInfo() : FrameIdx(0) {}
+};
+}
+
+/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
+static void
+StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
+                                  SDValue Chain,
+                   const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
+                   SmallVectorImpl<SDValue> &MemOpChains,
+                   SDLoc dl) {
+  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
+    SDValue Arg = TailCallArgs[i].Arg;
+    SDValue FIN = TailCallArgs[i].FrameIdxOp;
+    int FI = TailCallArgs[i].FrameIdx;
+    // Store relative to framepointer.
+    MemOpChains.push_back(DAG.getStore(
+        Chain, dl, Arg, FIN,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
+        false, 0));
+  }
+}
+
+/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
+/// the appropriate stack slot for the tail call optimized function call.
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
+                                             MachineFunction &MF,
+                                             SDValue Chain,
+                                             SDValue OldRetAddr,
+                                             SDValue OldFP,
+                                             int SPDiff,
+                                             bool isPPC64,
+                                             bool isDarwinABI,
+                                             SDLoc dl) {
+  if (SPDiff) {
+    // Calculate the new stack slot for the return address.
+    int SlotSize = isPPC64 ? 8 : 4;
+    const PPCFrameLowering *FL =
+        MF.getSubtarget<PPCSubtarget>().getFrameLowering();
+    int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
+    int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+                                                          NewRetAddrLoc, true);
+    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
+    Chain = DAG.getStore(
+        Chain, dl, OldRetAddr, NewRetAddrFrIdx,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewRetAddr),
+        false, false, 0);
+
+    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
+    // slot as the FP is never overwritten.
+    if (isDarwinABI) {
+      int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
+      int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
+                                                          true);
+      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
+      Chain = DAG.getStore(
+          Chain, dl, OldFP, NewFramePtrIdx,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewFPIdx),
+          false, false, 0);
+    }
+  }
+  return Chain;
+}
+
+/// CalculateTailCallArgDest - Remember the argument for later processing and
+/// calculate the position of the argument.
+static void
+CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
+                         SDValue Arg, int SPDiff, unsigned ArgOffset,
+                     SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
+  int Offset = ArgOffset + SPDiff;
+  uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8;
+  int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+  SDValue FIN = DAG.getFrameIndex(FI, VT);
+  TailCallArgumentInfo Info;
+  Info.Arg = Arg;
+  Info.FrameIdxOp = FIN;
+  Info.FrameIdx = FI;
+  TailCallArguments.push_back(Info);
+}
+
+/// EmitTailCallLoadFPAndRetAddr - Emit loads of the frame pointer and return
+/// address from their stack slots. Returns the chain as result and the loaded
+/// values in LROpOut/FPOpOut. Used when tail calling.
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, + int SPDiff, + SDValue Chain, + SDValue &LROpOut, + SDValue &FPOpOut, + bool isDarwinABI, + SDLoc dl) const { + if (SPDiff) { + // Load the LR and FP stack slot for later adjusting. + EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + LROpOut = getReturnAddrFrameIndex(DAG); + LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(), + false, false, false, 0); + Chain = SDValue(LROpOut.getNode(), 1); + + // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack + // slot as the FP is never overwritten. + if (isDarwinABI) { + FPOpOut = getFramePointerFrameIndex(DAG); + FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo(), + false, false, false, 0); + Chain = SDValue(FPOpOut.getNode(), 1); + } + } + return Chain; +} + +/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified +/// by "Src" to address "Dst" of size "Size". Alignment information is +/// specified by the specific parameter attribute. The copy will be passed as +/// a byval function parameter. +/// Sometimes what we are copying is the end of a larger object, the part that +/// does not fit in registers. +static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + SDLoc dl) { + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), + false, false, false, MachinePointerInfo(), + MachinePointerInfo()); +} + +/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of +/// tail calls. +static void +LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, + SDValue Arg, SDValue PtrOff, int SPDiff, + unsigned ArgOffset, bool isPPC64, bool isTailCall, + bool isVector, SmallVectorImpl<SDValue> &MemOpChains, + SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, + SDLoc dl) { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + if (!isTailCall) { + if (isVector) { + SDValue StackPtr; + if (isPPC64) + StackPtr = DAG.getRegister(PPC::X1, MVT::i64); + else + StackPtr = DAG.getRegister(PPC::R1, MVT::i32); + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, + DAG.getConstant(ArgOffset, dl, PtrVT)); + } + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0)); + // Calculate and remember argument location. + } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, + TailCallArguments); +} + +static +void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, + SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes, + SDValue LROp, SDValue FPOp, bool isDarwinABI, + SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Emit a sequence of copyto/copyfrom virtual registers for arguments that + // might overwrite each other in case of tail call optimization. + SmallVector<SDValue, 8> MemOpChains2; + // Do not flag preceding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, + MemOpChains2, dl); + if (!MemOpChains2.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); + + // Store the return address to the appropriate stack slot. 
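+  // (Illustrative: if the caller reserved 64 bytes and this call needs 80,
+  // SPDiff computed earlier is 64 - 80 = -16, and the return address slot
+  // moves by the same amount.)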
+ Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff, + isPPC64, isDarwinABI, dl); + + // Emit callseq_end just before tailcall node. + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); + InFlag = Chain.getValue(1); +} + +// Is this global address that of a function that can be called by name? (as +// opposed to something that must hold a descriptor for an indirect call). +static bool isFunctionGlobalAddress(SDValue Callee) { + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + if (Callee.getOpcode() == ISD::GlobalTLSAddress || + Callee.getOpcode() == ISD::TargetGlobalTLSAddress) + return false; + + return G->getGlobal()->getType()->getElementType()->isFunctionTy(); + } + + return false; +} + +static +unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, + SDValue &Chain, SDValue CallSeqStart, SDLoc dl, int SPDiff, + bool isTailCall, bool IsPatchPoint, bool hasNest, + SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass, + SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, + ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { + + bool isPPC64 = Subtarget.isPPC64(); + bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + NodeTys.push_back(MVT::Other); // Returns a chain + NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. + + unsigned CallOpc = PPCISD::CALL; + + bool needIndirectCall = true; + if (!isSVR4ABI || !isPPC64) + if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { + // If this is an absolute destination address, use the munged value. + Callee = SDValue(Dest, 0); + needIndirectCall = false; + } + + if (isFunctionGlobalAddress(Callee)) { + GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); + // A call to a TLS address is actually an indirect call to a + // thread-specific pointer. + unsigned OpFlags = 0; + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && + !G->getGlobal()->isStrongDefinitionForLinker()) || + (Subtarget.isTargetELF() && !isPPC64 && + !G->getGlobal()->hasLocalLinkage() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = PPCII::MO_PLT_OR_STUB; + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, + // every direct call is) turn it into a TargetGlobalAddress / + // TargetExternalSymbol node so that legalize doesn't hack it. + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, + Callee.getValueType(), 0, OpFlags); + needIndirectCall = false; + } + + if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + unsigned char OpFlags = 0; + + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) || + (Subtarget.isTargetELF() && !isPPC64 && + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. 
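+      // (Schematically: with this flag set, a 32-bit ELF PIC build emits
+      // "bl foo@plt" rather than a direct "bl foo", while an old Darwin
+      // build calls a synthesized "L_foo$stub" entry instead.)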
+ OpFlags = PPCII::MO_PLT_OR_STUB; + } + + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), + OpFlags); + needIndirectCall = false; + } + + if (IsPatchPoint) { + // We'll form an invalid direct call when lowering a patchpoint; the full + // sequence for an indirect call is complicated, and many of the + // instructions introduced might have side effects (and, thus, can't be + // removed later). The call itself will be removed as soon as the + // argument/return lowering is complete, so the fact that it has the wrong + // kind of operands should not really matter. + needIndirectCall = false; + } + + if (needIndirectCall) { + // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair + // to do the call, we can't use PPCISD::CALL. + SDValue MTCTROps[] = {Chain, Callee, InFlag}; + + if (isSVR4ABI && isPPC64 && !isELFv2ABI) { + // Function pointers in the 64-bit SVR4 ABI do not point to the function + // entry point, but to the function descriptor (the function entry point + // address is part of the function descriptor though). + // The function descriptor is a three doubleword structure with the + // following fields: function entry point, TOC base address and + // environment pointer. + // Thus for a call through a function pointer, the following actions need + // to be performed: + // 1. Save the TOC of the caller in the TOC save area of its stack + // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). + // 2. Load the address of the function entry point from the function + // descriptor. + // 3. Load the TOC of the callee from the function descriptor into r2. + // 4. Load the environment pointer from the function descriptor into + // r11. + // 5. Branch to the function entry point address. + // 6. On return of the callee, the TOC of the caller needs to be + // restored (this is done in FinishCall()). + // + // The loads are scheduled at the beginning of the call sequence, and the + // register copies are flagged together to ensure that no other + // operations can be scheduled in between. E.g. without flagging the + // copies together, a TOC access in the caller could be scheduled between + // the assignment of the callee TOC and the branch to the callee, which + // results in the TOC access going through the TOC of the callee instead + // of going through the TOC of the caller, which leads to incorrect code. + + // Load the address of the function entry point from the function + // descriptor. + SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); + if (LDChain.getValueType() == MVT::Glue) + LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); + + bool LoadsInv = Subtarget.hasInvariantFunctionDescriptors(); + + MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr); + SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, + false, false, LoadsInv, 8); + + // Load environment pointer into r11. 
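+      // (The offsets used below follow the descriptor layout described above:
+      // entry point at 0, TOC base at 8, environment pointer at 16.)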
+ SDValue PtrOff = DAG.getIntPtrConstant(16, dl); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); + SDValue LoadEnvPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, + MPI.getWithOffset(16), false, false, + LoadsInv, 8); + + SDValue TOCOff = DAG.getIntPtrConstant(8, dl); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); + SDValue TOCPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, + MPI.getWithOffset(8), false, false, + LoadsInv, 8); + + setUsesTOCBasePtr(DAG); + SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, + InFlag); + Chain = TOCVal.getValue(0); + InFlag = TOCVal.getValue(1); + + // If the function call has an explicit 'nest' parameter, it takes the + // place of the environment pointer. + if (!hasNest) { + SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, + InFlag); + + Chain = EnvVal.getValue(0); + InFlag = EnvVal.getValue(1); + } + + MTCTROps[0] = Chain; + MTCTROps[1] = LoadFuncPtr; + MTCTROps[2] = InFlag; + } + + Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, + makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); + InFlag = Chain.getValue(1); + + NodeTys.clear(); + NodeTys.push_back(MVT::Other); + NodeTys.push_back(MVT::Glue); + Ops.push_back(Chain); + CallOpc = PPCISD::BCTRL; + Callee.setNode(nullptr); + // Add use of X11 (holding environment pointer) + if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) + Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); + // Add CTR register as callee so a bctr can be emitted later. + if (isTailCall) + Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); + } + + // If this is a direct call, pass the chain and the callee. + if (Callee.getNode()) { + Ops.push_back(Chain); + Ops.push_back(Callee); + } + // If this is a tail call add stack pointer delta. + if (isTailCall) + Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live + // into the call. + if (isSVR4ABI && isPPC64 && !IsPatchPoint) { + setUsesTOCBasePtr(DAG); + Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + } + + return CallOpc; +} + +static +bool isLocalCall(const SDValue &Callee) +{ + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) + return G->getGlobal()->isStrongDefinitionForLinker(); + return false; +} + +SDValue +PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); + + // Copy all of the result registers out of their specified physreg. 
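+  // (For example, an i32 value returned in a 64-bit GPR arrives here as
+  // CCValAssign::ZExt; the copy is wrapped in AssertZext and then truncated
+  // back to i32.)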
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Val = DAG.getCopyFromReg(Chain, dl,
+                                     VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+SDValue
+PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
+                              bool isTailCall, bool isVarArg, bool IsPatchPoint,
+                              bool hasNest, SelectionDAG &DAG,
+                              SmallVector<std::pair<unsigned, SDValue>, 8>
+                                &RegsToPass,
+                              SDValue InFlag, SDValue Chain,
+                              SDValue CallSeqStart, SDValue &Callee,
+                              int SPDiff, unsigned NumBytes,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SmallVectorImpl<SDValue> &InVals,
+                              ImmutableCallSite *CS) const {
+
+  std::vector<EVT> NodeTys;
+  SmallVector<SDValue, 8> Ops;
+  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
+                                 SPDiff, isTailCall, IsPatchPoint, hasNest,
+                                 RegsToPass, Ops, NodeTys, CS, Subtarget);
+
+  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+  if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
+    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
+  // When performing tail call optimization the callee pops its arguments off
+  // the stack. Account for this here so these bytes can be pushed back on in
+  // PPCFrameLowering::eliminateCallFramePseudoInstr.
+  int BytesCalleePops =
+    (CallConv == CallingConv::Fast &&
+     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask =
+      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  // Emit tail call.
+  if (isTailCall) {
+    assert(((Callee.getOpcode() == ISD::Register &&
+             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+            Callee.getOpcode() == ISD::TargetExternalSymbol ||
+            Callee.getOpcode() == ISD::TargetGlobalAddress ||
+            isa<ConstantSDNode>(Callee)) &&
+           "Expecting a global address, external symbol, absolute value or register");
+
+    DAG.getMachineFunction().getFrameInfo()->setHasTailCall();
+    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
+  }
+
+  // Add a NOP immediately after the branch instruction when using the 64-bit
+  // SVR4 ABI. At link time, if caller and callee are in a different module and
+  // thus have a different TOC, the call will be replaced with a call to a stub
+  // function which saves the current TOC, loads the TOC of the callee and
+  // branches to the callee. The NOP will be replaced with a load instruction
+  // which restores the TOC of the caller from the TOC save slot of the current
+  // stack frame. 
If caller and callee belong to the same module (and have the + // same TOC), the NOP will remain unchanged. + + if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && + !IsPatchPoint) { + if (CallOpc == PPCISD::BCTRL) { + // This is a call through a function pointer. + // Restore the caller TOC from the save area into R2. + // See PrepareCall() for more information about calls through function + // pointers in the 64-bit SVR4 ABI. + // We are using a target-specific load with r2 hard coded, because the + // result of a target-independent load would never go directly into r2, + // since r2 is a reserved register (which prevents the register allocator + // from allocating it), resulting in an additional register being + // allocated and an unnecessary move instruction being generated. + CallOpc = PPCISD::BCTRL_LOAD_TOC; + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); + unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); + SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + + // The address needs to go after the chain input but before the flag (or + // any other variadic arguments). + Ops.insert(std::next(Ops.begin()), AddTOC); + } else if ((CallOpc == PPCISD::CALL) && + (!isLocalCall(Callee) || + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) + // Otherwise insert NOP for non-local calls. + CallOpc = PPCISD::CALL_NOP; + } + + Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); + InFlag = Chain.getValue(1); + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), + DAG.getIntPtrConstant(BytesCalleePops, dl, true), + InFlag, dl); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, + Ins, dl, DAG, InVals); +} + +SDValue +PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + bool IsPatchPoint = CLI.IsPatchPoint; + ImmutableCallSite *CS = CLI.CS; + + if (isTailCall) + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, + Ins, DAG); + + if (!isTailCall && CS && CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) + return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, IsPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); + else + return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, IsPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); + } + + return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, + isTailCall, IsPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); +} + +SDValue +PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, bool IsPatchPoint, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG 
&DAG,
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    ImmutableCallSite *CS) const {
+  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
+  // of the 32-bit SVR4 ABI stack frame layout.
+
+  assert((CallConv == CallingConv::C ||
+          CallConv == CallingConv::Fast) && "Unknown calling convention!");
+
+  unsigned PtrByteSize = 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence, the frame pointer will be used for dynamic
+  // stack allocation and for restoring the caller's stack pointer in this
+  // function's epilogue. This is done because, by tail calling, the called
+  // function might overwrite the value in this function's (MF) stack pointer
+  // stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, parameter list area and the part of the local variable space which
+  // contains copies of aggregates which are passed by value.
+
+  // Assign locations to all of the outgoing arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
+                       PtrByteSize);
+
+  if (isVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Fixed vector arguments go into registers as long as registers are
+    // available. Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ArgVT = Outs[i].VT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      bool Result;
+
+      if (Outs[i].IsFixed) {
+        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
+                               CCInfo);
+      } else {
+        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
+                                      ArgFlags, CCInfo);
+      }
+
+      if (Result) {
+#ifndef NDEBUG
+        errs() << "Call operand #" << i << " has unhandled type "
+               << EVT(ArgVT).getEVTString() << "\n";
+#endif
+        llvm_unreachable(nullptr);
+      }
+    }
+  } else {
+    // All arguments are treated the same.
+    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
+  }
+
+  // Assign locations to all of the outgoing aggregate by value arguments.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                      ByValArgLocs, *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
+
+  // Size of the linkage area, parameter list area and the part of the local
+  // variable space where copies of aggregates which are passed by value are
+  // stored.
+  unsigned NumBytes = CCByValInfo.getNextStackOffset();
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so they can be moved somewhere
+  // else later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, false,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  bool seenFloatArg = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    if (Flags.isByVal()) {
+      // Argument is an aggregate which is passed by value, thus we need to
+      // create a copy of it in the local variable space of the current stack
+      // frame (which is the stack frame of the caller) and pass the address of
+      // this copy to the callee.
+      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+      CCValAssign &ByValVA = ByValArgLocs[j++];
+      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+      // Memory reserved in the local variable space of the caller's stack
+      // frame.
+      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+                           StackPtr, PtrOff);
+
+      // Create a copy of the argument in the local area of the current
+      // stack frame.
+      SDValue MemcpyCall =
+        CreateCopyOfByValArgument(Arg, PtrOff,
+                                  CallSeqStart.getNode()->getOperand(0),
+                                  Flags, DAG, dl);
+
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1),
+                           SDLoc(MemcpyCall));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                             NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+
+      // Pass the address of the aggregate copy on the stack either in a
+      // physical register or in the parameter list area of the current stack
+      // frame to the callee.
+      Arg = PtrOff;
+    }
+
+    if (VA.isRegLoc()) {
+      if (Arg.getValueType() == MVT::i1)
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg);
+
+      seenFloatArg |= VA.getLocVT().isFloatingPoint();
+      // Put argument in a physical register.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Put argument in the parameter list area of the current stack frame.
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+
+      if (!isTailCall) {
+        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+                             StackPtr, PtrOff);
+
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+      } else {
+        // Calculate and remember argument location.
+        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+                                 TailCallArguments);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Set CR bit 6 to true if this is a vararg call with floating args passed in
+  // registers.
+  if (isVarArg) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, InFlag };
+
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
+
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
+                    false, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
+                    /* unused except on PPC64 ELFv1 */ false, DAG,
+                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+                    NumBytes, Ins, InVals, CS);
+}
+
+// Copy an argument into memory, being careful to do this outside the
+// call sequence for the call to which the argument belongs.
+SDValue
+PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                              SDValue CallSeqStart,
+                                              ISD::ArgFlagsTy Flags,
+                                              SelectionDAG &DAG,
+                                              SDLoc dl) const {
+  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+                        CallSeqStart.getNode()->getOperand(0),
+                        Flags, DAG, dl);
+  // The MEMCPY must go outside the CALLSEQ_START..END.
+  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                             CallSeqStart.getNode()->getOperand(1),
+                             SDLoc(MemcpyCall));
+  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                         NewCallSeqStart.getNode());
+  return NewCallSeqStart;
+}
+
+SDValue
+PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall, bool IsPatchPoint,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    ImmutableCallSite *CS) const {
+
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+  unsigned NumOps = Outs.size();
+  bool hasNest = false;
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  unsigned PtrByteSize = 8;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence, the frame pointer will be used for dynamic
+  // stack allocation and for restoring the caller's stack pointer in this
+  // function's epilogue. This is done because, by tail calling, the called
+  // function might overwrite the value in this function's (MF) stack pointer
+  // stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+         "fastcc not supported on varargs functions");
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
+  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
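+  // (Schematically, as byte offsets from the stack pointer:
+  //    ELFv1:  0 back chain,  8 CR, 16 LR, 24/32 reserved, 40 TOC save;
+  //    ELFv2:  0 back chain,  8 CR, 16 LR, 24 TOC save.)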
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  unsigned NumBytes = LinkageSize;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  unsigned &QFPR_idx = FPR_idx;
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  static const MCPhysReg VSRH[] = {
+    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
+    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
+  };
+
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs = array_lengthof(VR);
+  const unsigned NumQFPRs = NumFPRs;
+
+  // When using the fast calling convention, we don't provide backing for
+  // arguments that will be in registers.
+  unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
+
+  // Add up all the space actually used.
+  for (unsigned i = 0; i != NumOps; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
+
+    if (Flags.isNest())
+      continue;
+
+    if (CallConv == CallingConv::Fast) {
+      if (Flags.isByVal())
+        NumGPRsUsed += (Flags.getByValSize()+7)/8;
+      else
+        switch (ArgVT.getSimpleVT().SimpleTy) {
+        default: llvm_unreachable("Unexpected ValueType for argument!");
+        case MVT::i1:
+        case MVT::i32:
+        case MVT::i64:
+          if (++NumGPRsUsed <= NumGPRs)
+            continue;
+          break;
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v16i8:
+        case MVT::v2f64:
+        case MVT::v2i64:
+        case MVT::v1i128:
+          if (++NumVRsUsed <= NumVRs)
+            continue;
+          break;
+        case MVT::v4f32:
+          // When using QPX, this is handled like a FP register, otherwise, it
+          // is an Altivec register.
+          if (Subtarget.hasQPX()) {
+            if (++NumFPRsUsed <= NumFPRs)
+              continue;
+          } else {
+            if (++NumVRsUsed <= NumVRs)
+              continue;
+          }
+          break;
+        case MVT::f32:
+        case MVT::f64:
+        case MVT::v4f64: // QPX
+        case MVT::v4i1:  // QPX
+          if (++NumFPRsUsed <= NumFPRs)
+            continue;
+          break;
+        }
+    }
+
+    /* Respect alignment of argument on the stack.  */
+    unsigned Align =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+    NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+
+    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+    if (Flags.isInConsecutiveRegsLast())
+      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+  }
+
+  unsigned NumBytesActuallyUsed = NumBytes;
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if it is a
+  // varargs function. Because we cannot tell if this is needed on the caller
+  // side, we have to conservatively assume that it is needed. As such, make
+  // sure we have at least enough stack space for the caller to store the 8
+  // GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+  // Tail call needs the stack to be aligned.
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
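+  // (Roughly: SPDiff is the size of the caller's reserved argument area minus
+  // NumBytes, so it is negative exactly when the callee needs more argument
+  // space than the caller's frame already provides.)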
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so they can be moved somewhere
+  // else later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory. Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = LinkageSize;
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    // We re-align the argument offset for each argument, except when using the
+    // fast calling convention, when we need to make sure we do that only when
+    // we'll actually use a stack slot.
+    auto ComputePtrOff = [&]() {
+      /* Respect alignment of argument on the stack.  */
+      unsigned Align =
+        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+      PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
+
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    };
+
+    if (CallConv != CallingConv::Fast) {
+      ComputePtrOff();
+
+      /* Compute GPR index associated with argument offset.  */
+      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+      GPR_idx = std::min(GPR_idx, NumGPRs);
+    }
+
+    // Promote integers to 64-bit values.
+    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      // Note: Size includes alignment padding, so
+      //   struct x { short a; char b; }
+      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
+      // These are the proper values we need for right-justifying the
+      // aggregate in a parameter register.
+      unsigned Size = Flags.getByValSize();
+
+      // An empty aggregate parameter takes up no storage and no
+      // registers.
+      if (Size == 0)
+        continue;
+
+      if (CallConv == CallingConv::Fast)
+        ComputePtrOff();
+
+      // All aggregates smaller than 8 bytes must be passed right-justified.
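+      // (For example, on big-endian a 4-byte aggregate is loaded into the
+      // low-order word of its GPR, matching the rightmost word of the
+      // doubleword stack slot it would otherwise occupy.)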
+ if (Size==1 || Size==2 || Size==4) { + EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, + MachinePointerInfo(), VT, + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + + ArgOffset += PtrByteSize; + continue; + } + } + + if (GPR_idx == NumGPRs && Size < 8) { + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, + PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + ArgOffset += PtrByteSize; + continue; + } + // Copy entire object into memory. There are cases where gcc-generated + // code assumes it is there, even if it could be put entirely into + // registers. (This is not what the doc says.) + + // FIXME: The above statement is likely due to a misunderstanding of the + // documents. All arguments must be copied into the parameter area BY + // THE CALLEE in the event that the callee takes the address of any + // formal argument. That has not yet been implemented. However, it is + // reasonable to use the stack area as a staging area for the register + // load. + + // Skip this for small aggregates, as we will use the same slot for a + // right-justified copy, below. + if (Size >= 8) + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, + CallSeqStart, + Flags, DAG, dl); + + // When a register is available, pass a small aggregate right-justified. + if (Size < 8 && GPR_idx != NumGPRs) { + // The easiest way to get this right-justified in a register + // is to copy the structure into the rightmost portion of a + // local variable slot, then load the whole slot into the + // register. + // FIXME: The memcpy seems to produce pretty awful code for + // small aggregates, particularly for packed ones. + // FIXME: It would be preferable to use the slot in the + // parameter save area instead of a new local variable. + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + + // Load the slot into the register. + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + + // Done with this argument. + ArgOffset += PtrByteSize; + continue; + } + + // For aggregates larger than PtrByteSize, copy the pieces of the + // object that fit into registers from the parameter save area. 
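+      // (E.g. with a 24-byte aggregate and two free GPRs, the first 16 bytes
+      // are loaded into those GPRs and the remaining 8 bytes stay in the
+      // copy already placed in the parameter save area.)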
+ for (unsigned j=0; j<Size; j+=PtrByteSize) { + SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + ArgOffset += PtrByteSize; + } else { + ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; + break; + } + } + continue; + } + + switch (Arg.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + if (Flags.isNest()) { + // The 'nest' parameter, if any, is passed in R11. + RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); + hasNest = true; + break; + } + + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. + if (GPR_idx != NumGPRs) { + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + } else { + if (CallConv == CallingConv::Fast) + ComputePtrOff(); + + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + true, isTailCall, false, MemOpChains, + TailCallArguments, dl); + if (CallConv == CallingConv::Fast) + ArgOffset += PtrByteSize; + } + if (CallConv != CallingConv::Fast) + ArgOffset += PtrByteSize; + break; + case MVT::f32: + case MVT::f64: { + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. + + // Named arguments go into FPRs first, and once they overflow, the + // remaining arguments go into GPRs and then the parameter save area. + // Unnamed arguments for vararg functions always go to GPRs and + // then the parameter save area. For now, put all arguments to vararg + // routines always in both locations (FPR *and* GPR or stack slot). + bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; + bool NeededLoad = false; + + // First load the argument into the next available FPR. + if (FPR_idx != NumFPRs) + RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); + + // Next, load the argument into GPR or stack slot if needed. + if (!NeedGPROrStack) + ; + else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) { + // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 + // once we support fp <-> gpr moves. + + // In the non-vararg case, this can only ever happen in the + // presence of f32 array types, since otherwise we never run + // out of FPRs before running out of GPRs. + SDValue ArgVal; + + // Double values are always passed in a single GPR. + if (Arg.getValueType() != MVT::f32) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); + + // Non-array float values are extended and passed in a GPR. + } else if (!Flags.isInConsecutiveRegs()) { + ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); + ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); + + // If we have an array of floats, we collect every odd element + // together with its predecessor into one GPR. 
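+        // (E.g. for "float a[4]": elements 0 and 1 form one GPR, elements 2
+        // and 3 the next; which half is high depends on endianness.)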
+        } else if (ArgOffset % PtrByteSize != 0) {
+          SDValue Lo, Hi;
+          Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+          Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          if (!isLittleEndian)
+            std::swap(Lo, Hi);
+          ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+        // The final element, if even, goes into the first half of a GPR.
+        } else if (Flags.isInConsecutiveRegsLast()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+          if (!isLittleEndian)
+            ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, dl, MVT::i32));
+
+        // Non-final even elements are skipped; they will be handled
+        // together with the subsequent argument on the next go-around.
+        } else
+          ArgVal = SDValue();
+
+        if (ArgVal.getNode())
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
+        // Single-precision floating-point values are mapped to the
+        // second (rightmost) word of the stack doubleword.
+        if (Arg.getValueType() == MVT::f32 &&
+            !isLittleEndian && !Flags.isInConsecutiveRegs()) {
+          SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+        }
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+
+        NeededLoad = true;
+      }
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array.  Otherwise, each float takes 8 bytes.
+      if (CallConv != CallingConv::Fast || NeededLoad) {
+        ArgOffset += (Arg.getValueType() == MVT::f32 &&
+                      Flags.isInConsecutiveRegs()) ? 4 : 8;
+        if (Flags.isInConsecutiveRegsLast())
+          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      }
+      break;
+    }
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+    case MVT::v2f64:
+    case MVT::v2i64:
+    case MVT::v1i128:
+      if (!Subtarget.hasQPX()) {
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // vector aggregates.
+
+      // For a varargs call, named arguments go into VRs or on the stack as
+      // usual; unnamed arguments always go to the stack or the corresponding
+      // GPRs when within range.  For now, we always put the value in both
+      // locations (or even all three).
+      if (isVarArg) {
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+
+          unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
+                           Arg.getSimpleValueType() == MVT::v2i64) ?
+ VSRH[VR_idx] : VR[VR_idx]; + ++VR_idx; + + RegsToPass.push_back(std::make_pair(VReg, Load)); + } + ArgOffset += 16; + for (unsigned i=0; i<16; i+=PtrByteSize) { + if (GPR_idx == NumGPRs) + break; + SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, + DAG.getConstant(i, dl, PtrVT)); + SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + break; + } + + // Non-varargs Altivec params go into VRs or on the stack. + if (VR_idx != NumVRs) { + unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || + Arg.getSimpleValueType() == MVT::v2i64) ? + VSRH[VR_idx] : VR[VR_idx]; + ++VR_idx; + + RegsToPass.push_back(std::make_pair(VReg, Arg)); + } else { + if (CallConv == CallingConv::Fast) + ComputePtrOff(); + + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + true, isTailCall, true, MemOpChains, + TailCallArguments, dl); + if (CallConv == CallingConv::Fast) + ArgOffset += 16; + } + + if (CallConv != CallingConv::Fast) + ArgOffset += 16; + break; + } // not QPX + + assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && + "Invalid QPX parameter type"); + + /* fall through */ + case MVT::v4f64: + case MVT::v4i1: { + bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; + if (isVarArg) { + // We could elide this store in the case where the object fits + // entirely in R registers. Maybe later. + SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0); + MemOpChains.push_back(Store); + if (QFPR_idx != NumQFPRs) { + SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, + Store, PtrOff, MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); + } + ArgOffset += (IsF32 ? 16 : 32); + for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { + if (GPR_idx == NumGPRs) + break; + SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, + DAG.getConstant(i, dl, PtrVT)); + SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + break; + } + + // Non-varargs QPX params go into registers or on the stack. + if (QFPR_idx != NumQFPRs) { + RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); + } else { + if (CallConv == CallingConv::Fast) + ComputePtrOff(); + + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + true, isTailCall, true, MemOpChains, + TailCallArguments, dl); + if (CallConv == CallingConv::Fast) + ArgOffset += (IsF32 ? 16 : 32); + } + + if (CallConv != CallingConv::Fast) + ArgOffset += (IsF32 ? 16 : 32); + break; + } + } + } + + assert(NumBytesActuallyUsed == ArgOffset); + (void)NumBytesActuallyUsed; + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + // Check if this is an indirect call (MTCTR/BCTRL). + // See PrepareCall() for more information about calls through function + // pointers in the 64-bit SVR4 ABI. + if (!isTailCall && !IsPatchPoint && + !isFunctionGlobalAddress(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) { + // Load r2 into a virtual register and store it to the TOC save area. + setUsesTOCBasePtr(DAG); + SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); + // TOC save area offset. 
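+    // (Under the 64-bit ELF ABIs this is the TOC slot in the caller's linkage
+    // area; commonly 40(r1) for ELFv1 and 24(r1) for ELFv2.)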
+    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
+    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    Chain = DAG.getStore(
+        Val.getValue(1), dl, Val, AddPtr,
+        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset),
+        false, false, 0);
+    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+    // This does not mean the MTCTR instruction must use R12; it's easier
+    // to model this as an extra parameter, so do that.
+    if (isELFv2ABI && !IsPatchPoint)
+      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, hasNest,
+                    DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
+                    SPDiff, NumBytes, Ins, InVals, CS);
+}
+
+SDValue
+PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall, bool IsPatchPoint,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SDLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    ImmutableCallSite *CS) const {
+
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  bool isPPC64 = PtrVT == MVT::i64;
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence, the frame pointer will be used for dynamic
+  // stack allocation and for restoring the caller's stack pointer in this
+  // function's epilogue. This is done because, by tail calling, the called
+  // function might overwrite the value in this function's (MF) stack pointer
+  // stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // pre-reserved space for [SP][CR][LR][3 x unused].
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  unsigned NumBytes = LinkageSize;
+
+  // Add up all the space actually used.
+  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+  // they all go in registers, but we must reserve stack space for them for
+  // possible use by the caller.  In varargs or 64-bit calls, parameters are
+  // assigned stack space in order, with padding so Altivec parameters are
+  // 16-byte aligned.
+  unsigned nAltivecParamsAtEnd = 0;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    // Varargs Altivec parameters are padded to a 16 byte boundary.
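+    // (E.g. if NumBytes is 52 when such a parameter is reached, it is first
+    // rounded up to 64, then the parameter's 16 bytes are added.)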
+    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
+      if (!isVarArg && !isPPC64) {
+        // Non-varargs Altivec parameters go after all the non-Altivec
+        // parameters; handle those later so we know how much padding we need.
+        nAltivecParamsAtEnd++;
+        continue;
+      }
+      // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+      NumBytes = ((NumBytes+15)/16)*16;
+    }
+    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  }
+
+  // Allow for Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    NumBytes = ((NumBytes+15)/16)*16;
+    NumBytes += 16*nAltivecParamsAtEnd;
+  }
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if it is a
+  // varargs function. Because we cannot tell if this is needed on the caller
+  // side, we have to conservatively assume that it is needed. As such, make
+  // sure we have at least enough stack space for the caller to store the 8
+  // GPRs.
+  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+  // Tail call needs the stack to be aligned.
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so they can be moved somewhere
+  // else later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr;
+  if (isPPC64)
+    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+  else
+    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = LinkageSize;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR_32);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs = array_lengthof(VR);
+
+  const MCPhysReg *GPR = isPPC64 ?
GPR_64 : GPR_32; + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<TailCallArgumentInfo, 8> TailCallArguments; + + SmallVector<SDValue, 8> MemOpChains; + for (unsigned i = 0; i != NumOps; ++i) { + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + // PtrOff will be used to store the current argument to the stack if a + // register cannot be found for it. + SDValue PtrOff; + + PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); + + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + + // On PPC64, promote integers to 64-bit values. + if (isPPC64 && Arg.getValueType() == MVT::i32) { + // FIXME: Should this use ANY_EXTEND if neither sext nor zext? + unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); + } + + // FIXME memcpy is used way more than necessary. Correctness first. + // Note: "by value" is code for passing a structure by value, not + // basic types. + if (Flags.isByVal()) { + unsigned Size = Flags.getByValSize(); + // Very small objects are passed right-justified. Everything else is + // passed left-justified. + if (Size==1 || Size==2) { + EVT VT = (Size==1) ? MVT::i8 : MVT::i16; + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, + MachinePointerInfo(), VT, + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + + ArgOffset += PtrByteSize; + } else { + SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, + PtrOff.getValueType()); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + ArgOffset += PtrByteSize; + } + continue; + } + // Copy entire object into memory. There are cases where gcc-generated + // code assumes it is there, even if it could be put entirely into + // registers. (This is not what the doc says.) + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, + CallSeqStart, + Flags, DAG, dl); + + // For small aggregates (Darwin only) and aggregates >= PtrByteSize, + // copy the pieces of the object that fit into registers from the + // parameter save area. 
+ for (unsigned j=0; j<Size; j+=PtrByteSize) { + SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + ArgOffset += PtrByteSize; + } else { + ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; + break; + } + } + continue; + } + + switch (Arg.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + if (GPR_idx != NumGPRs) { + if (Arg.getValueType() == MVT::i1) + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg); + + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + } else { + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, false, MemOpChains, + TailCallArguments, dl); + } + ArgOffset += PtrByteSize; + break; + case MVT::f32: + case MVT::f64: + if (FPR_idx != NumFPRs) { + RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); + + if (isVarArg) { + SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0); + MemOpChains.push_back(Store); + + // Float varargs are always shadowed in available integer registers + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, + MachinePointerInfo(), false, false, + false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){ + SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); + SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + } else { + // If we have any FPRs remaining, we may also have GPRs remaining. + // Args passed in FPRs consume either 1 (f32) or 2 (f64) available + // GPRs. + if (GPR_idx != NumGPRs) + ++GPR_idx; + if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && + !isPPC64) // PPC64 has 64-bit GPR's obviously :) + ++GPR_idx; + } + } else + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, false, MemOpChains, + TailCallArguments, dl); + if (isPPC64) + ArgOffset += 8; + else + ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + if (isVarArg) { + // These go aligned on the stack, or in the corresponding R registers + // when within range. The Darwin PPC ABI doc claims they also go in + // V registers; in fact gcc does this only for arguments that are + // prototyped, not for those that match the ... We do it for all + // arguments, seems to work. + while (ArgOffset % 16 !=0) { + ArgOffset += PtrByteSize; + if (GPR_idx != NumGPRs) + GPR_idx++; + } + // We could elide this store in the case where the object fits + // entirely in R registers. Maybe later. 
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, + DAG.getConstant(ArgOffset, dl, PtrVT)); + SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0); + MemOpChains.push_back(Store); + if (VR_idx != NumVRs) { + SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); + } + ArgOffset += 16; + for (unsigned i=0; i<16; i+=PtrByteSize) { + if (GPR_idx == NumGPRs) + break; + SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, + DAG.getConstant(i, dl, PtrVT)); + SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + break; + } + + // Non-varargs Altivec params generally go in registers, but have + // stack space allocated at the end. + if (VR_idx != NumVRs) { + // Doesn't have GPR space allocated. + RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); + } else if (nAltivecParamsAtEnd==0) { + // We are emitting Altivec params in order. + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, true, MemOpChains, + TailCallArguments, dl); + ArgOffset += 16; + } + break; + } + } + // If all Altivec parameters fit in registers, as they usually do, + // they get stack space following the non-Altivec parameters. We + // don't track this here because nobody below needs it. + // If there are more Altivec parameters than fit in registers emit + // the stores here. + if (!isVarArg && nAltivecParamsAtEnd > NumVRs) { + unsigned j = 0; + // Offset is aligned; skip 1st 12 params which go in V registers. + ArgOffset = ((ArgOffset+15)/16)*16; + ArgOffset += 12*16; + for (unsigned i = 0; i != NumOps; ++i) { + SDValue Arg = OutVals[i]; + EVT ArgType = Outs[i].VT; + if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || + ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { + if (++j > NumVRs) { + SDValue PtrOff; + // We are emitting Altivec params in order. + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, true, MemOpChains, + TailCallArguments, dl); + ArgOffset += 16; + } + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + // On Darwin, R12 must contain the address of an indirect callee. This does + // not mean the MTCTR instruction must use R12; it's easier to model this as + // an extra parameter, so do that. + if (!isTailCall && + !isFunctionGlobalAddress(Callee) && + !isa<ExternalSymbolSDNode>(Callee) && + !isBLACompatibleAddress(Callee, DAG)) + RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : + PPC::R12), Callee)); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. 
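+  // Sketch of the DAG shape this produces, assuming two GPR arguments
+  // (register numbers are illustrative):
+  //   c1 = CopyToReg(Chain, R3, a0)         ; yields a chain and a glue value
+  //   c2 = CopyToReg(c1, R4, a1, c1.glue)   ; glue keeps the copies adjacent
+  // The final glue value is then fed to the call node below.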
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
+                    /* unused except on PPC64 ELFv1 */ false, DAG,
+                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+                    NumBytes, Ins, InVals, CS);
+}
+
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               SDLoc dl, SelectionDAG &DAG) const {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[i];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(
+    SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const {
+  SDLoc dl(Op);
+
+  // Get the correct type for integers.
+  EVT IntVT = Op.getValueType();
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNAREAOFFSET node.
+  SDValue Ops[2] = {Chain, FPSIdx};
+  SDVTList VTs = DAG.getVTList(IntVT);
+  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
+}
+
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+                                             const PPCSubtarget &Subtarget) const {
+  // When we pop the dynamic allocation we need to restore the SP link.
+  SDLoc dl(Op);
+
+  // Get the correct type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+
+  // Construct the stack pointer operand.
+  bool isPPC64 = Subtarget.isPPC64();
+  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
+  SDValue StackPtr = DAG.getRegister(SP, PtrVT);
+
+  // Get the operands for the STACKRESTORE.
+  SDValue Chain = Op.getOperand(0);
+  SDValue SaveSP = Op.getOperand(1);
+
+  // Load the old link SP.
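+  // The net effect, sketched as 32-bit assembly (the scratch register is
+  // whatever the allocator picks; r0 is illustrative):
+  //   lwz r0, 0(r1)    ; load the old back-chain word
+  //   mr  r1, SaveSP   ; restore the stack pointer
+  //   stw r0, 0(r1)    ; re-store the back chain at the new SP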
+  SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+
+  // Restore the stack pointer.
+  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
+
+  // Store the old link SP.
+  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo(),
+                      false, false, 0);
+}
+
+SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = Subtarget.isPPC64();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+
+  // Get the current return address save index.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int RASI = FI->getReturnAddrSaveIndex();
+
+  // If the return address save index hasn't been defined yet, allocate it.
+  if (!RASI) {
+    // Find out what the fixed offset of the return address save area is.
+    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
+    // Allocate the frame index for the return address save area.
+    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
+    // Save the result.
+    FI->setReturnAddrSaveIndex(RASI);
+  }
+  return DAG.getFrameIndex(RASI, PtrVT);
+}
+
+SDValue
+PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = Subtarget.isPPC64();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+
+  // Get the current frame pointer save index.  The users of this index will
+  // be primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet, allocate it.
+  if (!FPSI) {
+    // Find out what the fixed offset of the frame pointer save area is.
+    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
+    // Allocate the frame index for the frame pointer save area.
+    FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+  return DAG.getFrameIndex(FPSI, PtrVT);
+}
+
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                   SelectionDAG &DAG,
+                                                   const PPCSubtarget &Subtarget) const {
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  // Get the correct type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  // Negate the size.
+  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
+                                DAG.getConstant(0, dl, PtrVT), Size);
+  // Construct a node for the frame pointer save index.
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNALLOC node.
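+  // The size is negated because the stack grows downward: the eventual
+  // expansion of DYNALLOC updates the SP by NegSize while re-storing the
+  // back chain (e.g. via an update-form store such as stwux/stdux).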
+ SDValue Ops[3] = { Chain, NegSize, FPSIdx }; + SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); + return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); +} + +SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, + DAG.getVTList(MVT::i32, MVT::Other), + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return LowerVectorLoad(Op, DAG); + + assert(Op.getValueType() == MVT::i1 && + "Custom lowering only for i1 loads"); + + // First, load 8 bits into 32 bits, then truncate to 1 bit. + + SDLoc dl(Op); + LoadSDNode *LD = cast<LoadSDNode>(Op); + + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + MachineMemOperand *MMO = LD->getMemOperand(); + + SDValue NewLD = + DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, + BasePtr, MVT::i8, MMO); + SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); + + SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; + return DAG.getMergeValues(Ops, dl); +} + +SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + if (Op.getOperand(1).getValueType().isVector()) + return LowerVectorStore(Op, DAG); + + assert(Op.getOperand(1).getValueType() == MVT::i1 && + "Custom lowering only for i1 stores"); + + // First, zero extend to 32 bits, then use a truncating store to 8 bits. + + SDLoc dl(Op); + StoreSDNode *ST = cast<StoreSDNode>(Op); + + SDValue Chain = ST->getChain(); + SDValue BasePtr = ST->getBasePtr(); + SDValue Value = ST->getValue(); + MachineMemOperand *MMO = ST->getMemOperand(); + + Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), + Value); + return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); +} + +// FIXME: Remove this once the ANDI glue bug is fixed: +SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::i1 && + "Custom lowering only for i1 results"); + + SDLoc DL(Op); + return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, + Op.getOperand(0)); +} + +/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when +/// possible. +SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + // Not FP? Not a fsel. + if (!Op.getOperand(0).getValueType().isFloatingPoint() || + !Op.getOperand(2).getValueType().isFloatingPoint()) + return Op; + + // We might be able to do better than this under some circumstances, but in + // general, fsel-based lowering of select is a finite-math-only optimization. + // For more information, see section F.3 of the 2.06 ISA specification. + if (!DAG.getTarget().Options.NoInfsFPMath || + !DAG.getTarget().Options.NoNaNsFPMath) + return Op; + // TODO: Propagate flags from the select rather than global settings. 
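+  // As background for the cases below: fsel FRT,FRA,FRC,FRB computes
+  //   FRT = (FRA >= 0.0) ? FRC : FRB
+  // so with NaNs and infinities excluded (checked above), a select_cc like
+  //   select_cc setge LHS, RHS, TV, FV
+  // maps to fsel(LHS - RHS, TV, FV), and setlt is handled by first swapping
+  // TV and FV.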
+ SDNodeFlags Flags; + Flags.setNoInfs(true); + Flags.setNoNaNs(true); + + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + + EVT ResVT = Op.getValueType(); + EVT CmpVT = Op.getOperand(0).getValueType(); + SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); + SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); + SDLoc dl(Op); + + // If the RHS of the comparison is a 0.0, we don't need to do the + // subtraction at all. + SDValue Sel1; + if (isFloatingPointZero(RHS)) + switch (CC) { + default: break; // SETUO etc aren't handled by fsel. + case ISD::SETNE: + std::swap(TV, FV); + case ISD::SETEQ: + if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); + Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); + if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits + Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); + case ISD::SETULT: + case ISD::SETLT: + std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + case ISD::SETOGE: + case ISD::SETGE: + if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); + case ISD::SETUGT: + case ISD::SETGT: + std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + case ISD::SETOLE: + case ISD::SETLE: + if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); + } + + SDValue Cmp; + switch (CC) { + default: break; // SETUO etc aren't handled by fsel. 
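+  // Each case below materializes Cmp = LHS - RHS (or RHS - LHS) and feeds
+  // it to fsel; e.g. setle becomes fsel(RHS - LHS, TV, FV), since
+  // RHS - LHS >= 0.0 iff LHS <= RHS under the no-NaNs assumption.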
+ case ISD::SETNE: + std::swap(TV, FV); + case ISD::SETEQ: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits + Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); + case ISD::SETULT: + case ISD::SETLT: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); + case ISD::SETOGE: + case ISD::SETGE: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + case ISD::SETUGT: + case ISD::SETGT: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); + case ISD::SETOLE: + case ISD::SETLE: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + } + return Op; +} + +void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, + SelectionDAG &DAG, + SDLoc dl) const { + assert(Op.getOperand(0).getValueType().isFloatingPoint()); + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::f32) + Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); + + SDValue Tmp; + switch (Op.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); + case MVT::i32: + Tmp = DAG.getNode( + Op.getOpcode() == ISD::FP_TO_SINT + ? PPCISD::FCTIWZ + : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), + dl, MVT::f64, Src); + break; + case MVT::i64: + assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && + "i64 FP_TO_UINT is supported only with FPCVT"); + Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : + PPCISD::FCTIDUZ, + dl, MVT::f64, Src); + break; + } + + // Convert the FP value to an int value through memory. + bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && + (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); + SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); + int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + + // Emit a store to the stack slot. + SDValue Chain; + if (i32Stack) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); + SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; + Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, + DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); + } else + Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, + MPI, false, false, 0); + + // Result is a load from the stack slot. 
If loading 4 bytes, make sure to + // add in a bias. + if (Op.getValueType() == MVT::i32 && !i32Stack) { + FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, + DAG.getConstant(4, dl, FIPtr.getValueType())); + MPI = MPI.getWithOffset(4); + } + + RLI.Chain = Chain; + RLI.Ptr = FIPtr; + RLI.MPI = MPI; +} + +/// \brief Custom lowers floating point to integer conversions to use +/// the direct move instructions available in ISA 2.07 to avoid the +/// need for load/store combinations. +SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, + SelectionDAG &DAG, + SDLoc dl) const { + assert(Op.getOperand(0).getValueType().isFloatingPoint()); + SDValue Src = Op.getOperand(0); + + if (Src.getValueType() == MVT::f32) + Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); + + SDValue Tmp; + switch (Op.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); + case MVT::i32: + Tmp = DAG.getNode( + Op.getOpcode() == ISD::FP_TO_SINT + ? PPCISD::FCTIWZ + : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), + dl, MVT::f64, Src); + Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); + break; + case MVT::i64: + assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && + "i64 FP_TO_UINT is supported only with FPCVT"); + Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : + PPCISD::FCTIDUZ, + dl, MVT::f64, Src); + Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); + break; + } + return Tmp; +} + +SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, + SDLoc dl) const { + if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) + return LowerFP_TO_INTDirectMove(Op, DAG, dl); + + ReuseLoadInfo RLI; + LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); + + return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, false, + false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo, + RLI.Ranges); +} + +// We're trying to insert a regular store, S, and then a load, L. If the +// incoming value, O, is a load, we might just be able to have our load use the +// address used by O. However, we don't know if anything else will store to +// that address before we can load from it. To prevent this situation, we need +// to insert our load, L, into the chain as a peer of O. To do this, we give L +// the same chain operand as O, we create a token factor from the chain results +// of O and L, and we replace all uses of O's chain result with that token +// factor (see spliceIntoChain below for this last part). 
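+// Pictorially, if U1 and U2 were the users of O's chain result:
+//   before:  O.chain --> U1, U2
+//   after:   TokenFactor(O.chain, L.chain) --> U1, U2
+// so anything that had to execute after O is now also ordered after L.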
+bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, + ReuseLoadInfo &RLI, + SelectionDAG &DAG, + ISD::LoadExtType ET) const { + SDLoc dl(Op); + if (ET == ISD::NON_EXTLOAD && + (Op.getOpcode() == ISD::FP_TO_UINT || + Op.getOpcode() == ISD::FP_TO_SINT) && + isOperationLegalOrCustom(Op.getOpcode(), + Op.getOperand(0).getValueType())) { + + LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); + return true; + } + + LoadSDNode *LD = dyn_cast<LoadSDNode>(Op); + if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || + LD->isNonTemporal()) + return false; + if (LD->getMemoryVT() != MemVT) + return false; + + RLI.Ptr = LD->getBasePtr(); + if (LD->isIndexed() && LD->getOffset().getOpcode() != ISD::UNDEF) { + assert(LD->getAddressingMode() == ISD::PRE_INC && + "Non-pre-inc AM on PPC?"); + RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, + LD->getOffset()); + } + + RLI.Chain = LD->getChain(); + RLI.MPI = LD->getPointerInfo(); + RLI.IsInvariant = LD->isInvariant(); + RLI.Alignment = LD->getAlignment(); + RLI.AAInfo = LD->getAAInfo(); + RLI.Ranges = LD->getRanges(); + + RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); + return true; +} + +// Given the head of the old chain, ResChain, insert a token factor containing +// it and NewResChain, and make users of ResChain now be users of that token +// factor. +void PPCTargetLowering::spliceIntoChain(SDValue ResChain, + SDValue NewResChain, + SelectionDAG &DAG) const { + if (!ResChain) + return; + + SDLoc dl(NewResChain); + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + NewResChain, DAG.getUNDEF(MVT::Other)); + assert(TF.getNode() != NewResChain.getNode() && + "A new TF really is required here"); + + DAG.ReplaceAllUsesOfValueWith(ResChain, TF); + DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); +} + +/// \brief Custom lowers integer to floating point conversions to use +/// the direct move instructions available in ISA 2.07 to avoid the +/// need for load/store combinations. +SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, + SelectionDAG &DAG, + SDLoc dl) const { + assert((Op.getValueType() == MVT::f32 || + Op.getValueType() == MVT::f64) && + "Invalid floating point type as target of conversion"); + assert(Subtarget.hasFPCVT() && + "Int to FP conversions with direct moves require FPCVT"); + SDValue FP; + SDValue Src = Op.getOperand(0); + bool SinglePrec = Op.getValueType() == MVT::f32; + bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; + bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; + unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : + (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); + + if (WordInt) { + FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, + dl, MVT::f64, Src); + FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); + } + else { + FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); + FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); + } + + return FP; +} + +SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + + if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { + if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) + return SDValue(); + + SDValue Value = Op.getOperand(0); + // The values are now known to be -1 (false) or 1 (true). To convert this + // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 
+ // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 + Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); + + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64); + FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, + FPHalfs, FPHalfs); + + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + + if (Op.getValueType() != MVT::v4f64) + Value = DAG.getNode(ISD::FP_ROUND, dl, + Op.getValueType(), Value, + DAG.getIntPtrConstant(1, dl)); + return Value; + } + + // Don't handle ppc_fp128 here; let it be lowered to a libcall. + if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) + return SDValue(); + + if (Op.getOperand(0).getValueType() == MVT::i1) + return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), + DAG.getConstantFP(1.0, dl, Op.getValueType()), + DAG.getConstantFP(0.0, dl, Op.getValueType())); + + // If we have direct moves, we can do all the conversion, skip the store/load + // however, without FPCVT we can't do most conversions. + if (Subtarget.hasDirectMove() && Subtarget.isPPC64() && Subtarget.hasFPCVT()) + return LowerINT_TO_FPDirectMove(Op, DAG, dl); + + assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && + "UINT_TO_FP is supported only with FPCVT"); + + // If we have FCFIDS, then use it when converting to single-precision. + // Otherwise, convert to double-precision and then round. + unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) + ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS + : PPCISD::FCFIDS) + : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU + : PPCISD::FCFID); + MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) + ? MVT::f32 + : MVT::f64; + + if (Op.getOperand(0).getValueType() == MVT::i64) { + SDValue SINT = Op.getOperand(0); + // When converting to single-precision, we actually need to convert + // to double-precision first and then round to single-precision. + // To avoid double-rounding effects during that operation, we have + // to prepare the input operand. Bits that might be truncated when + // converting to double-precision are replaced by a bit that won't + // be lost at this stage, but is below the single-precision rounding + // position. + // + // However, if -enable-unsafe-fp-math is in effect, accept double + // rounding to avoid the extra overhead. + if (Op.getValueType() == MVT::f32 && + !Subtarget.hasFPCVT() && + !DAG.getTarget().Options.UnsafeFPMath) { + + // Twiddle input to make sure the low 11 bits are zero. (If this + // is the case, we are guaranteed the value will fit into the 53 bit + // mantissa of an IEEE double-precision value without rounding.) + // If any of those low 11 bits were not zero originally, make sure + // bit 12 (value 2048) is set instead, so that the final rounding + // to single-precision gets the correct result. + SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, + SINT, DAG.getConstant(2047, dl, MVT::i64)); + Round = DAG.getNode(ISD::ADD, dl, MVT::i64, + Round, DAG.getConstant(2047, dl, MVT::i64)); + Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); + Round = DAG.getNode(ISD::AND, dl, MVT::i64, + Round, DAG.getConstant(-2048, dl, MVT::i64)); + + // However, we cannot use that value unconditionally: if the magnitude + // of the input value is small, the bit-twiddling we did above might + // end up visibly changing the output. 
Fortunately, in that case, we + // don't need to twiddle bits since the original input will convert + // exactly to double-precision floating-point already. Therefore, + // construct a conditional to use the original value if the top 11 + // bits are all sign-bit copies, and use the rounded value computed + // above otherwise. + SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, + SINT, DAG.getConstant(53, dl, MVT::i32)); + Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, + Cond, DAG.getConstant(1, dl, MVT::i64)); + Cond = DAG.getSetCC(dl, MVT::i32, + Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); + + SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); + } + + ReuseLoadInfo RLI; + SDValue Bits; + + MachineFunction &MF = DAG.getMachineFunction(); + if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { + Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, false, + false, RLI.IsInvariant, RLI.Alignment, RLI.AAInfo, + RLI.Ranges); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + } else if (Subtarget.hasLFIWAX() && + canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; + Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, + DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + } else if (Subtarget.hasFPCVT() && + canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; + Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, + DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); + } else if (((Subtarget.hasLFIWAX() && + SINT.getOpcode() == ISD::SIGN_EXTEND) || + (Subtarget.hasFPCVT() && + SINT.getOpcode() == ISD::ZERO_EXTEND)) && + SINT.getOperand(0).getValueType() == MVT::i32) { + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + + int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); + + assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && + "Expected an i32 store"); + + RLI.Ptr = FIdx; + RLI.Chain = Store; + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + RLI.Alignment = 4; + + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; + Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? 
+ PPCISD::LFIWZX : PPCISD::LFIWAX, + dl, DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + } else + Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); + + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); + + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) + FP = DAG.getNode(ISD::FP_ROUND, dl, + MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); + return FP; + } + + assert(Op.getOperand(0).getValueType() == MVT::i32 && + "Unhandled INT_TO_FP type in custom expander!"); + // Since we only generate this in 64-bit mode, we can take advantage of + // 64-bit registers. In particular, sign extend the input value into the + // 64-bit register with extsw, store the WHOLE 64-bit value into the stack + // then lfd it and fcfid it. + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + + SDValue Ld; + if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { + ReuseLoadInfo RLI; + bool ReusingLoad; + if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, + DAG))) { + int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); + + assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && + "Expected an i32 store"); + + RLI.Ptr = FIdx; + RLI.Chain = Store; + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + RLI.Alignment = 4; + } + + MachineMemOperand *MMO = + MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, + RLI.Alignment, RLI.AAInfo, RLI.Ranges); + SDValue Ops[] = { RLI.Chain, RLI.Ptr }; + Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? + PPCISD::LFIWZX : PPCISD::LFIWAX, + dl, DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + if (ReusingLoad) + spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); + } else { + assert(Subtarget.isPPC64() && + "i32->FP without LFIWAX supported only on PPC64"); + + int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, + Op.getOperand(0)); + + // STD the extended value into the stack slot. + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Ext64, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); + + // Load the value as a double. + Ld = DAG.getLoad( + MVT::f64, dl, Store, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, false, 0); + } + + // FCFID it and return it. 
+  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
+  if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
+    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
+                     DAG.getIntPtrConstant(0, dl));
+  return FP;
+}
+
+SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  /*
+   The rounding mode is in bits 30:31 of FPSCR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to 0
+     10 Round to +inf
+     11 Round to -inf
+
+   FLT_ROUNDS, on the other hand, expects the following:
+     -1 Undefined
+      0 Round to 0
+      1 Round to nearest
+      2 Round to +inf
+      3 Round to -inf
+
+   To perform the conversion, we do:
+     ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
+   which maps 0 -> 1, 1 -> 0, 2 -> 2, and 3 -> 3, as required.
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT VT = Op.getValueType();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+
+  // Save FP Control Word to register
+  EVT NodeTys[] = {
+    MVT::f64,    // return register
+    MVT::Glue    // unused in this context
+  };
+  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
+
+  // Save FP register to stack slot
+  int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain,
+                               StackSlot, MachinePointerInfo(), false, false, 0);
+
+  // Load FP Control Word from low 32 bits of stack slot.
+  SDValue Four = DAG.getConstant(4, dl, PtrVT);
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo(),
+                            false, false, false, 0);
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::AND, dl, MVT::i32,
+                CWD, DAG.getConstant(3, dl, MVT::i32));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, dl, MVT::i32,
+                DAG.getNode(ISD::AND, dl, MVT::i32,
+                            DAG.getNode(ISD::XOR, dl, MVT::i32,
+                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
+                            DAG.getConstant(3, dl, MVT::i32)),
+                DAG.getConstant(1, dl, MVT::i32));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  SDLoc dl(Op);
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SHL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
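+  // Concretely, with BitWidth == 32 the pieces computed below are:
+  //   OutHi = (Hi << Amt) | (Lo >> (32 - Amt)) | (Lo << (Amt - 32))
+  //   OutLo = Lo << Amt
+  // Because the PPCISD shift nodes produce 0 for (6-bit) shift amounts in
+  // [32, 63] instead of being undefined, the out-of-range terms drop out;
+  // e.g. for Amt == 40 only Lo << 8 survives into OutHi.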
+ SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + EVT AmtVT = Amt.getValueType(); + + SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, + DAG.getConstant(BitWidth, dl, AmtVT), Amt); + SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); + SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); + SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); + SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, + DAG.getConstant(-BitWidth, dl, AmtVT)); + SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); + SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); + SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); + SDValue OutOps[] = { OutLo, OutHi }; + return DAG.getMergeValues(OutOps, dl); +} + +SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned BitWidth = VT.getSizeInBits(); + assert(Op.getNumOperands() == 3 && + VT == Op.getOperand(1).getValueType() && + "Unexpected SRL!"); + + // Expand into a bunch of logical ops. Note that these ops + // depend on the PPC behavior for oversized shift amounts. + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + EVT AmtVT = Amt.getValueType(); + + SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, + DAG.getConstant(BitWidth, dl, AmtVT), Amt); + SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); + SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); + SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); + SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, + DAG.getConstant(-BitWidth, dl, AmtVT)); + SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); + SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); + SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); + SDValue OutOps[] = { OutLo, OutHi }; + return DAG.getMergeValues(OutOps, dl); +} + +SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + assert(Op.getNumOperands() == 3 && + VT == Op.getOperand(1).getValueType() && + "Unexpected SRA!"); + + // Expand into a bunch of logical ops, followed by a select_cc. + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + EVT AmtVT = Amt.getValueType(); + + SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, + DAG.getConstant(BitWidth, dl, AmtVT), Amt); + SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); + SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); + SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); + SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, + DAG.getConstant(-BitWidth, dl, AmtVT)); + SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); + SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); + SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), + Tmp4, Tmp6, ISD::SETLE); + SDValue OutOps[] = { OutLo, OutHi }; + return DAG.getMergeValues(OutOps, dl); +} + +//===----------------------------------------------------------------------===// +// Vector related lowering. +// + +/// BuildSplatI - Build a canonical splati of Val with an element size of +/// SplatSize. Cast the result to VT. 
+static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, + SelectionDAG &DAG, SDLoc dl) { + assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); + + static const MVT VTys[] = { // canonical VT to use for each size. + MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 + }; + + EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; + + // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. + if (Val == -1) + SplatSize = 1; + + EVT CanonicalVT = VTys[SplatSize-1]; + + // Build a canonical splat for this value. + SDValue Elt = DAG.getConstant(Val, dl, MVT::i32); + SmallVector<SDValue, 8> Ops; + Ops.assign(CanonicalVT.getVectorNumElements(), Elt); + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, Ops); + return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res); +} + +/// BuildIntrinsicOp - Return a unary operator intrinsic node with the +/// specified intrinsic ID. +static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, + SelectionDAG &DAG, SDLoc dl, + EVT DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = Op.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, + DAG.getConstant(IID, dl, MVT::i32), Op); +} + +/// BuildIntrinsicOp - Return a binary operator intrinsic node with the +/// specified intrinsic ID. +static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, + SelectionDAG &DAG, SDLoc dl, + EVT DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = LHS.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, + DAG.getConstant(IID, dl, MVT::i32), LHS, RHS); +} + +/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the +/// specified intrinsic ID. +static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, + SDValue Op2, SelectionDAG &DAG, + SDLoc dl, EVT DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = Op0.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, + DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); +} + +/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified +/// amount. The result has the specified value type. +static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, + EVT VT, SelectionDAG &DAG, SDLoc dl) { + // Force LHS/RHS to be the right type. + LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); + RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); + + int Ops[16]; + for (unsigned i = 0; i != 16; ++i) + Ops[i] = i + Amt; + SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); + return DAG.getNode(ISD::BITCAST, dl, VT, T); +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. If we CAN select this case, and if it +// selects to a single instruction, return Op. Otherwise, if we can codegen +// this case more efficiently than a constant pool load, lower it to the +// sequence of ops that should be used. +SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); + + if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { + // We first build an i32 vector, load it into a QPX register, + // then convert it to a floating-point vector and compare it + // to a zero vector to get the boolean result. 
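+    // In outline (the all-constant case below instead takes a shortcut
+    // through the constant pool):
+    //   1. store the four i32 elements to a 16-byte stack slot,
+    //   2. qvlfiwz  - load them into a QPX register as unsigned integers,
+    //   3. qvfcfidu - convert them to floating point, and
+    //   4. compare against a zero vector (SETEQ) to form the v4i1 result.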
+ MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + assert(BVN->getNumOperands() == 4 && + "BUILD_VECTOR for v4i1 does not have 4 operands"); + + bool IsConst = true; + for (unsigned i = 0; i < 4; ++i) { + if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (!isa<ConstantSDNode>(BVN->getOperand(i))) { + IsConst = false; + break; + } + } + + if (IsConst) { + Constant *One = + ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); + Constant *NegOne = + ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); + + SmallVector<Constant*, 4> CV(4, NegOne); + for (unsigned i = 0; i < 4; ++i) { + if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) + CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); + else if (isNullConstant(BVN->getOperand(i))) + continue; + else + CV[i] = One; + } + + Constant *CP = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), + 16 /* alignment */); + + SmallVector<SDValue, 2> Ops; + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(CPIdx); + + SmallVector<EVT, 2> ValueVTs; + ValueVTs.push_back(MVT::v4i1); + ValueVTs.push_back(MVT::Other); // chain + SDVTList VTs = DAG.getVTList(ValueVTs); + + return DAG.getMemIntrinsicNode( + PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); + } + + SmallVector<SDValue, 4> Stores; + for (unsigned i = 0; i < 4; ++i) { + if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) continue; + + unsigned Offset = 4*i; + SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); + + unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); + if (StoreSize > 4) { + Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl, + BVN->getOperand(i), Idx, + PtrInfo.getWithOffset(Offset), + MVT::i32, false, false, 0)); + } else { + SDValue StoreValue = BVN->getOperand(i); + if (StoreSize < 4) + StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); + + Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, + StoreValue, Idx, + PtrInfo.getWithOffset(Offset), + false, false, 0)); + } + } + + SDValue StoreChain; + if (!Stores.empty()) + StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); + else + StoreChain = DAG.getEntryNode(); + + // Now load from v4i32 into the QPX register; this will extend it to + // v4i64 but not yet convert it to a floating point. Nevertheless, this + // is typed as v4f64 because the QPX register integer states are not + // explicitly represented. 
+ + SmallVector<SDValue, 2> Ops; + Ops.push_back(StoreChain); + Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32)); + Ops.push_back(FIdx); + + SmallVector<EVT, 2> ValueVTs; + ValueVTs.push_back(MVT::v4f64); + ValueVTs.push_back(MVT::Other); // chain + SDVTList VTs = DAG.getVTList(ValueVTs); + + SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, + dl, VTs, Ops, MVT::v4i32, PtrInfo); + LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, + DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), + LoadedVect); + + SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::f64); + FPZeros = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, + FPZeros, FPZeros, FPZeros, FPZeros); + + return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); + } + + // All other QPX vectors are handled by generic code. + if (Subtarget.hasQPX()) + return SDValue(); + + // Check if this is a splat of a constant value. + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || + SplatBitSize > 32) + return SDValue(); + + unsigned SplatBits = APSplatBits.getZExtValue(); + unsigned SplatUndef = APSplatUndef.getZExtValue(); + unsigned SplatSize = SplatBitSize / 8; + + // First, handle single instruction cases. + + // All zeros? + if (SplatBits == 0) { + // Canonicalize all zero vectors to be v4i32. + if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { + SDValue Z = DAG.getConstant(0, dl, MVT::i32); + Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z); + Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z); + } + return Op; + } + + // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. + int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> + (32-SplatBitSize)); + if (SextVal >= -16 && SextVal <= 15) + return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); + + // Two instruction sequences. + + // If this value is in the range [-32,30] and is even, use: + // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) + // If this value is in the range [17,31] and is odd, use: + // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) + // If this value is in the range [-31,-17] and is odd, use: + // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) + // Note the last two are three-instruction sequences. + if (SextVal >= -32 && SextVal <= 31) { + // To avoid having these optimizations undone by constant folding, + // we convert to a pseudo that will be expanded later into one of + // the above forms. + SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32); + EVT VT = (SplatSize == 1 ? MVT::v16i8 : + (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); + SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32); + SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + if (VT == Op.getValueType()) + return RetVal; + else + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); + } + + // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is + // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important + // for fneg/fabs. + if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { + // Make -1 and vspltisw -1: + SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); + + // Make the VSLW intrinsic, computing 0x8000_0000. + SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, + OnesV, DAG, dl); + + // xor by OnesV to invert it. 
+    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+  }
+
+  // Check to see if this is a wide variety of vsplti*, binop self cases.
+  static const signed char SplatCsts[] = {
+    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+  };
+
+  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
+    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+    // cases which are ambiguous (e.g. formation of 0x8000_0000); 'vsplti -1'
+    // is placed first in the array for this reason.
+    int i = SplatCsts[idx];
+
+    // Figure out what shift amount will be used by altivec if shifted by i in
+    // this splat size.
+    unsigned TypeShiftAmt = i & (SplatBitSize-1);
+
+    // vsplti + shl self.
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+        Intrinsic::ppc_altivec_vslw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + srl self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+        Intrinsic::ppc_altivec_vsrw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + sra self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
+        Intrinsic::ppc_altivec_vsraw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + rol self.
+    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+        Intrinsic::ppc_altivec_vrlw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // t = vsplti c, result = vsldoi t, t, 1
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 2
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 3
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ?
13 : 3; + return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); + } + } + + return SDValue(); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + + enum { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VMRGHW, + OP_VMRGLW, + OP_VSPLTISW0, + OP_VSPLTISW1, + OP_VSPLTISW2, + OP_VSPLTISW3, + OP_VSLDOI4, + OP_VSLDOI8, + OP_VSLDOI12 + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1*9+2)*9+3) return LHS; + assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + + int ShufIdxs[16]; + switch (OpNum) { + default: llvm_unreachable("Unknown i32 permute!"); + case OP_VMRGHW: + ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; + ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; + ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; + ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; + break; + case OP_VMRGLW: + ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; + ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; + ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; + ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; + break; + case OP_VSPLTISW0: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+0; + break; + case OP_VSPLTISW1: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+4; + break; + case OP_VSPLTISW2: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+8; + break; + case OP_VSPLTISW3: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+12; + break; + case OP_VSLDOI4: + return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); + case OP_VSLDOI8: + return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); + case OP_VSLDOI12: + return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); + } + EVT VT = OpLHS.getValueType(); + OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); + OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); + SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs); + return DAG.getNode(ISD::BITCAST, dl, VT, T); +} + +/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this +/// is a shuffle we can handle in a single instruction, return it. Otherwise, +/// return the code it can be lowered into. Worst case, it can always be +/// lowered into a vperm. 
+SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + EVT VT = Op.getValueType(); + bool isLittleEndian = Subtarget.isLittleEndian(); + + if (Subtarget.hasQPX()) { + if (VT.getVectorNumElements() != 4) + return SDValue(); + + if (V2.getOpcode() == ISD::UNDEF) V2 = V1; + + int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); + if (AlignIdx != -1) { + return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, + DAG.getConstant(AlignIdx, dl, MVT::i32)); + } else if (SVOp->isSplat()) { + int SplatIdx = SVOp->getSplatIndex(); + if (SplatIdx >= 4) { + std::swap(V1, V2); + SplatIdx -= 4; + } + + // FIXME: If SplatIdx == 0 and the input came from a load, then there is + // nothing to do. + + return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, + DAG.getConstant(SplatIdx, dl, MVT::i32)); + } + + // Lower this into a qvgpci/qvfperm pair. + + // Compute the qvgpci literal + unsigned idx = 0; + for (unsigned i = 0; i < 4; ++i) { + int m = SVOp->getMaskElt(i); + unsigned mm = m >= 0 ? (unsigned) m : i; + idx |= mm << (3-i)*3; + } + + SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, + DAG.getConstant(idx, dl, MVT::i32)); + return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); + } + + // Cases that are handled by instructions that take permute immediates + // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be + // selected by the instruction selector. + if (V2.getOpcode() == ISD::UNDEF) { + if (PPC::isSplatShuffleMask(SVOp, 1) || + PPC::isSplatShuffleMask(SVOp, 2) || + PPC::isSplatShuffleMask(SVOp, 4) || + PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) || + (Subtarget.hasP8Altivec() && ( + PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) || + PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) || + PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) { + return Op; + } + } + + // Altivec has a variety of "shuffle immediates" that take two vector inputs + // and produce a fixed permutation. If any of these match, do not lower to + // VPERM. + unsigned int ShuffleKind = isLittleEndian ? 2 : 0; + if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) || + (Subtarget.hasP8Altivec() && ( + PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) || + PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG)))) + return Op; + + // Check to see if this is a shuffle of 4-byte values. If so, we can use our + // perfect shuffle table to emit an optimal matching sequence. 
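+  // Each PFIndexes[i] computed below is the 4-byte source element (0-7)
+  // feeding result element i, with 8 meaning undef; the four values form a
+  // base-9 index into PerfectShuffleTable, whose entries pack (as decoded in
+  // GeneratePerfectShuffle above) a cost in bits 31:30, an opcode in bits
+  // 29:26, and two 13-bit operand IDs in the low bits.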
+ ArrayRef<int> PermMask = SVOp->getMask(); + + unsigned PFIndexes[4]; + bool isFourElementShuffle = true; + for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number + unsigned EltNo = 8; // Start out undef. + for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. + if (PermMask[i*4+j] < 0) + continue; // Undef, ignore it. + + unsigned ByteSource = PermMask[i*4+j]; + if ((ByteSource & 3) != j) { + isFourElementShuffle = false; + break; + } + + if (EltNo == 8) { + EltNo = ByteSource/4; + } else if (EltNo != ByteSource/4) { + isFourElementShuffle = false; + break; + } + } + PFIndexes[i] = EltNo; + } + + // If this shuffle can be expressed as a shuffle of 4-byte elements, use the + // perfect shuffle vector to determine if it is cost effective to do this as + // discrete instructions, or whether we should use a vperm. + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle && !isLittleEndian) { + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + // Determining when to avoid vperm is tricky. Many things affect the cost + // of vperm, particularly how many times the perm mask needs to be computed. + // For example, if the perm mask can be hoisted out of a loop or is already + // used (perhaps because there are multiple permutes with the same shuffle + // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of + // the loop requires an extra register. + // + // As a compromise, we only emit discrete instructions if the shuffle can be + // generated in 3 or fewer operations. When we have loop information + // available, if this block is within a loop, we should avoid using vperm + // for 3-operation perms and use a constant pool load instead. + if (Cost < 3) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant + // vector that will get spilled to the constant pool. + if (V2.getOpcode() == ISD::UNDEF) V2 = V1; + + // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except + // that it is in input element units, not in bytes. Convert now. + + // For little endian, the order of the input vectors is reversed, and + // the permutation mask is complemented with respect to 31. This is + // necessary to produce proper semantics with the big-endian-biased vperm + // instruction. + EVT EltVT = V1.getValueType().getVectorElementType(); + unsigned BytesPerElement = EltVT.getSizeInBits()/8; + + SmallVector<SDValue, 16> ResultMask; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; + + for (unsigned j = 0; j != BytesPerElement; ++j) + if (isLittleEndian) + ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), + dl, MVT::i32)); + else + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, + MVT::i32)); + } + + SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, + ResultMask); + if (isLittleEndian) + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V2, V1, VPermMask); + else + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V1, V2, VPermMask); +} + +/// getVectorCompareInfo - Given an intrinsic, return false if it is not a +/// vector comparison. 
If it is, return true and fill in Opc/isDot with +/// information about the intrinsic. +static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, + bool &isDot, const PPCSubtarget &Subtarget) { + unsigned IntrinsicID = + cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); + CompareOpc = -1; + isDot = false; + switch (IntrinsicID) { + default: return false; + // Comparison predicates. + case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequd_p: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 199; + isDot = 1; + } else + return false; + + break; + case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsd_p: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 967; + isDot = 1; + } else + return false; + + break; + case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtud_p: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 711; + isDot = 1; + } else + return false; + + break; + // VSX predicate comparisons use the same infrastructure + case Intrinsic::ppc_vsx_xvcmpeqdp_p: + case Intrinsic::ppc_vsx_xvcmpgedp_p: + case Intrinsic::ppc_vsx_xvcmpgtdp_p: + case Intrinsic::ppc_vsx_xvcmpeqsp_p: + case Intrinsic::ppc_vsx_xvcmpgesp_p: + case Intrinsic::ppc_vsx_xvcmpgtsp_p: + if (Subtarget.hasVSX()) { + switch (IntrinsicID) { + case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; + case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; + case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; + case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; + case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; + case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; + } + isDot = 1; + } + else + return false; + + break; + + // Normal Comparisons. 
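+  // (Unlike the *_p predicate forms above, these return the comparison result
+  // vector itself rather than setting CR6, so isDot stays 0.)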
+ case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequd: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 199; + isDot = 0; + } else + return false; + + break; + case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsd: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 967; + isDot = 0; + } else + return false; + + break; + case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtud: + if (Subtarget.hasP8Altivec()) { + CompareOpc = 711; + isDot = 0; + } else + return false; + + break; + } + return true; +} + +/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom +/// lower, do it, otherwise return null. +SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + // If this is a lowered altivec predicate compare, CompareOpc is set to the + // opcode number of the comparison. + SDLoc dl(Op); + int CompareOpc; + bool isDot; + if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) + return SDValue(); // Don't custom lower most intrinsics. + + // If this is a non-dot comparison, make the VCMP node and we are done. + if (!isDot) { + SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(CompareOpc, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); + } + + // Create the PPCISD altivec 'dot' comparison node. + SDValue Ops[] = { + Op.getOperand(2), // LHS + Op.getOperand(3), // RHS + DAG.getConstant(CompareOpc, dl, MVT::i32) + }; + EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; + SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); + + // Now that we have the comparison, emit a copy from the CR to a GPR. + // This is flagged to the above dot comparison. + SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, + DAG.getRegister(PPC::CR6, MVT::i32), + CompNode.getValue(1)); + + // Unpack the result based on how the target uses it. + unsigned BitNo; // Bit # of CR6. + bool InvertBit; // Invert result? + switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) { + default: // Can't happen, don't crash on invalid number though. + case 0: // Return the value of the EQ bit of CR6. + BitNo = 0; InvertBit = false; + break; + case 1: // Return the inverted value of the EQ bit of CR6. + BitNo = 0; InvertBit = true; + break; + case 2: // Return the value of the LT bit of CR6. + BitNo = 2; InvertBit = false; + break; + case 3: // Return the inverted value of the LT bit of CR6. + BitNo = 2; InvertBit = true; + break; + } + + // Shift the bit into the low position. 
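+  // After the MFOCRF, the four bits of CR6 (LT, GT, EQ, SO) sit at bit
+  // positions 7..4 of the GPR (counting from the LSB), so shifting right by
+  // 8 - (3 - BitNo) == 5 + BitNo brings EQ (BitNo == 0) or LT (BitNo == 2)
+  // into bit 0.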
+  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
+                      DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
+  // Isolate the bit.
+  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
+                      DAG.getConstant(1, dl, MVT::i32));
+
+  // If we are supposed to, toggle the bit.
+  if (InvertBit)
+    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+                        DAG.getConstant(1, dl, MVT::i32));
+  return Flags;
+}
+
+SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // For v2i64 (VSX), we can pattern match the v2i32 case (using fp <-> int
+  // instructions), but for smaller types, we need to first extend up to v2i32
+  // before going further.
+  if (Op.getValueType() == MVT::v2i64) {
+    EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    if (ExtVT != MVT::v2i32) {
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
+                       DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
+                                        ExtVT.getVectorElementType(), 4)));
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op);
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
+                       DAG.getValueType(MVT::v2i32));
+    }
+
+    return Op;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // Create a stack slot that is 16-byte aligned.
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  // Store the input value into Value#0 of the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+                               Op.getOperand(0), FIdx, MachinePointerInfo(),
+                               false, false, 0);
+  // Load it out.
+  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDNode *N = Op.getNode();
+
+  assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
+         "Unknown extract_vector_elt type");
+
+  SDValue Value = N->getOperand(0);
+
+  // The first part of this is like the store lowering except that we don't
+  // need to track the chain.
+
+  // The values are now known to be -1 (false) or 1 (true). To convert this
+  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
+  // understand how to form the extending load.
+  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64);
+  FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
+                        FPHalfs, FPHalfs, FPHalfs, FPHalfs);
+
+  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+  // Now convert to an integer and store.
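+  // (qvfctiwu converts each f64 lane to an unsigned 32-bit integer value;
+  // qvstfiw then stores the four converted words to the 16-byte stack slot.)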
+ Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, + DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), + Value); + + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue StoreChain = DAG.getEntryNode(); + SmallVector<SDValue, 2> Ops; + Ops.push_back(StoreChain); + Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32)); + Ops.push_back(Value); + Ops.push_back(FIdx); + + SmallVector<EVT, 2> ValueVTs; + ValueVTs.push_back(MVT::Other); // chain + SDVTList VTs = DAG.getVTList(ValueVTs); + + StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, + dl, VTs, Ops, MVT::v4i32, PtrInfo); + + // Extract the value requested. + unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); + + SDValue IntVal = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, + PtrInfo.getWithOffset(Offset), + false, false, false, 0); + + if (!Subtarget.useCRBits()) + return IntVal; + + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); +} + +/// Lowering for QPX v4i1 loads +SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + LoadSDNode *LN = cast<LoadSDNode>(Op.getNode()); + SDValue LoadChain = LN->getChain(); + SDValue BasePtr = LN->getBasePtr(); + + if (Op.getValueType() == MVT::v4f64 || + Op.getValueType() == MVT::v4f32) { + EVT MemVT = LN->getMemoryVT(); + unsigned Alignment = LN->getAlignment(); + + // If this load is properly aligned, then it is legal. 
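+    // Otherwise, scalarize it below: split the access into four per-element
+    // loads (extending each element if the memory type is narrower) and
+    // rebuild the vector with a BUILD_VECTOR.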
+ if (Alignment >= MemVT.getStoreSize()) + return Op; + + EVT ScalarVT = Op.getValueType().getScalarType(), + ScalarMemVT = MemVT.getScalarType(); + unsigned Stride = ScalarMemVT.getStoreSize(); + + SmallVector<SDValue, 8> Vals, LoadChains; + for (unsigned Idx = 0; Idx < 4; ++Idx) { + SDValue Load; + if (ScalarVT != ScalarMemVT) + Load = + DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, + BasePtr, + LN->getPointerInfo().getWithOffset(Idx*Stride), + ScalarMemVT, LN->isVolatile(), LN->isNonTemporal(), + LN->isInvariant(), MinAlign(Alignment, Idx*Stride), + LN->getAAInfo()); + else + Load = + DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, + LN->getPointerInfo().getWithOffset(Idx*Stride), + LN->isVolatile(), LN->isNonTemporal(), + LN->isInvariant(), MinAlign(Alignment, Idx*Stride), + LN->getAAInfo()); + + if (Idx == 0 && LN->isIndexed()) { + assert(LN->getAddressingMode() == ISD::PRE_INC && + "Unknown addressing mode on vector load"); + Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), + LN->getAddressingMode()); + } + + Vals.push_back(Load); + LoadChains.push_back(Load.getValue(1)); + + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(Stride, dl, + BasePtr.getValueType())); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); + SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl, + Op.getValueType(), Vals); + + if (LN->isIndexed()) { + SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; + return DAG.getMergeValues(RetOps, dl); + } + + SDValue RetOps[] = { Value, TF }; + return DAG.getMergeValues(RetOps, dl); + } + + assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); + assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); + + // To lower v4i1 from a byte array, we load the byte elements of the + // vector and then reuse the BUILD_VECTOR logic. + + SmallVector<SDValue, 4> VectElmts, VectElmtChains; + for (unsigned i = 0; i < 4; ++i) { + SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); + + VectElmts.push_back(DAG.getExtLoad(ISD::EXTLOAD, + dl, MVT::i32, LoadChain, Idx, + LN->getPointerInfo().getWithOffset(i), + MVT::i8 /* memory type */, + LN->isVolatile(), LN->isNonTemporal(), + LN->isInvariant(), + 1 /* alignment */, LN->getAAInfo())); + VectElmtChains.push_back(VectElmts[i].getValue(1)); + } + + LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); + SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i1, VectElmts); + + SDValue RVals[] = { Value, LoadChain }; + return DAG.getMergeValues(RVals, dl); +} + +/// Lowering for QPX v4i1 stores +SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + StoreSDNode *SN = cast<StoreSDNode>(Op.getNode()); + SDValue StoreChain = SN->getChain(); + SDValue BasePtr = SN->getBasePtr(); + SDValue Value = SN->getValue(); + + if (Value.getValueType() == MVT::v4f64 || + Value.getValueType() == MVT::v4f32) { + EVT MemVT = SN->getMemoryVT(); + unsigned Alignment = SN->getAlignment(); + + // If this store is properly aligned, then it is legal. 
+ if (Alignment >= MemVT.getStoreSize()) + return Op; + + EVT ScalarVT = Value.getValueType().getScalarType(), + ScalarMemVT = MemVT.getScalarType(); + unsigned Stride = ScalarMemVT.getStoreSize(); + + SmallVector<SDValue, 8> Stores; + for (unsigned Idx = 0; Idx < 4; ++Idx) { + SDValue Ex = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, + DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Store; + if (ScalarVT != ScalarMemVT) + Store = + DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, + SN->getPointerInfo().getWithOffset(Idx*Stride), + ScalarMemVT, SN->isVolatile(), SN->isNonTemporal(), + MinAlign(Alignment, Idx*Stride), SN->getAAInfo()); + else + Store = + DAG.getStore(StoreChain, dl, Ex, BasePtr, + SN->getPointerInfo().getWithOffset(Idx*Stride), + SN->isVolatile(), SN->isNonTemporal(), + MinAlign(Alignment, Idx*Stride), SN->getAAInfo()); + + if (Idx == 0 && SN->isIndexed()) { + assert(SN->getAddressingMode() == ISD::PRE_INC && + "Unknown addressing mode on vector store"); + Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), + SN->getAddressingMode()); + } + + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(Stride, dl, + BasePtr.getValueType())); + Stores.push_back(Store); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); + + if (SN->isIndexed()) { + SDValue RetOps[] = { TF, Stores[0].getValue(1) }; + return DAG.getMergeValues(RetOps, dl); + } + + return TF; + } + + assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); + assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); + + // The values are now known to be -1 (false) or 1 (true). To convert this + // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). + // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 + Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); + + // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to + // understand how to form the extending load. + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64); + FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, + FPHalfs, FPHalfs, FPHalfs, FPHalfs); + + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + + // Now convert to an integer and store. + Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, + DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), + Value); + + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SmallVector<SDValue, 2> Ops; + Ops.push_back(StoreChain); + Ops.push_back(DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32)); + Ops.push_back(Value); + Ops.push_back(FIdx); + + SmallVector<EVT, 2> ValueVTs; + ValueVTs.push_back(MVT::Other); // chain + SDVTList VTs = DAG.getVTList(ValueVTs); + + StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, + dl, VTs, Ops, MVT::v4i32, PtrInfo); + + // Move data into the byte array. 
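+  // (Read the four converted words back out of the stack slot, then truncate
+  // each one to a byte and store it to the destination, below.)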
+  SmallVector<SDValue, 4> Loads, LoadChains;
+  for (unsigned i = 0; i < 4; ++i) {
+    unsigned Offset = 4*i;
+    SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+    Loads.push_back(DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
+                                PtrInfo.getWithOffset(Offset),
+                                false, false, false, 0));
+    LoadChains.push_back(Loads[i].getValue(1));
+  }
+
+  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+
+  SmallVector<SDValue, 4> Stores;
+  for (unsigned i = 0; i < 4; ++i) {
+    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
+
+    Stores.push_back(DAG.getTruncStore(
+        StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
+        MVT::i8 /* memory type */, SN->isVolatile(), SN->isNonTemporal(),
+        1 /* alignment */, SN->getAAInfo()));
+  }
+
+  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+  return StoreChain;
+}
+
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  if (Op.getValueType() == MVT::v4i32) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
+    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl); // +16 as shift amt.
+
+    SDValue RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
+
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                      LHS, RHS, DAG, dl, MVT::v4i32);
+
+    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+                              Neg16, DAG, dl);
+    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+                            LHS, RHS, Zero, DAG, dl);
+  } else if (Op.getValueType() == MVT::v16i8) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    bool isLittleEndian = Subtarget.isLittleEndian();
+
+    // Multiply the even 8-bit parts, producing 16-bit sums.
+    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+                                         LHS, RHS, DAG, dl, MVT::v8i16);
+    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
+
+    // Multiply the odd 8-bit parts, producing 16-bit sums.
+    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+                                        LHS, RHS, DAG, dl, MVT::v8i16);
+    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
+
+    // Merge the results together. Because vmuleub and vmuloub are
+    // instructions with a big-endian bias, we must reverse the
+    // element numbering and reverse the meaning of "odd" and "even"
+    // when generating little endian code.
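+    // For big endian, each 16-bit product's low byte is at the odd byte
+    // index, so the interleave mask is <1,17,3,19,...> over
+    // (EvenParts, OddParts); for little endian the low byte is at the even
+    // index and the operands swap, giving <0,16,2,18,...> over
+    // (OddParts, EvenParts).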
+ int Ops[16]; + for (unsigned i = 0; i != 8; ++i) { + if (isLittleEndian) { + Ops[i*2 ] = 2*i; + Ops[i*2+1] = 2*i+16; + } else { + Ops[i*2 ] = 2*i+1; + Ops[i*2+1] = 2*i+1+16; + } + } + if (isLittleEndian) + return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); + else + return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); + } else { + llvm_unreachable("Unknown mul to lower!"); + } +} + +/// LowerOperation - Provide custom lowering hooks for some operations. +/// +SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG, Subtarget); + + case ISD::VAARG: + return LowerVAARG(Op, DAG, Subtarget); + + case ISD::VACOPY: + return LowerVACOPY(Op, DAG, Subtarget); + + case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); + case ISD::DYNAMIC_STACKALLOC: + return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget); + + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); + + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, + SDLoc(Op)); + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + + // Lower 64-bit shifts. + case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); + case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); + case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); + + // Vector-related lowering. + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::MUL: return LowerMUL(Op, DAG); + + // For counter-based loop handling. + case ISD::INTRINSIC_W_CHAIN: return SDValue(); + + // Frame & Return address. 
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + } +} + +void PPCTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + SDLoc dl(N); + switch (N->getOpcode()) { + default: + llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::READCYCLECOUNTER: { + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); + SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); + + Results.push_back(RTB); + Results.push_back(RTB.getValue(1)); + Results.push_back(RTB.getValue(2)); + break; + } + case ISD::INTRINSIC_W_CHAIN: { + if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != + Intrinsic::ppc_is_decremented_ctr_nonzero) + break; + + assert(N->getValueType(0) == MVT::i1 && + "Unexpected result type for CTR decrement intrinsic"); + EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + N->getValueType(0)); + SDVTList VTs = DAG.getVTList(SVT, MVT::Other); + SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), + N->getOperand(1)); + + Results.push_back(NewInt); + Results.push_back(NewInt.getValue(1)); + break; + } + case ISD::VAARG: { + if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) + return; + + EVT VT = N->getValueType(0); + + if (VT == MVT::i64) { + SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget); + + Results.push_back(NewNode); + Results.push_back(NewNode.getValue(1)); + } + return; + } + case ISD::FP_ROUND_INREG: { + assert(N->getValueType(0) == MVT::ppcf128); + assert(N->getOperand(0).getValueType() == MVT::ppcf128); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, + MVT::f64, N->getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, + MVT::f64, N->getOperand(0), + DAG.getIntPtrConstant(1, dl)); + + // Add the two halves of the long double in round-to-zero mode. + SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); + + // We know the low half is about to be thrown away, so just use something + // convenient. + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, + FPreg, FPreg)); + return; + } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + // LowerFP_TO_INT() can only handle f32 and f64. 
+    if (N->getOperand(0).getValueType() == MVT::ppcf128)
+      return;
+    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
+    return;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Func = Intrinsic::getDeclaration(M, Id);
+  return Builder.CreateCall(Func, {});
+}
+
+// The mappings for emitLeading/TrailingFence are taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                                 AtomicOrdering Ord, bool IsStore,
+                                                 bool IsLoad) const {
+  if (Ord == SequentiallyConsistent)
+    return callIntrinsic(Builder, Intrinsic::ppc_sync);
+  if (isAtLeastRelease(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  return nullptr;
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                                  AtomicOrdering Ord, bool IsStore,
+                                                  bool IsLoad) const {
+  if (IsLoad && isAtLeastAcquire(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  // FIXME: this is too conservative, a dependent branch + isync is enough.
+  // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+  return nullptr;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                    unsigned AtomicSize,
+                                    unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  auto LoadMnemonic = PPC::LDARX;
+  auto StoreMnemonic = PPC::STDCX;
+  switch (AtomicSize) {
+  default:
+    llvm_unreachable("Unexpected size of atomic entity");
+  case 1:
+    LoadMnemonic = PPC::LBARX;
+    StoreMnemonic = PPC::STBCX;
+    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
+    break;
+  case 2:
+    LoadMnemonic = PPC::LHARX;
+    StoreMnemonic = PPC::STHCX;
+    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
+    break;
+  case 4:
+    LoadMnemonic = PPC::LWARX;
+    StoreMnemonic = PPC::STWCX;
+    break;
+  case 8:
+    LoadMnemonic = PPC::LDARX;
+    StoreMnemonic = PPC::STDCX;
+    break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptrA = MI->getOperand(1).getReg();
+  unsigned ptrB = MI->getOperand(2).getReg();
+  unsigned incr = MI->getOperand(3).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  F->insert(It, exitMBB);
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned TmpReg = (!BinOpcode) ? incr :
+    RegInfo.createVirtualRegister(AtomicSize == 8 ? &PPC::G8RCRegClass
+                                                  : &PPC::GPRCRegClass);
+
+  // thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  // loopMBB:
+  //   l[wd]arx dest, ptr
+  //   add r0, dest, incr
+  //   st[wd]cx.
r0, ptr + // bne- loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + BuildMI(BB, dl, TII->get(LoadMnemonic), dest) + .addReg(ptrA).addReg(ptrB); + if (BinOpcode) + BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); + BuildMI(BB, dl, TII->get(StoreMnemonic)) + .addReg(TmpReg).addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + return BB; +} + +MachineBasicBlock * +PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, + MachineBasicBlock *BB, + bool is8bit, // operation + unsigned BinOpcode) const { + // If we support part-word atomic mnemonics, just use them + if (Subtarget.hasPartwordAtomics()) + return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode); + + // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + // In 64 bit mode we have to use 64 bits for addresses, even though the + // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address + // registers without caring whether they're 32 or 64, but here we're + // doing actual arithmetic on the addresses. + bool is64bit = Subtarget.isPPC64(); + unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *F = BB->getParent(); + MachineFunction::iterator It = ++BB->getIterator(); + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptrA = MI->getOperand(1).getReg(); + unsigned ptrB = MI->getOperand(2).getReg(); + unsigned incr = MI->getOperand(3).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, loopMBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass + : &PPC::GPRCRegClass; + unsigned PtrReg = RegInfo.createVirtualRegister(RC); + unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); + unsigned ShiftReg = RegInfo.createVirtualRegister(RC); + unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); + unsigned MaskReg = RegInfo.createVirtualRegister(RC); + unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); + unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); + unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); + unsigned Ptr1Reg; + unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // The 4-byte load must be aligned, while a char or short may be + // anywhere in the word. Hence all this nasty bookkeeping code. 
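+  // For example (big endian), a byte at (ptr1 & 3) == 1: shift1 becomes 8,
+  // the xori gives shift == 16, so the byte lives in bits 23:16 of the
+  // aligned word; incr and the 0xFF mask are shifted up into that lane
+  // before the lwarx/stwcx. loop, and the result is shifted back down at
+  // the end.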
+ // add ptr1, ptrA, ptrB [copy if ptrA==0] + // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] + // xori shift, shift1, 24 [16] + // rlwinm ptr, ptr1, 0, 0, 29 + // slw incr2, incr, shift + // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] + // slw mask, mask2, shift + // loopMBB: + // lwarx tmpDest, ptr + // add tmp, tmpDest, incr2 + // andc tmp2, tmpDest, mask + // and tmp3, tmp, mask + // or tmp4, tmp3, tmp2 + // stwcx. tmp4, ptr + // bne- loopMBB + // fallthrough --> exitMBB + // srw dest, tmpDest, shift + if (ptrA != ZeroReg) { + Ptr1Reg = RegInfo.createVirtualRegister(RC); + BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) + .addReg(ptrA).addReg(ptrB); + } else { + Ptr1Reg = ptrB; + } + BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) + .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); + BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) + .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); + if (is64bit) + BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(61); + else + BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); + BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) + .addReg(incr).addReg(ShiftReg); + if (is8bit) + BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); + else { + BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); + BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); + } + BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) + .addReg(Mask2Reg).addReg(ShiftReg); + + BB = loopMBB; + BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) + .addReg(ZeroReg).addReg(PtrReg); + if (BinOpcode) + BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) + .addReg(Incr2Reg).addReg(TmpDestReg); + BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) + .addReg(TmpDestReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) + .addReg(TmpReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) + .addReg(Tmp3Reg).addReg(Tmp2Reg); + BuildMI(BB, dl, TII->get(PPC::STWCX)) + .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... 
+  BB = exitMBB;
+  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg)
+    .addReg(ShiftReg);
+  return BB;
+}
+
+llvm::MachineBasicBlock*
+PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = ++MBB->getIterator();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  SjLjSetup mainMBB
+  //  bl mainMBB
+  //  v_restore = 1
+  //  b sinkMBB
+  //
+  // mainMBB:
+  //  buf[LabelOffset] = LR
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Note that the structure of the jmp_buf used here is not compatible
+  // with that used by libc, and is not designed to be. Specifically, it
+  // stores only those 'reserved' registers that LLVM does not otherwise
+  // understand how to spill. Also, by convention, by the time this
+  // intrinsic is called, Clang has already stored the frame address in the
+  // first slot of the buffer and stack address in the third. Following the
+  // X86 target code, we'll store the jump address in the second slot. We also
+  // need to save the TOC pointer (R2) to handle jumps between shared
+  // libraries, and that will be stored in the fourth slot. The thread
+  // identifier (R13) is not affected.
+
+  // thisMBB:
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+  const int64_t BPOffset    = 4 * PVT.getStoreSize();
+
+  // Prepare the IP in a register.
+  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+  unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
+  unsigned BufReg = MI->getOperand(1).getReg();
+
+  if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
+    setUsesTOCBasePtr(*MBB->getParent());
+    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
+            .addReg(PPC::X2)
+            .addImm(TOCOffset)
+            .addReg(BufReg);
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+  }
+
+  // Naked functions never have a base pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  unsigned BaseReg;
+  if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
+    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
+  else
+    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
+
+  MIB = BuildMI(*thisMBB, MI, DL,
+                TII->get(Subtarget.isPPC64() ?
PPC::STD : PPC::STW)) + .addReg(BaseReg) + .addImm(BPOffset) + .addReg(BufReg); + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); + const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); + MIB.addRegMask(TRI->getNoPreservedMask()); + + BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); + + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) + .addMBB(mainMBB); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); + + thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); + thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); + + // mainMBB: + // mainDstReg = 0 + MIB = + BuildMI(mainMBB, DL, + TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); + + // Store IP + if (Subtarget.isPPC64()) { + MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) + .addReg(LabelReg) + .addImm(LabelOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) + .addReg(LabelReg) + .addImm(LabelOffset) + .addReg(BufReg); + } + + MIB.setMemRefs(MMOBegin, MMOEnd); + + BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(PPC::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(thisMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(MF->getDataLayout()); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; + unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; + unsigned BP = + (PVT == MVT::i64) + ? PPC::X30 + : (Subtarget.isSVR4ABI() && + MF->getTarget().getRelocationModel() == Reloc::PIC_ + ? PPC::R29 + : PPC::R30); + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + const int64_t TOCOffset = 3 * PVT.getStoreSize(); + const int64_t BPOffset = 4 * PVT.getStoreSize(); + + unsigned BufReg = MI->getOperand(0).getReg(); + + // Reload FP (the jumped-to function may not have had a + // frame pointer, and if so, then its r31 will be restored + // as necessary). 
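+  // The jmp_buf layout matches what emitEHSjLjSetJmp stored (in pointer-sized
+  // slots): slot 0 holds the frame address, slot 1 the return IP, slot 2 the
+  // stack pointer, slot 3 the TOC (R2), and slot 4 the base pointer.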
+ if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) + .addImm(0) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) + .addImm(0) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload IP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) + .addImm(LabelOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) + .addImm(LabelOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload SP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) + .addImm(SPOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) + .addImm(SPOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload BP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) + .addImm(BPOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) + .addImm(BPOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload TOC + if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { + setUsesTOCBasePtr(*MBB->getParent()); + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) + .addImm(TOCOffset) + .addReg(BufReg); + + MIB.setMemRefs(MMOBegin, MMOEnd); + } + + // Jump + BuildMI(*MBB, MI, DL, + TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); + BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); + + MI->eraseFromParent(); + return MBB; +} + +MachineBasicBlock * +PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + if (MI->getOpcode() == TargetOpcode::STACKMAP || + MI->getOpcode() == TargetOpcode::PATCHPOINT) { + if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && + MI->getOpcode() == TargetOpcode::PATCHPOINT) { + // Call lowering should have added an r2 operand to indicate a dependence + // on the TOC base pointer value. It can't however, because there is no + // way to mark the dependence as implicit there, and so the stackmap code + // will confuse it with a regular operand. Instead, add the dependence + // here. + setUsesTOCBasePtr(*BB->getParent()); + MI->addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); + } + + return emitPatchPoint(MI, BB); + } + + if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 || + MI->getOpcode() == PPC::EH_SjLj_SetJmp64) { + return emitEHSjLjSetJmp(MI, BB); + } else if (MI->getOpcode() == PPC::EH_SjLj_LongJmp32 || + MI->getOpcode() == PPC::EH_SjLj_LongJmp64) { + return emitEHSjLjLongJmp(MI, BB); + } + + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + + // To "insert" these instructions we actually have to insert their + // control-flow patterns. 
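+  // (For example, each SELECT_* pseudo below becomes a small diamond: a
+  // conditional branch over a block providing the false value, with a PHI
+  // in the join block selecting the result.)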
+ const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + MachineFunction *F = BB->getParent(); + + if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8 || + MI->getOpcode() == PPC::SELECT_I4 || + MI->getOpcode() == PPC::SELECT_I8)) { + SmallVector<MachineOperand, 2> Cond; + if (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8) + Cond.push_back(MI->getOperand(4)); + else + Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); + Cond.push_back(MI->getOperand(1)); + + DebugLoc dl = MI->getDebugLoc(); + TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), + Cond, MI->getOperand(2).getReg(), + MI->getOperand(3).getReg()); + } else if (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8 || + MI->getOpcode() == PPC::SELECT_CC_F4 || + MI->getOpcode() == PPC::SELECT_CC_F8 || + MI->getOpcode() == PPC::SELECT_CC_QFRC || + MI->getOpcode() == PPC::SELECT_CC_QSRC || + MI->getOpcode() == PPC::SELECT_CC_QBRC || + MI->getOpcode() == PPC::SELECT_CC_VRRC || + MI->getOpcode() == PPC::SELECT_CC_VSFRC || + MI->getOpcode() == PPC::SELECT_CC_VSSRC || + MI->getOpcode() == PPC::SELECT_CC_VSRC || + MI->getOpcode() == PPC::SELECT_I4 || + MI->getOpcode() == PPC::SELECT_I8 || + MI->getOpcode() == PPC::SELECT_F4 || + MI->getOpcode() == PPC::SELECT_F8 || + MI->getOpcode() == PPC::SELECT_QFRC || + MI->getOpcode() == PPC::SELECT_QSRC || + MI->getOpcode() == PPC::SELECT_QBRC || + MI->getOpcode() == PPC::SELECT_VRRC || + MI->getOpcode() == PPC::SELECT_VSFRC || + MI->getOpcode() == PPC::SELECT_VSSRC || + MI->getOpcode() == PPC::SELECT_VSRC) { + // The incoming instruction knows the destination vreg to set, the + // condition code register to branch on, the true/false values to + // select between, and a branch opcode to use. + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + DebugLoc dl = MI->getDebugLoc(); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + if (MI->getOpcode() == PPC::SELECT_I4 || + MI->getOpcode() == PPC::SELECT_I8 || + MI->getOpcode() == PPC::SELECT_F4 || + MI->getOpcode() == PPC::SELECT_F8 || + MI->getOpcode() == PPC::SELECT_QFRC || + MI->getOpcode() == PPC::SELECT_QSRC || + MI->getOpcode() == PPC::SELECT_QBRC || + MI->getOpcode() == PPC::SELECT_VRRC || + MI->getOpcode() == PPC::SELECT_VSFRC || + MI->getOpcode() == PPC::SELECT_VSSRC || + MI->getOpcode() == PPC::SELECT_VSRC) { + BuildMI(BB, dl, TII->get(PPC::BC)) + .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); + } else { + unsigned SelectPred = MI->getOperand(4).getImm(); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); + } + + // copy0MBB: + // %FalseValue = ... 
+ // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(*BB, BB->begin(), dl, + TII->get(PPC::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + } else if (MI->getOpcode() == PPC::ReadTB) { + // To read the 64-bit time-base register on a 32-bit target, we read the + // two halves. Should the counter have wrapped while it was being read, we + // need to try again. + // ... + // readLoop: + // mfspr Rx,TBU # load from TBU + // mfspr Ry,TB # load from TB + // mfspr Rz,TBU # load from TBU + // cmpw crX,Rx,Rz # check if 'old'='new' + // bne readLoop # branch if they're not equal + // ... + + MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + DebugLoc dl = MI->getDebugLoc(); + F->insert(It, readMBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(readMBB); + BB = readMBB; + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); + unsigned LoReg = MI->getOperand(0).getReg(); + unsigned HiReg = MI->getOperand(1).getReg(); + + BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); + BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); + BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); + + unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + + BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) + .addReg(HiReg).addReg(ReadAgainReg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); + + BB->addSuccessor(readMBB); + BB->addSuccessor(sinkMBB); + } + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) + BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) + BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32) + BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64) + BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32) + BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64) + BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) + BB = 
EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, 0);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+           MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
+           (Subtarget.hasPartwordAtomics() &&
+            MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
+           (Subtarget.hasPartwordAtomics() &&
+            MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
+    bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+
+    auto LoadMnemonic = PPC::LDARX;
+    auto StoreMnemonic = PPC::STDCX;
+    switch(MI->getOpcode()) {
+    default:
+      llvm_unreachable("Compare and swap of unknown size");
+    case PPC::ATOMIC_CMP_SWAP_I8:
+      LoadMnemonic = PPC::LBARX;
+      StoreMnemonic = PPC::STBCX;
+      assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I16:
+      LoadMnemonic = PPC::LHARX;
+      StoreMnemonic = PPC::STHCX;
+      assert(Subtarget.hasPartwordAtomics() && "No support for partword atomics.");
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I32:
+      LoadMnemonic = PPC::LWARX;
+      StoreMnemonic = PPC::STWCX;
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I64:
+      LoadMnemonic = PPC::LDARX;
+      StoreMnemonic = PPC::STDCX;
+      break;
+    }
+    unsigned dest   = MI->getOperand(0).getReg();
+    unsigned ptrA   = MI->getOperand(1).getReg();
+    unsigned ptrB   = MI->getOperand(2).getReg();
+    unsigned oldval = MI->getOperand(3).getReg();
+    unsigned newval = MI->getOperand(4).getReg();
+    DebugLoc dl     = MI->getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    // thisMBB:
+    //   ...
+ // fallthrough --> loopMBB + BB->addSuccessor(loop1MBB); + + // loop1MBB: + // l[bhwd]arx dest, ptr + // cmp[wd] dest, oldval + // bne- midMBB + // loop2MBB: + // st[bhwd]cx. newval, ptr + // bne- loopMBB + // b exitBB + // midMBB: + // st[bhwd]cx. dest, ptr + // exitBB: + BB = loop1MBB; + BuildMI(BB, dl, TII->get(LoadMnemonic), dest) + .addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) + .addReg(oldval).addReg(dest); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(midMBB); + + BB = loop2MBB; + BuildMI(BB, dl, TII->get(StoreMnemonic)) + .addReg(newval).addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); + BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + BB = midMBB; + BuildMI(BB, dl, TII->get(StoreMnemonic)) + .addReg(dest).addReg(ptrA).addReg(ptrB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || + MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { + // We must use 64-bit registers for addresses when targeting 64-bit, + // since we're actually doing arithmetic on them. Other registers + // can be 32-bit. + bool is64bit = Subtarget.isPPC64(); + bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptrA = MI->getOperand(1).getReg(); + unsigned ptrB = MI->getOperand(2).getReg(); + unsigned oldval = MI->getOperand(3).getReg(); + unsigned newval = MI->getOperand(4).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, loop1MBB); + F->insert(It, loop2MBB); + F->insert(It, midMBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass + : &PPC::GPRCRegClass; + unsigned PtrReg = RegInfo.createVirtualRegister(RC); + unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); + unsigned ShiftReg = RegInfo.createVirtualRegister(RC); + unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); + unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); + unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); + unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); + unsigned MaskReg = RegInfo.createVirtualRegister(RC); + unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); + unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); + unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); + unsigned Ptr1Reg; + unsigned TmpReg = RegInfo.createVirtualRegister(RC); + unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loop1MBB); + + // The 4-byte load must be aligned, while a char or short may be + // anywhere in the word. Hence all this nasty bookkeeping code. 
+ // add ptr1, ptrA, ptrB [copy if ptrA==0] + // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] + // xori shift, shift1, 24 [16] + // rlwinm ptr, ptr1, 0, 0, 29 + // slw newval2, newval, shift + // slw oldval2, oldval,shift + // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] + // slw mask, mask2, shift + // and newval3, newval2, mask + // and oldval3, oldval2, mask + // loop1MBB: + // lwarx tmpDest, ptr + // and tmp, tmpDest, mask + // cmpw tmp, oldval3 + // bne- midMBB + // loop2MBB: + // andc tmp2, tmpDest, mask + // or tmp4, tmp2, newval3 + // stwcx. tmp4, ptr + // bne- loop1MBB + // b exitBB + // midMBB: + // stwcx. tmpDest, ptr + // exitBB: + // srw dest, tmpDest, shift + if (ptrA != ZeroReg) { + Ptr1Reg = RegInfo.createVirtualRegister(RC); + BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) + .addReg(ptrA).addReg(ptrB); + } else { + Ptr1Reg = ptrB; + } + BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) + .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); + BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) + .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); + if (is64bit) + BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(61); + else + BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); + BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) + .addReg(newval).addReg(ShiftReg); + BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) + .addReg(oldval).addReg(ShiftReg); + if (is8bit) + BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); + else { + BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); + BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) + .addReg(Mask3Reg).addImm(65535); + } + BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) + .addReg(Mask2Reg).addReg(ShiftReg); + BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) + .addReg(NewVal2Reg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) + .addReg(OldVal2Reg).addReg(MaskReg); + + BB = loop1MBB; + BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) + .addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) + .addReg(TmpDestReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) + .addReg(TmpReg).addReg(OldVal3Reg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(midMBB); + + BB = loop2MBB; + BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) + .addReg(TmpDestReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) + .addReg(Tmp2Reg).addReg(NewVal3Reg); + BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) + .addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); + BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + BB = midMBB; + BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) + .addReg(ZeroReg).addReg(PtrReg); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) + .addReg(ShiftReg); + } else if (MI->getOpcode() == PPC::FADDrtz) { + // This pseudo performs an FADD with rounding mode temporarily forced + // to round-to-zero. We emit this via custom inserter since the FPSCR + // is not modeled at the SelectionDAG level. 
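+    //
+    // As a schematic illustration (register names here are arbitrary, and
+    // the FPSCR bit numbers follow the mtfsb operands used below), the
+    // sequence built by this inserter behaves like:
+    //   mffs   f0          # save the current FPSCR
+    //   mtfsb1 31          # set bit 31 and ...
+    //   mtfsb0 30          # ... clear bit 30: RN = 0b01, round toward zero
+    //   fadd   fD, fA, fB  # the addition, now rounding toward zero
+    //   mtfsf  1, f0       # restore the FPSCR field holding RN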
+ unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + unsigned Src2 = MI->getOperand(2).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + + // Save FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); + + // Set rounding mode to round-to-zero. + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); + + // Perform addition. + BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); + + // Restore FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); + } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT || + MI->getOpcode() == PPC::ANDIo_1_GT_BIT || + MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 || + MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) { + unsigned Opcode = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 || + MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) ? + PPC::ANDIo8 : PPC::ANDIo; + bool isEQ = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT || + MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? + &PPC::GPRCRegClass : + &PPC::G8RCRegClass); + + DebugLoc dl = MI->getDebugLoc(); + BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) + .addReg(MI->getOperand(1).getReg()).addImm(1); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), + MI->getOperand(0).getReg()) + .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); + } else if (MI->getOpcode() == PPC::TCHECK_RET) { + DebugLoc Dl = MI->getDebugLoc(); + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); + return BB; + } else { + llvm_unreachable("Unexpected instr type to insert"); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. 
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+static std::string getRecipOp(const char *Base, EVT VT) {
+  std::string RecipOp(Base);
+  if (VT.getScalarType() == MVT::f64)
+    RecipOp += "d";
+  else
+    RecipOp += "f";
+
+  if (VT.isVector())
+    RecipOp = "vec-" + RecipOp;
+
+  return RecipOp;
+}
+
+SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
+      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+    TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+    std::string RecipOp = getRecipOp("sqrt", VT);
+    if (!Recips.isEnabled(RecipOp))
+      return SDValue();
+
+    RefinementSteps = Recips.getRefinementSteps(RecipOp);
+    UseOneConstNR = true;
+    return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
+  }
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRE()) ||
+      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+    TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+    std::string RecipOp = getRecipOp("div", VT);
+    if (!Recips.isEnabled(RecipOp))
+      return SDValue();
+
+    RefinementSteps = Recips.getRefinementSteps(RecipOp);
+    return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
+  }
+  return SDValue();
+}
+
+unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
+  // Note: This functionality is used only when unsafe-fp-math is enabled, and
+  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+  // enabled for division), this functionality is redundant with the default
+  // combiner logic (once the division -> reciprocal/multiply transformation
+  // has taken place). As a result, this matters more for older cores than for
+  // newer ones.
+
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are two or more FDIVs (for embedded cores with only
+  // one FP pipeline) or three or more FDIVs (for generic OOO cores).
+  switch (Subtarget.getDarwinDirective()) {
+  default:
+    return 3;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+    return 2;
+  }
+}
+
+// isConsecutiveLSLoc needs to work even if all adds have not yet been
+// collapsed, and so we need to look through chains of them.
+static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
+                                      int64_t& Offset, SelectionDAG &DAG) {
+  if (DAG.isBaseWithConstantOffset(Loc)) {
+    Base = Loc.getOperand(0);
+    Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+
+    // The base might itself be a base plus an offset, and if so, accumulate
+    // that as well.
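+    // For example (schematic): given Loc = (add (add %base, 16), 8), the
+    // recursion below leaves Base = %base and Offset = 24.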
+ getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); + } +} + +static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, + unsigned Bytes, int Dist, + SelectionDAG &DAG) { + if (VT.getSizeInBits() / 8 != Bytes) + return false; + + SDValue BaseLoc = Base->getBasePtr(); + if (Loc.getOpcode() == ISD::FrameIndex) { + if (BaseLoc.getOpcode() != ISD::FrameIndex) + return false; + const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + int FI = cast<FrameIndexSDNode>(Loc)->getIndex(); + int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex(); + int FS = MFI->getObjectSize(FI); + int BFS = MFI->getObjectSize(BFI); + if (FS != BFS || FS != (int)Bytes) return false; + return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); + } + + SDValue Base1 = Loc, Base2 = BaseLoc; + int64_t Offset1 = 0, Offset2 = 0; + getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); + getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); + if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) + return true; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const GlobalValue *GV1 = nullptr; + const GlobalValue *GV2 = nullptr; + Offset1 = 0; + Offset2 = 0; + bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); + bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); + if (isGA1 && isGA2 && GV1 == GV2) + return Offset1 == (Offset2 + Dist*Bytes); + return false; +} + +// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does +// not enforce equality of the chain operands. +static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, + unsigned Bytes, int Dist, + SelectionDAG &DAG) { + if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) { + EVT VT = LS->getMemoryVT(); + SDValue Loc = LS->getBasePtr(); + return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); + } + + if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + EVT VT; + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + default: return false; + case Intrinsic::ppc_qpx_qvlfd: + case Intrinsic::ppc_qpx_qvlfda: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvlfs: + case Intrinsic::ppc_qpx_qvlfsa: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvlfcd: + case Intrinsic::ppc_qpx_qvlfcda: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvlfcs: + case Intrinsic::ppc_qpx_qvlfcsa: + VT = MVT::v2f32; + break; + case Intrinsic::ppc_qpx_qvlfiwa: + case Intrinsic::ppc_qpx_qvlfiwz: + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + case Intrinsic::ppc_vsx_lxvw4x: + VT = MVT::v4i32; + break; + case Intrinsic::ppc_vsx_lxvd2x: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_altivec_lvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_lvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_lvewx: + VT = MVT::i32; + break; + } + + return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); + } + + if (N->getOpcode() == ISD::INTRINSIC_VOID) { + EVT VT; + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + default: return false; + case Intrinsic::ppc_qpx_qvstfd: + case Intrinsic::ppc_qpx_qvstfda: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvstfs: + case Intrinsic::ppc_qpx_qvstfsa: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvstfcd: + case Intrinsic::ppc_qpx_qvstfcda: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvstfcs: + case Intrinsic::ppc_qpx_qvstfcsa: + VT = MVT::v2f32; + break; + case Intrinsic::ppc_qpx_qvstfiw: + case 
Intrinsic::ppc_qpx_qvstfiwa:
+    case Intrinsic::ppc_altivec_stvx:
+    case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    }
+
+    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
+  }
+
+  return false;
+}
+
+// Return true if there is a nearby consecutive load to the one provided
+// (regardless of alignment). We search up and down the chain, looking through
+// token factors and other loads (but nothing else). As a result, a true
+// return value indicates that it is safe to create a new consecutive load
+// adjacent to the load provided.
+static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
+  SDValue Chain = LD->getChain();
+  EVT VT = LD->getMemoryVT();
+
+  SmallSet<SDNode *, 16> LoadRoots;
+  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
+  SmallSet<SDNode *, 16> Visited;
+
+  // First, search up the chain, branching to follow all token-factor operands.
+  // If we find a consecutive load, then we're done; otherwise, record all
+  // nodes just above the top-level loads and token factors.
+  while (!Queue.empty()) {
+    SDNode *ChainNext = Queue.pop_back_val();
+    if (!Visited.insert(ChainNext).second)
+      continue;
+
+    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
+      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+        return true;
+
+      if (!Visited.count(ChainLD->getChain().getNode()))
+        Queue.push_back(ChainLD->getChain().getNode());
+    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
+      for (const SDUse &O : ChainNext->ops())
+        if (!Visited.count(O.getNode()))
+          Queue.push_back(O.getNode());
+    } else
+      LoadRoots.insert(ChainNext);
+  }
+
+  // Second, search down the chain, starting from the top-level nodes recorded
+  // in the first phase. These top-level nodes are the nodes just above all
+  // loads and token factors. Starting with their uses, recursively look
+  // through all loads (just the chain uses) and token factors to find a
+  // consecutive load.
+  Visited.clear();
+  Queue.clear();
+
+  for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
+       IE = LoadRoots.end(); I != IE; ++I) {
+    Queue.push_back(*I);
+
+    while (!Queue.empty()) {
+      SDNode *LoadRoot = Queue.pop_back_val();
+      if (!Visited.insert(LoadRoot).second)
+        continue;
+
+      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
+        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+          return true;
+
+      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
+           UE = LoadRoot->use_end(); UI != UE; ++UI)
+        if (((isa<MemSDNode>(*UI) &&
+              cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+             UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
+          Queue.push_back(*UI);
+    }
+  }
+
+  return false;
+}
+
+SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
+  // If we're tracking CR bits, we need to be careful that we don't have:
+  //   trunc(binary-ops(zext(x), zext(y)))
+  // or
+  //   trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
+  // such that we're unnecessarily moving things into GPRs when it would be
+  // better to keep them in CR bits.
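+  //
+  // As an illustrative sketch (hypothetical IR, not taken from a test case),
+  // a pattern such as
+  //   %xe = zext i1 %x to i32
+  //   %ye = zext i1 %y to i32
+  //   %a  = and i32 %xe, %ye
+  //   %t  = trunc i32 %a to i1
+  // is better evaluated entirely in CR bits (e.g. with crand) once the
+  // extensions and the truncation are peeled away, which is what the
+  // promotion below aims to enable.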
+
+  // Note that trunc here can be an actual i1 trunc, or can be the effective
+  // truncation that comes from a setcc or select_cc.
+  if (N->getOpcode() == ISD::TRUNCATE &&
+      N->getValueType(0) != MVT::i1)
+    return SDValue();
+
+  if (N->getOperand(0).getValueType() != MVT::i32 &&
+      N->getOperand(0).getValueType() != MVT::i64)
+    return SDValue();
+
+  if (N->getOpcode() == ISD::SETCC ||
+      N->getOpcode() == ISD::SELECT_CC) {
+    // If we're looking at a comparison, then we need to make sure that the
+    // high bits (all except for the first) don't affect the result.
+    ISD::CondCode CC =
+      cast<CondCodeSDNode>(N->getOperand(
+        N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
+    unsigned OpBits = N->getOperand(0).getValueSizeInBits();
+
+    if (ISD::isSignedIntSetCC(CC)) {
+      if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
+          DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
+        return SDValue();
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (!DAG.MaskedValueIsZero(N->getOperand(0),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
+          !DAG.MaskedValueIsZero(N->getOperand(1),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
+        return SDValue();
+    } else {
+      // This is neither a signed nor an unsigned comparison, just make sure
+      // that the high bits are equal.
+      APInt Op1Zero, Op1One;
+      APInt Op2Zero, Op2One;
+      DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
+      DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
+
+      // We don't really care about what is known about the first bit (if
+      // anything), so clear it in all masks prior to comparing them.
+      Op1Zero.clearBit(0); Op1One.clearBit(0);
+      Op2Zero.clearBit(0); Op2One.clearBit(0);
+
+      if (Op1Zero != Op2Zero || Op1One != Op2One)
+        return SDValue();
+    }
+  }
+
+  // We now know that the higher-order bits are irrelevant; we just need to
+  // make sure that all of the intermediate operations are bit operations, and
+  // all inputs are extensions.
+ if (N->getOperand(0).getOpcode() != ISD::AND && + N->getOperand(0).getOpcode() != ISD::OR && + N->getOperand(0).getOpcode() != ISD::XOR && + N->getOperand(0).getOpcode() != ISD::SELECT && + N->getOperand(0).getOpcode() != ISD::SELECT_CC && + N->getOperand(0).getOpcode() != ISD::TRUNCATE && + N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && + N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) + return SDValue(); + + if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && + N->getOperand(1).getOpcode() != ISD::AND && + N->getOperand(1).getOpcode() != ISD::OR && + N->getOperand(1).getOpcode() != ISD::XOR && + N->getOperand(1).getOpcode() != ISD::SELECT && + N->getOperand(1).getOpcode() != ISD::SELECT_CC && + N->getOperand(1).getOpcode() != ISD::TRUNCATE && + N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && + N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) + return SDValue(); + + SmallVector<SDValue, 4> Inputs; + SmallVector<SDValue, 8> BinOps, PromOps; + SmallPtrSet<SDNode *, 16> Visited; + + for (unsigned i = 0; i < 2; ++i) { + if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || + N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || + N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && + N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || + isa<ConstantSDNode>(N->getOperand(i))) + Inputs.push_back(N->getOperand(i)); + else + BinOps.push_back(N->getOperand(i)); + + if (N->getOpcode() == ISD::TRUNCATE) + break; + } + + // Visit all inputs, collect all binary operations (and, or, xor and + // select) that are all fed by extensions. + while (!BinOps.empty()) { + SDValue BinOp = BinOps.back(); + BinOps.pop_back(); + + if (!Visited.insert(BinOp.getNode()).second) + continue; + + PromOps.push_back(BinOp); + + for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { + // The condition of the select is not promoted. + if (BinOp.getOpcode() == ISD::SELECT && i == 0) + continue; + if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) + continue; + + if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && + BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || + isa<ConstantSDNode>(BinOp.getOperand(i))) { + Inputs.push_back(BinOp.getOperand(i)); + } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || + BinOp.getOperand(i).getOpcode() == ISD::OR || + BinOp.getOperand(i).getOpcode() == ISD::XOR || + BinOp.getOperand(i).getOpcode() == ISD::SELECT || + BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || + BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || + BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { + BinOps.push_back(BinOp.getOperand(i)); + } else { + // We have an input that is not an extension or another binary + // operation; we'll abort this transformation. + return SDValue(); + } + } + } + + // Make sure that this is a self-contained cluster of operations (which + // is not quite the same thing as saying that everything has only one + // use). 
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), + UE = Inputs[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // Make sure that we're not going to promote the non-output-value + // operand(s) or SELECT or SELECT_CC. + // FIXME: Although we could sometimes handle this, and it does occur in + // practice that one of the condition inputs to the select is also one of + // the outputs, we currently can't deal with this. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == Inputs[i]) + return SDValue(); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == Inputs[i] || + User->getOperand(1) == Inputs[i]) + return SDValue(); + } + } + } + + for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { + for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), + UE = PromOps[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // Make sure that we're not going to promote the non-output-value + // operand(s) or SELECT or SELECT_CC. + // FIXME: Although we could sometimes handle this, and it does occur in + // practice that one of the condition inputs to the select is also one of + // the outputs, we currently can't deal with this. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == PromOps[i]) + return SDValue(); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == PromOps[i] || + User->getOperand(1) == PromOps[i]) + return SDValue(); + } + } + } + + // Replace all inputs with the extension operand. + for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + // Constants may have users outside the cluster of to-be-promoted nodes, + // and so we need to replace those as we do the promotions. + if (isa<ConstantSDNode>(Inputs[i])) + continue; + else + DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); + } + + // Replace all operations (these are all the same, but have a different + // (i1) return type). DAG.getNode will validate that the types of + // a binary operator match, so go through the list in reverse so that + // we've likely promoted both operands first. Any intermediate truncations or + // extensions disappear. + while (!PromOps.empty()) { + SDValue PromOp = PromOps.back(); + PromOps.pop_back(); + + if (PromOp.getOpcode() == ISD::TRUNCATE || + PromOp.getOpcode() == ISD::SIGN_EXTEND || + PromOp.getOpcode() == ISD::ZERO_EXTEND || + PromOp.getOpcode() == ISD::ANY_EXTEND) { + if (!isa<ConstantSDNode>(PromOp.getOperand(0)) && + PromOp.getOperand(0).getValueType() != MVT::i1) { + // The operand is not yet ready (see comment below). 
+ PromOps.insert(PromOps.begin(), PromOp); + continue; + } + + SDValue RepValue = PromOp.getOperand(0); + if (isa<ConstantSDNode>(RepValue)) + RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); + + DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); + continue; + } + + unsigned C; + switch (PromOp.getOpcode()) { + default: C = 0; break; + case ISD::SELECT: C = 1; break; + case ISD::SELECT_CC: C = 2; break; + } + + if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && + PromOp.getOperand(C).getValueType() != MVT::i1) || + (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && + PromOp.getOperand(C+1).getValueType() != MVT::i1)) { + // The to-be-promoted operands of this node have not yet been + // promoted (this should be rare because we're going through the + // list backward, but if one of the operands has several users in + // this cluster of to-be-promoted nodes, it is possible). + PromOps.insert(PromOps.begin(), PromOp); + continue; + } + + SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), + PromOp.getNode()->op_end()); + + // If there are any constant inputs, make sure they're replaced now. + for (unsigned i = 0; i < 2; ++i) + if (isa<ConstantSDNode>(Ops[C+i])) + Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); + + DAG.ReplaceAllUsesOfValueWith(PromOp, + DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); + } + + // Now we're left with the initial truncation itself. + if (N->getOpcode() == ISD::TRUNCATE) + return N->getOperand(0); + + // Otherwise, this is a comparison. The operands to be compared have just + // changed type (to i1), but everything else is the same. + return SDValue(N, 0); +} + +SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + + // If we're tracking CR bits, we need to be careful that we don't have: + // zext(binary-ops(trunc(x), trunc(y))) + // or + // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) + // such that we're unnecessarily moving things into CR bits that can more + // efficiently stay in GPRs. Note that if we're not certain that the high + // bits are set as required by the final extension, we still may need to do + // some masking to get the proper behavior. + + // This same functionality is important on PPC64 when dealing with + // 32-to-64-bit extensions; these occur often when 32-bit values are used as + // the return values of functions. Because it is so similar, it is handled + // here as well. + + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return SDValue(); + + if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || + (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) + return SDValue(); + + if (N->getOperand(0).getOpcode() != ISD::AND && + N->getOperand(0).getOpcode() != ISD::OR && + N->getOperand(0).getOpcode() != ISD::XOR && + N->getOperand(0).getOpcode() != ISD::SELECT && + N->getOperand(0).getOpcode() != ISD::SELECT_CC) + return SDValue(); + + SmallVector<SDValue, 4> Inputs; + SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps; + SmallPtrSet<SDNode *, 16> Visited; + + // Visit all inputs, collect all binary operations (and, or, xor and + // select) that are all fed by truncations. 
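+  //
+  // As an illustrative sketch (hypothetical IR, not taken from a test case),
+  // on PPC64 a pattern such as
+  //   %xt = trunc i64 %x to i32
+  //   %yt = trunc i64 %y to i32
+  //   %a  = and i32 %xt, %yt
+  //   %e  = zext i32 %a to i64
+  // can instead be computed as an i64 AND of %x and %y, with extra masking
+  // needed only if the high bits are not already known to satisfy the final
+  // zero-extension.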
+ while (!BinOps.empty()) { + SDValue BinOp = BinOps.back(); + BinOps.pop_back(); + + if (!Visited.insert(BinOp.getNode()).second) + continue; + + PromOps.push_back(BinOp); + + for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { + // The condition of the select is not promoted. + if (BinOp.getOpcode() == ISD::SELECT && i == 0) + continue; + if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) + continue; + + if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || + isa<ConstantSDNode>(BinOp.getOperand(i))) { + Inputs.push_back(BinOp.getOperand(i)); + } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || + BinOp.getOperand(i).getOpcode() == ISD::OR || + BinOp.getOperand(i).getOpcode() == ISD::XOR || + BinOp.getOperand(i).getOpcode() == ISD::SELECT || + BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { + BinOps.push_back(BinOp.getOperand(i)); + } else { + // We have an input that is not a truncation or another binary + // operation; we'll abort this transformation. + return SDValue(); + } + } + } + + // The operands of a select that must be truncated when the select is + // promoted because the operand is actually part of the to-be-promoted set. + DenseMap<SDNode *, EVT> SelectTruncOp[2]; + + // Make sure that this is a self-contained cluster of operations (which + // is not quite the same thing as saying that everything has only one + // use). + for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), + UE = Inputs[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // If we're going to promote the non-output-value operand(s) or SELECT or + // SELECT_CC, record them for truncation. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == Inputs[i]) + SelectTruncOp[0].insert(std::make_pair(User, + User->getOperand(0).getValueType())); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == Inputs[i]) + SelectTruncOp[0].insert(std::make_pair(User, + User->getOperand(0).getValueType())); + if (User->getOperand(1) == Inputs[i]) + SelectTruncOp[1].insert(std::make_pair(User, + User->getOperand(1).getValueType())); + } + } + } + + for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { + for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), + UE = PromOps[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // If we're going to promote the non-output-value operand(s) or SELECT or + // SELECT_CC, record them for truncation. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == PromOps[i]) + SelectTruncOp[0].insert(std::make_pair(User, + User->getOperand(0).getValueType())); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == PromOps[i]) + SelectTruncOp[0].insert(std::make_pair(User, + User->getOperand(0).getValueType())); + if (User->getOperand(1) == PromOps[i]) + SelectTruncOp[1].insert(std::make_pair(User, + User->getOperand(1).getValueType())); + } + } + } + + unsigned PromBits = N->getOperand(0).getValueSizeInBits(); + bool ReallyNeedsExt = false; + if (N->getOpcode() != ISD::ANY_EXTEND) { + // If all of the inputs are not already sign/zero extended, then + // we'll still need to do that at the end. 
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + unsigned OpBits = + Inputs[i].getOperand(0).getValueSizeInBits(); + assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); + + if ((N->getOpcode() == ISD::ZERO_EXTEND && + !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), + APInt::getHighBitsSet(OpBits, + OpBits-PromBits))) || + (N->getOpcode() == ISD::SIGN_EXTEND && + DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < + (OpBits-(PromBits-1)))) { + ReallyNeedsExt = true; + break; + } + } + } + + // Replace all inputs, either with the truncation operand, or a + // truncation or extension to the final output type. + for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + // Constant inputs need to be replaced with the to-be-promoted nodes that + // use them because they might have users outside of the cluster of + // promoted nodes. + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + SDValue InSrc = Inputs[i].getOperand(0); + if (Inputs[i].getValueType() == N->getValueType(0)) + DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); + else if (N->getOpcode() == ISD::SIGN_EXTEND) + DAG.ReplaceAllUsesOfValueWith(Inputs[i], + DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); + else if (N->getOpcode() == ISD::ZERO_EXTEND) + DAG.ReplaceAllUsesOfValueWith(Inputs[i], + DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); + else + DAG.ReplaceAllUsesOfValueWith(Inputs[i], + DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); + } + + // Replace all operations (these are all the same, but have a different + // (promoted) return type). DAG.getNode will validate that the types of + // a binary operator match, so go through the list in reverse so that + // we've likely promoted both operands first. + while (!PromOps.empty()) { + SDValue PromOp = PromOps.back(); + PromOps.pop_back(); + + unsigned C; + switch (PromOp.getOpcode()) { + default: C = 0; break; + case ISD::SELECT: C = 1; break; + case ISD::SELECT_CC: C = 2; break; + } + + if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && + PromOp.getOperand(C).getValueType() != N->getValueType(0)) || + (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && + PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) { + // The to-be-promoted operands of this node have not yet been + // promoted (this should be rare because we're going through the + // list backward, but if one of the operands has several users in + // this cluster of to-be-promoted nodes, it is possible). + PromOps.insert(PromOps.begin(), PromOp); + continue; + } + + // For SELECT and SELECT_CC nodes, we do a similar check for any + // to-be-promoted comparison inputs. + if (PromOp.getOpcode() == ISD::SELECT || + PromOp.getOpcode() == ISD::SELECT_CC) { + if ((SelectTruncOp[0].count(PromOp.getNode()) && + PromOp.getOperand(0).getValueType() != N->getValueType(0)) || + (SelectTruncOp[1].count(PromOp.getNode()) && + PromOp.getOperand(1).getValueType() != N->getValueType(0))) { + PromOps.insert(PromOps.begin(), PromOp); + continue; + } + } + + SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), + PromOp.getNode()->op_end()); + + // If this node has constant inputs, then they'll need to be promoted here. 
+    for (unsigned i = 0; i < 2; ++i) {
+      if (!isa<ConstantSDNode>(Ops[C+i]))
+        continue;
+      if (Ops[C+i].getValueType() == N->getValueType(0))
+        continue;
+
+      if (N->getOpcode() == ISD::SIGN_EXTEND)
+        Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+      else if (N->getOpcode() == ISD::ZERO_EXTEND)
+        Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+      else
+        Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+    }
+
+    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
+    // truncate them again to the original value type.
+    if (PromOp.getOpcode() == ISD::SELECT ||
+        PromOp.getOpcode() == ISD::SELECT_CC) {
+      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
+      if (SI0 != SelectTruncOp[0].end())
+        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
+      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
+      if (SI1 != SelectTruncOp[1].end())
+        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
+    }
+
+    DAG.ReplaceAllUsesOfValueWith(PromOp,
+      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
+  }
+
+  // Now we're left with the initial extension itself.
+  if (!ReallyNeedsExt)
+    return N->getOperand(0);
+
+  // To zero extend, just mask off everything except for the first bit (in the
+  // i1 case).
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
+                       DAG.getConstant(APInt::getLowBitsSet(
+                                         N->getValueSizeInBits(0), PromBits),
+                                       dl, N->getValueType(0)));
+
+  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+         "Invalid extension type");
+  EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
+  SDValue ShiftCst =
+    DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
+  return DAG.getNode(
+      ISD::SRA, dl, N->getValueType(0),
+      DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
+      ShiftCst);
+}
+
+SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::SINT_TO_FP ||
+          N->getOpcode() == ISD::UINT_TO_FP) &&
+         "Need an int -> FP conversion node here");
+
+  if (!Subtarget.has64BitSupport())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Op(N, 0);
+
+  // Don't handle ppc_fp128 here or i1 conversions.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return SDValue();
+
+  // For i32 intermediate values, unfortunately, the conversion functions
+  // leave the upper 32 bits of the value undefined. Within the set of
+  // scalar instructions, we have no method for zero- or sign-extending the
+  // value. Thus, we cannot handle i32 intermediate values here.
+  if (Op.getOperand(0).getValueType() == MVT::i32)
+    return SDValue();
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
+                                                            : PPCISD::FCFIDS)
+                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
+                                                            : PPCISD::FCFID);
+  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                  ?
MVT::f32 + : MVT::f64; + + // If we're converting from a float, to an int, and back to a float again, + // then we don't need the store/load pair at all. + if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && + Subtarget.hasFPCVT()) || + (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { + SDValue Src = Op.getOperand(0).getOperand(0); + if (Src.getValueType() == MVT::f32) { + Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); + DCI.AddToWorklist(Src.getNode()); + } else if (Src.getValueType() != MVT::f64) { + // Make sure that we don't pick up a ppc_fp128 source value. + return SDValue(); + } + + unsigned FCTOp = + Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : + PPCISD::FCTIDUZ; + + SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); + + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { + FP = DAG.getNode(ISD::FP_ROUND, dl, + MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); + DCI.AddToWorklist(FP.getNode()); + } + + return FP; + } + + return SDValue(); +} + +// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for +// builtins) into loads with swaps. +SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + SDValue Chain; + SDValue Base; + MachineMemOperand *MMO; + + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode for little endian VSX load"); + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(N); + Chain = LD->getChain(); + Base = LD->getBasePtr(); + MMO = LD->getMemOperand(); + // If the MMO suggests this isn't a load of a full vector, leave + // things alone. For a built-in, we have to make the change for + // correctness, so if there is a size problem that will be a bug. + if (MMO->getSize() < 16) + return SDValue(); + break; + } + case ISD::INTRINSIC_W_CHAIN: { + MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); + Chain = Intrin->getChain(); + // Similarly to the store case below, Intrin->getBasePtr() doesn't get + // us what we want. Get operand 2 instead. + Base = Intrin->getOperand(2); + MMO = Intrin->getMemOperand(); + break; + } + } + + MVT VecTy = N->getValueType(0).getSimpleVT(); + SDValue LoadOps[] = { Chain, Base }; + SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, + DAG.getVTList(VecTy, MVT::Other), + LoadOps, VecTy, MMO); + DCI.AddToWorklist(Load.getNode()); + Chain = Load.getValue(1); + SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, + DAG.getVTList(VecTy, MVT::Other), Chain, Load); + DCI.AddToWorklist(Swap.getNode()); + return Swap; +} + +// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for +// builtins) into stores with swaps. +SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + SDValue Chain; + SDValue Base; + unsigned SrcOpnd; + MachineMemOperand *MMO; + + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode for little endian VSX store"); + case ISD::STORE: { + StoreSDNode *ST = cast<StoreSDNode>(N); + Chain = ST->getChain(); + Base = ST->getBasePtr(); + MMO = ST->getMemOperand(); + SrcOpnd = 1; + // If the MMO suggests this isn't a store of a full vector, leave + // things alone. For a built-in, we have to make the change for + // correctness, so if there is a size problem that will be a bug. 
+ if (MMO->getSize() < 16) + return SDValue(); + break; + } + case ISD::INTRINSIC_VOID: { + MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); + Chain = Intrin->getChain(); + // Intrin->getBasePtr() oddly does not get what we want. + Base = Intrin->getOperand(3); + MMO = Intrin->getMemOperand(); + SrcOpnd = 2; + break; + } + } + + SDValue Src = N->getOperand(SrcOpnd); + MVT VecTy = Src.getValueType().getSimpleVT(); + SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, + DAG.getVTList(VecTy, MVT::Other), Chain, Src); + DCI.AddToWorklist(Swap.getNode()); + Chain = Swap.getValue(1); + SDValue StoreOps[] = { Chain, Swap, Base }; + SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, + DAG.getVTList(MVT::Other), + StoreOps, VecTy, MMO); + DCI.AddToWorklist(Store.getNode()); + return Store; +} + +SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + switch (N->getOpcode()) { + default: break; + case PPCISD::SHL: + if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. + return N->getOperand(0); + break; + case PPCISD::SRL: + if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. + return N->getOperand(0); + break; + case PPCISD::SRA: + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (C->isNullValue() || // 0 >>s V -> 0. + C->isAllOnesValue()) // -1 >>s V -> -1. + return N->getOperand(0); + } + break; + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + return DAGCombineExtBoolTrunc(N, DCI); + case ISD::TRUNCATE: + case ISD::SETCC: + case ISD::SELECT_CC: + return DAGCombineTruncBoolExt(N, DCI); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return combineFPToIntToFP(N, DCI); + case ISD::STORE: { + // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)). + if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() && + N->getOperand(1).getOpcode() == ISD::FP_TO_SINT && + N->getOperand(1).getValueType() == MVT::i32 && + N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) { + SDValue Val = N->getOperand(1).getOperand(0); + if (Val.getValueType() == MVT::f32) { + Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); + DCI.AddToWorklist(Val.getNode()); + } + Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val); + DCI.AddToWorklist(Val.getNode()); + + SDValue Ops[] = { + N->getOperand(0), Val, N->getOperand(2), + DAG.getValueType(N->getOperand(1).getValueType()) + }; + + Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, + DAG.getVTList(MVT::Other), Ops, + cast<StoreSDNode>(N)->getMemoryVT(), + cast<StoreSDNode>(N)->getMemOperand()); + DCI.AddToWorklist(Val.getNode()); + return Val; + } + + // Turn STORE (BSWAP) -> sthbrx/stwbrx. + if (cast<StoreSDNode>(N)->isUnindexed() && + N->getOperand(1).getOpcode() == ISD::BSWAP && + N->getOperand(1).getNode()->hasOneUse() && + (N->getOperand(1).getValueType() == MVT::i32 || + N->getOperand(1).getValueType() == MVT::i16 || + (Subtarget.hasLDBRX() && Subtarget.isPPC64() && + N->getOperand(1).getValueType() == MVT::i64))) { + SDValue BSwapOp = N->getOperand(1).getOperand(0); + // Do an any-extend to 32-bits if this is a half-word input. 
+      if (BSwapOp.getValueType() == MVT::i16)
+        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+
+      SDValue Ops[] = {
+        N->getOperand(0), BSwapOp, N->getOperand(2),
+        DAG.getValueType(N->getOperand(1).getValueType())
+      };
+      return
+        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
+                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
+                                cast<StoreSDNode>(N)->getMemOperand());
+    }
+
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    EVT VT = N->getOperand(1).getValueType();
+    if (VT.isSimple()) {
+      MVT StoreVT = VT.getSimpleVT();
+      if (Subtarget.hasVSX() && Subtarget.isLittleEndian() &&
+          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+        return expandVSXStoreForLE(N, DCI);
+    }
+    break;
+  }
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT VT = LD->getValueType(0);
+
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (VT.isSimple()) {
+      MVT LoadVT = VT.getSimpleVT();
+      if (Subtarget.hasVSX() && Subtarget.isLittleEndian() &&
+          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+        return expandVSXLoadForLE(N, DCI);
+    }
+
+    EVT MemVT = LD->getMemoryVT();
+    Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
+    Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
+    unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
+    if (LD->isUnindexed() && VT.isVector() &&
+        ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
+          // P8 and later hardware should just use LOAD.
+          !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+                                       VT == MVT::v4i32 || VT == MVT::v4f32)) ||
+         (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
+          LD->getAlignment() >= ScalarABIAlignment)) &&
+        LD->getAlignment() < ABIAlignment) {
+      // This is a type-legal unaligned Altivec or QPX load.
+      SDValue Chain = LD->getChain();
+      SDValue Ptr = LD->getBasePtr();
+      bool isLittleEndian = Subtarget.isLittleEndian();
+
+      // This implements the loading of unaligned vectors as described in
+      // the venerable Apple Velocity Engine overview. Specifically:
+      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
+      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
+      //
+      // The general idea is to expand a sequence of one or more unaligned
+      // loads into an alignment-based permutation-control instruction (lvsl
+      // or lvsr), a series of regular vector loads (which always truncate
+      // their input address to an aligned address), and a series of
+      // permutations. The results of these permutations are the requested
+      // loaded values. The trick is that the last "extra" load is not taken
+      // from the address you might suspect (sizeof(vector) bytes after the
+      // last requested load), but rather sizeof(vector) - 1 bytes after the
+      // last requested vector. The point of this is to avoid a page fault if
+      // the base address happened to be aligned. This works because if the
+      // base address is aligned, then adding less than a full vector length
+      // will cause the last vector in the sequence to be (re)loaded.
+      // Otherwise, the next vector will be fetched as you might suspect was
+      // necessary.
+
+      // We might be able to reuse the permutation generation from
+      // a different base address offset from this one by an aligned amount.
+ // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this + // optimization later. + Intrinsic::ID Intr, IntrLD, IntrPerm; + MVT PermCntlTy, PermTy, LDTy; + if (Subtarget.hasAltivec()) { + Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl; + IntrLD = Intrinsic::ppc_altivec_lvx; + IntrPerm = Intrinsic::ppc_altivec_vperm; + PermCntlTy = MVT::v16i8; + PermTy = MVT::v4i32; + LDTy = MVT::v4i32; + } else { + Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : + Intrinsic::ppc_qpx_qvlpcls; + IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : + Intrinsic::ppc_qpx_qvlfs; + IntrPerm = Intrinsic::ppc_qpx_qvfperm; + PermCntlTy = MVT::v4f64; + PermTy = MVT::v4f64; + LDTy = MemVT.getSimpleVT(); + } + + SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); + + // Create the new MMO for the new base load. It is like the original MMO, + // but represents an area in memory almost twice the vector size centered + // on the original address. If the address is unaligned, we might start + // reading up to (sizeof(vector)-1) bytes below the address of the + // original unaligned load. + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *BaseMMO = + MF.getMachineMemOperand(LD->getMemOperand(), + -(long)MemVT.getStoreSize()+1, + 2*MemVT.getStoreSize()-1); + + // Create the new base load. + SDValue LDXIntID = + DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); + SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; + SDValue BaseLoad = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, + DAG.getVTList(PermTy, MVT::Other), + BaseLoadOps, LDTy, BaseMMO); + + // Note that the value of IncOffset (which is provided to the next + // load's pointer info offset value, and thus used to calculate the + // alignment), and the value of IncValue (which is actually used to + // increment the pointer value) are different! This is because we + // require the next load to appear to be aligned, even though it + // is actually offset from the base pointer by a lesser amount. + int IncOffset = VT.getSizeInBits() / 8; + int IncValue = IncOffset; + + // Walk (both up and down) the chain looking for another load at the real + // (aligned) offset (the alignment of the other load does not matter in + // this case). If found, then do not use the offset reduction trick, as + // that will prevent the loads from being later combined (as they would + // otherwise be duplicates). + if (!findConsecutiveLoad(LD, DAG)) + --IncValue; + + SDValue Increment = + DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + + MachineMemOperand *ExtraMMO = + MF.getMachineMemOperand(LD->getMemOperand(), + 1, 2*MemVT.getStoreSize()-1); + SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; + SDValue ExtraLoad = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, + DAG.getVTList(PermTy, MVT::Other), + ExtraLoadOps, LDTy, ExtraMMO); + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + BaseLoad.getValue(1), ExtraLoad.getValue(1)); + + // Because vperm has a big-endian bias, we must reverse the order + // of the input vectors and complement the permute control vector + // when generating little endian code. We have already handled the + // latter by using lvsr instead of lvsl, so just reverse BaseLoad + // and ExtraLoad here. 
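+      //
+      // As a schematic illustration (big-endian case; register names are
+      // arbitrary), a single unaligned Altivec load expands roughly to:
+      //   lvsl  v0, 0, rBase      # permute control from low address bits
+      //   lvx   v1, 0, rBase      # aligned load covering the first bytes
+      //   addi  rTmp, rBase, 15   # 15, not 16, to avoid a page fault
+      //   lvx   v2, 0, rTmp       # aligned load covering the last bytes
+      //   vperm v3, v1, v2, v0    # v3 = the requested unaligned value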
+ SDValue Perm; + if (isLittleEndian) + Perm = BuildIntrinsicOp(IntrPerm, + ExtraLoad, BaseLoad, PermCntl, DAG, dl); + else + Perm = BuildIntrinsicOp(IntrPerm, + BaseLoad, ExtraLoad, PermCntl, DAG, dl); + + if (VT != PermTy) + Perm = Subtarget.hasAltivec() ? + DAG.getNode(ISD::BITCAST, dl, VT, Perm) : + DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX + DAG.getTargetConstant(1, dl, MVT::i64)); + // second argument is 1 because this rounding + // is always exact. + + // The output of the permutation is our loaded result, the TokenFactor is + // our new chain. + DCI.CombineTo(N, Perm, TF); + return SDValue(N, 0); + } + } + break; + case ISD::INTRINSIC_WO_CHAIN: { + bool isLittleEndian = Subtarget.isLittleEndian(); + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr + : Intrinsic::ppc_altivec_lvsl); + if ((IID == Intr || + IID == Intrinsic::ppc_qpx_qvlpcld || + IID == Intrinsic::ppc_qpx_qvlpcls) && + N->getOperand(1)->getOpcode() == ISD::ADD) { + SDValue Add = N->getOperand(1); + + int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? + 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; + + if (DAG.MaskedValueIsZero( + Add->getOperand(1), + APInt::getAllOnesValue(Bits /* alignment */) + .zext( + Add.getValueType().getScalarType().getSizeInBits()))) { + SDNode *BasePtr = Add->getOperand(0).getNode(); + for (SDNode::use_iterator UI = BasePtr->use_begin(), + UE = BasePtr->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) { + // We've found another LVSL/LVSR, and this address is an aligned + // multiple of that one. The results will be the same, so use the + // one we've just found instead. + + return SDValue(*UI, 0); + } + } + } + + if (isa<ConstantSDNode>(Add->getOperand(1))) { + SDNode *BasePtr = Add->getOperand(0).getNode(); + for (SDNode::use_iterator UI = BasePtr->use_begin(), + UE = BasePtr->use_end(); UI != UE; ++UI) { + if (UI->getOpcode() == ISD::ADD && + isa<ConstantSDNode>(UI->getOperand(1)) && + (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() - + cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) % + (1ULL << Bits) == 0) { + SDNode *OtherAdd = *UI; + for (SDNode::use_iterator VI = OtherAdd->use_begin(), + VE = OtherAdd->use_end(); VI != VE; ++VI) { + if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) { + return SDValue(*VI, 0); + } + } + } + } + } + } + } + + break; + case ISD::INTRINSIC_W_CHAIN: { + // For little endian, VSX loads require generating lxvd2x/xxswapd. + if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) { + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + default: + break; + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x: + return expandVSXLoadForLE(N, DCI); + } + } + break; + } + case ISD::INTRINSIC_VOID: { + // For little endian, VSX stores require generating xxswapd/stxvd2x. + if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) { + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + default: + break; + case Intrinsic::ppc_vsx_stxvw4x: + case Intrinsic::ppc_vsx_stxvd2x: + return expandVSXStoreForLE(N, DCI); + } + } + break; + } + case ISD::BSWAP: + // Turn BSWAP (LOAD) -> lhbrx/lwbrx. 
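+    //
+    // As a schematic illustration (register names are arbitrary), this turns
+    //   (i32 (bswap (load rPtr)))
+    // into a single byte-reversed load:
+    //   lwbrx r3, 0, rPtr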
+ if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && + N->getOperand(0).hasOneUse() && + (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || + (Subtarget.hasLDBRX() && Subtarget.isPPC64() && + N->getValueType(0) == MVT::i64))) { + SDValue Load = N->getOperand(0); + LoadSDNode *LD = cast<LoadSDNode>(Load); + // Create the byte-swapping load. + SDValue Ops[] = { + LD->getChain(), // Chain + LD->getBasePtr(), // Ptr + DAG.getValueType(N->getValueType(0)) // VT + }; + SDValue BSLoad = + DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, + DAG.getVTList(N->getValueType(0) == MVT::i64 ? + MVT::i64 : MVT::i32, MVT::Other), + Ops, LD->getMemoryVT(), LD->getMemOperand()); + + // If this is an i16 load, insert the truncate. + SDValue ResVal = BSLoad; + if (N->getValueType(0) == MVT::i16) + ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); + + // First, combine the bswap away. This makes the value produced by the + // load dead. + DCI.CombineTo(N, ResVal); + + // Next, combine the load away, we give it a bogus result value but a real + // chain result. The result value is dead because the bswap is dead. + DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); + + // Return N so it doesn't get rechecked! + return SDValue(N, 0); + } + + break; + case PPCISD::VCMP: { + // If a VCMPo node already exists with exactly the same operands as this + // node, use its result instead of this node (VCMPo computes both a CR6 and + // a normal output). + // + if (!N->getOperand(0).hasOneUse() && + !N->getOperand(1).hasOneUse() && + !N->getOperand(2).hasOneUse()) { + + // Scan all of the users of the LHS, looking for VCMPo's that match. + SDNode *VCMPoNode = nullptr; + + SDNode *LHSN = N->getOperand(0).getNode(); + for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); + UI != E; ++UI) + if (UI->getOpcode() == PPCISD::VCMPo && + UI->getOperand(1) == N->getOperand(1) && + UI->getOperand(2) == N->getOperand(2) && + UI->getOperand(0) == N->getOperand(0)) { + VCMPoNode = *UI; + break; + } + + // If there is no VCMPo node, or if the flag value has a single use, don't + // transform this. + if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) + break; + + // Look at the (necessarily single) use of the flag value. If it has a + // chain, this transformation is more complex. Note that multiple things + // could use the value result, which we should ignore. + SDNode *FlagUser = nullptr; + for (SDNode::use_iterator UI = VCMPoNode->use_begin(); + FlagUser == nullptr; ++UI) { + assert(UI != VCMPoNode->use_end() && "Didn't find user!"); + SDNode *User = *UI; + for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { + if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { + FlagUser = User; + break; + } + } + } + + // If the user is a MFOCRF instruction, we know this is safe. + // Otherwise we give up for right now. + if (FlagUser->getOpcode() == PPCISD::MFOCRF) + return SDValue(VCMPoNode, 0); + } + break; + } + case ISD::BRCOND: { + SDValue Cond = N->getOperand(1); + SDValue Target = N->getOperand(2); + + if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero) { + + // We now need to make the intrinsic dead (it cannot be instruction + // selected). 
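+      // Rewiring all uses of the intrinsic's chain result to its input chain
+      // detaches the intrinsic from the DAG; the decrement-and-branch is
+      // then emitted as a single BDNZ below.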
+ DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); + assert(Cond.getNode()->hasOneUse() && + "Counter decrement has more than one use"); + + return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, + N->getOperand(0), Target); + } + } + break; + case ISD::BR_CC: { + // If this is a branch on an altivec predicate comparison, lower this so + // that we don't have to do a MFOCRF: instead, branch directly on CR6. This + // lowering is done pre-legalize, because the legalizer lowers the predicate + // compare down to code that is difficult to reassemble. + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); + + // Sometimes the promoted value of the intrinsic is ANDed by some non-zero + // value. If so, pass-through the AND to get to the intrinsic. + if (LHS.getOpcode() == ISD::AND && + LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero && + isa<ConstantSDNode>(LHS.getOperand(1)) && + !isNullConstant(LHS.getOperand(1))) + LHS = LHS.getOperand(0); + + if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero && + isa<ConstantSDNode>(RHS)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Counter decrement comparison is not EQ or NE"); + + unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); + bool isBDNZ = (CC == ISD::SETEQ && Val) || + (CC == ISD::SETNE && !Val); + + // We now need to make the intrinsic dead (it cannot be instruction + // selected). + DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); + assert(LHS.getNode()->hasOneUse() && + "Counter decrement has more than one use"); + + return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, + N->getOperand(0), N->getOperand(4)); + } + + int CompareOpc; + bool isDot; + + if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && + isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && + getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { + assert(isDot && "Can't compare against a vector result!"); + + // If this is a comparison against something other than 0/1, then we know + // that the condition is never/always true. + unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (Val != 0 && Val != 1) { + if (CC == ISD::SETEQ) // Cond never true, remove branch. + return N->getOperand(0); + // Always !=, turn it into an unconditional branch. + return DAG.getNode(ISD::BR, dl, MVT::Other, + N->getOperand(0), N->getOperand(4)); + } + + bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); + + // Create the PPCISD altivec 'dot' comparison node. + SDValue Ops[] = { + LHS.getOperand(2), // LHS of compare + LHS.getOperand(3), // RHS of compare + DAG.getConstant(CompareOpc, dl, MVT::i32) + }; + EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; + SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); + + // Unpack the result based on how the target uses it. + PPC::Predicate CompOpc; + switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) { + default: // Can't happen, don't crash on invalid number though. + case 0: // Branch on the value of the EQ bit of CR6. + CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; + break; + case 1: // Branch on the inverted value of the EQ bit of CR6. + CompOpc = BranchOnWhenPredTrue ? 
PPC::PRED_NE : PPC::PRED_EQ; + break; + case 2: // Branch on the value of the LT bit of CR6. + CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; + break; + case 3: // Branch on the inverted value of the LT bit of CR6. + CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT; + break; + } + + return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), + DAG.getConstant(CompOpc, dl, MVT::i32), + DAG.getRegister(PPC::CR6, MVT::i32), + N->getOperand(4), CompNode.getValue(1)); + } + break; + } + } + + return SDValue(); +} + +SDValue +PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + std::vector<SDNode *> *Created) const { + // fold (sdiv X, pow2) + EVT VT = N->getValueType(0); + if (VT == MVT::i64 && !Subtarget.isPPC64()) + return SDValue(); + if ((VT != MVT::i32 && VT != MVT::i64) || + !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + + bool IsNegPow2 = (-Divisor).isPowerOf2(); + unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros(); + SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); + + SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); + if (Created) + Created->push_back(Op.getNode()); + + if (IsNegPow2) { + Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); + if (Created) + Created->push_back(Op.getNode()); + } + + return Op; +} + +//===----------------------------------------------------------------------===// +// Inline Assembly Support +//===----------------------------------------------------------------------===// + +void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); + switch (Op.getOpcode()) { + default: break; + case PPCISD::LBRX: { + // lhbrx is known to have the top bits cleared out. + if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16) + KnownZero = 0xFFFF0000; + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { + default: break; + case Intrinsic::ppc_altivec_vcmpbfp_p: + case Intrinsic::ppc_altivec_vcmpeqfp_p: + case Intrinsic::ppc_altivec_vcmpequb_p: + case Intrinsic::ppc_altivec_vcmpequh_p: + case Intrinsic::ppc_altivec_vcmpequw_p: + case Intrinsic::ppc_altivec_vcmpequd_p: + case Intrinsic::ppc_altivec_vcmpgefp_p: + case Intrinsic::ppc_altivec_vcmpgtfp_p: + case Intrinsic::ppc_altivec_vcmpgtsb_p: + case Intrinsic::ppc_altivec_vcmpgtsh_p: + case Intrinsic::ppc_altivec_vcmpgtsw_p: + case Intrinsic::ppc_altivec_vcmpgtsd_p: + case Intrinsic::ppc_altivec_vcmpgtub_p: + case Intrinsic::ppc_altivec_vcmpgtuh_p: + case Intrinsic::ppc_altivec_vcmpgtuw_p: + case Intrinsic::ppc_altivec_vcmpgtud_p: + KnownZero = ~1U; // All bits but the low one are known to be zero. + break; + } + } + } +} + +unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + switch (Subtarget.getDarwinDirective()) { + default: break; + case PPC::DIR_970: + case PPC::DIR_PWR4: + case PPC::DIR_PWR5: + case PPC::DIR_PWR5X: + case PPC::DIR_PWR6: + case PPC::DIR_PWR6X: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: { + if (!ML) + break; + + const PPCInstrInfo *TII = Subtarget.getInstrInfo(); + + // For small loops (between 5 and 8 instructions), align to a 32-byte + // boundary so that the entire loop fits in one instruction-cache line. 
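+    // Note that the value returned here is the log2 of the alignment in
+    // bytes, so returning 5 below requests 2^5 = 32-byte alignment; e.g. a
+    // six-instruction loop occupies 24 bytes and falls in the (16, 32] size
+    // window that benefits from it.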
+ uint64_t LoopSize = 0; + for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) + for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { + LoopSize += TII->GetInstSizeInBytes(J); + if (LoopSize > 32) + break; + } + + if (LoopSize > 16 && LoopSize <= 32) + return 5; + + break; + } + } + + return TargetLowering::getPrefLoopAlignment(ML); +} + +/// getConstraintType - Given a constraint, return the type of +/// constraint it is for this target. +PPCTargetLowering::ConstraintType +PPCTargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'b': + case 'r': + case 'f': + case 'v': + case 'y': + return C_RegisterClass; + case 'Z': + // FIXME: While Z does indicate a memory constraint, it specifically + // indicates an r+r address (used in conjunction with the 'y' modifier + // in the replacement string). Currently, we're forcing the base + // register to be r0 in the asm printer (which is interpreted as zero) + // and forming the complete address in the second register. This is + // suboptimal. + return C_Memory; + } + } else if (Constraint == "wc") { // individual CR bits. + return C_RegisterClass; + } else if (Constraint == "wa" || Constraint == "wd" || + Constraint == "wf" || Constraint == "ws") { + return C_RegisterClass; // VSX registers. + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +PPCTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + + // Look at the constraint type. + if (StringRef(constraint) == "wc" && type->isIntegerTy(1)) + return CW_Register; // an individual CR bit. 
+ else if ((StringRef(constraint) == "wa" || + StringRef(constraint) == "wd" || + StringRef(constraint) == "wf") && + type->isVectorTy()) + return CW_Register; + else if (StringRef(constraint) == "ws" && type->isDoubleTy()) + return CW_Register; + + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'b': + if (type->isIntegerTy()) + weight = CW_Register; + break; + case 'f': + if (type->isFloatTy()) + weight = CW_Register; + break; + case 'd': + if (type->isDoubleTy()) + weight = CW_Register; + break; + case 'v': + if (type->isVectorTy()) + weight = CW_Register; + break; + case 'y': + weight = CW_Register; + break; + case 'Z': + weight = CW_Memory; + break; + } + return weight; +} + +std::pair<unsigned, const TargetRegisterClass *> +PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + if (Constraint.size() == 1) { + // GCC RS6000 Constraint Letters + switch (Constraint[0]) { + case 'b': // R1-R31 + if (VT == MVT::i64 && Subtarget.isPPC64()) + return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); + return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); + case 'r': // R0-R31 + if (VT == MVT::i64 && Subtarget.isPPC64()) + return std::make_pair(0U, &PPC::G8RCRegClass); + return std::make_pair(0U, &PPC::GPRCRegClass); + case 'f': + if (VT == MVT::f32 || VT == MVT::i32) + return std::make_pair(0U, &PPC::F4RCRegClass); + if (VT == MVT::f64 || VT == MVT::i64) + return std::make_pair(0U, &PPC::F8RCRegClass); + if (VT == MVT::v4f64 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QFRCRegClass); + if (VT == MVT::v4f32 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QSRCRegClass); + break; + case 'v': + if (VT == MVT::v4f64 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QFRCRegClass); + if (VT == MVT::v4f32 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QSRCRegClass); + if (Subtarget.hasAltivec()) + return std::make_pair(0U, &PPC::VRRCRegClass); + case 'y': // crrc + return std::make_pair(0U, &PPC::CRRCRegClass); + } + } else if (Constraint == "wc" && Subtarget.useCRBits()) { + // An individual CR bit. + return std::make_pair(0U, &PPC::CRBITRCRegClass); + } else if ((Constraint == "wa" || Constraint == "wd" || + Constraint == "wf") && Subtarget.hasVSX()) { + return std::make_pair(0U, &PPC::VSRCRegClass); + } else if (Constraint == "ws" && Subtarget.hasVSX()) { + if (VT == MVT::f32 && Subtarget.hasP8Vector()) + return std::make_pair(0U, &PPC::VSSRCRegClass); + else + return std::make_pair(0U, &PPC::VSFRCRegClass); + } + + std::pair<unsigned, const TargetRegisterClass *> R = + TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + + // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers + // (which we call X[0-9]+). If a 64-bit value has been requested, and a + // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent + // register. + // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use + // the AsmName field from *RegisterInfo.td, then this would not be necessary. + if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && + PPC::GPRCRegClass.contains(R.first)) + return std::make_pair(TRI->getMatchingSuperReg(R.first, + PPC::sub_32, &PPC::G8RCRegClass), + &PPC::G8RCRegClass); + + // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same. 
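+  // For instance, an inline-asm statement whose clobber list names "cc"
+  // (rather than "cr0") should still end up clobbering CR0 here.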
+  if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
+    R.first = PPC::CR0;
+    R.second = &PPC::CRRCRegClass;
+  }
+
+  return R;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Only support length 1 constraints.
+  if (Constraint.length() > 1) return;
+
+  char Letter = Constraint[0];
+  switch (Letter) {
+  default: break;
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P': {
+    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+    if (!CST) return; // Must be an immediate to match.
+    SDLoc dl(Op);
+    int64_t Value = CST->getSExtValue();
+    EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
+                         // numbers are printed as such.
+    switch (Letter) {
+    default: llvm_unreachable("Unknown constraint letter!");
+    case 'I': // "I" is a signed 16-bit constant.
+      if (isInt<16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
+      if (isShiftedUInt<16, 16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
+      if (isShiftedInt<16, 16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
+      if (isUInt<16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'M': // "M" is a constant that is greater than 31.
+      if (Value > 31)
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'N': // "N" is a positive constant that is an exact power of two.
+      if (Value > 0 && isPowerOf2_64(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'O': // "O" is the constant zero.
+      if (Value == 0)
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
+      if (isInt<16>(-Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    }
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  // Handle standard constraint letters.
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
+  // PPC does not allow r+i addressing modes for vectors!
+  if (Ty->isVectorTy() && AM.BaseOffs != 0)
+    return false;
+
+  // PPC allows a sign-extended 16-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+    return false;
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // PPC only supports r+r and r+imm addressing:
+  switch (AM.Scale) {
+  case 0: // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default:
+    // No other scales are supported.
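+    // PPC has no scaled-index addressing mode, so an index such as 4*r must
+    // be formed explicitly (e.g. slwi rTmp, rIdx, 2) and then used as plain
+    // r+r; rejecting other scales here lets loop strength reduction account
+    // for that extra instruction.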
+ return false; + } + + return true; +} + +SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + if (verifyReturnAddressArgumentIsConstant(Op, DAG)) + return SDValue(); + + SDLoc dl(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + + // Make sure the function does not optimize away the store of the RA to + // the stack. + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setLRStoreRequired(); + bool isPPC64 = Subtarget.isPPC64(); + auto PtrVT = getPointerTy(MF.getDataLayout()); + + if (Depth > 0) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = + DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, + isPPC64 ? MVT::i64 : MVT::i32); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Just load the return address off the stack. + SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, + MachinePointerInfo(), false, false, false, 0); +} + +SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + bool isPPC64 = PtrVT == MVT::i64; + + // Naked functions never have a frame pointer, and so we use r1. For all + // other functions, this decision must be delayed until during PEI. + unsigned FrameReg; + if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) + FrameReg = isPPC64 ? PPC::X1 : PPC::R1; + else + FrameReg = isPPC64 ? PPC::FP8 : PPC::FP; + + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, + PtrVT); + while (Depth--) + FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(), + FrameAddr, MachinePointerInfo(), false, false, + false, 0); + return FrameAddr; +} + +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. +unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); + + if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || + (!isPPC64 && VT != MVT::i32)) + report_fatal_error("Invalid register global variable type"); + + bool is64Bit = isPPC64 && VT == MVT::i64; + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("r1", is64Bit ? PPC::X1 : PPC::R1) + .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) + .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : + (is64Bit ? PPC::X13 : PPC::R13)) + .Default(0); + + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} + +bool +PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // The PowerPC target isn't yet aware of offsets. 
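+  // Folding would attach a constant offset directly to the
+  // TargetGlobalAddress node (addressing foo+8 as a single relocated
+  // operand); returning false keeps such adds as separate nodes.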
+ return false; +} + +bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + + switch (Intrinsic) { + case Intrinsic::ppc_qpx_qvlfd: + case Intrinsic::ppc_qpx_qvlfs: + case Intrinsic::ppc_qpx_qvlfcd: + case Intrinsic::ppc_qpx_qvlfcs: + case Intrinsic::ppc_qpx_qvlfiwa: + case Intrinsic::ppc_qpx_qvlfiwz: + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + case Intrinsic::ppc_altivec_lvebx: + case Intrinsic::ppc_altivec_lvehx: + case Intrinsic::ppc_altivec_lvewx: + case Intrinsic::ppc_vsx_lxvd2x: + case Intrinsic::ppc_vsx_lxvw4x: { + EVT VT; + switch (Intrinsic) { + case Intrinsic::ppc_altivec_lvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_lvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_lvewx: + VT = MVT::i32; + break; + case Intrinsic::ppc_vsx_lxvd2x: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvlfd: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvlfs: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvlfcd: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvlfcs: + VT = MVT::v2f32; + break; + default: + VT = MVT::v4i32; + break; + } + + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(0); + Info.offset = -VT.getStoreSize()+1; + Info.size = 2*VT.getStoreSize()-1; + Info.align = 1; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::ppc_qpx_qvlfda: + case Intrinsic::ppc_qpx_qvlfsa: + case Intrinsic::ppc_qpx_qvlfcda: + case Intrinsic::ppc_qpx_qvlfcsa: + case Intrinsic::ppc_qpx_qvlfiwaa: + case Intrinsic::ppc_qpx_qvlfiwza: { + EVT VT; + switch (Intrinsic) { + case Intrinsic::ppc_qpx_qvlfda: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvlfsa: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvlfcda: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvlfcsa: + VT = MVT::v2f32; + break; + default: + VT = MVT::v4i32; + break; + } + + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.size = VT.getStoreSize(); + Info.align = 1; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::ppc_qpx_qvstfd: + case Intrinsic::ppc_qpx_qvstfs: + case Intrinsic::ppc_qpx_qvstfcd: + case Intrinsic::ppc_qpx_qvstfcs: + case Intrinsic::ppc_qpx_qvstfiw: + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + case Intrinsic::ppc_altivec_stvebx: + case Intrinsic::ppc_altivec_stvehx: + case Intrinsic::ppc_altivec_stvewx: + case Intrinsic::ppc_vsx_stxvd2x: + case Intrinsic::ppc_vsx_stxvw4x: { + EVT VT; + switch (Intrinsic) { + case Intrinsic::ppc_altivec_stvebx: + VT = MVT::i8; + break; + case Intrinsic::ppc_altivec_stvehx: + VT = MVT::i16; + break; + case Intrinsic::ppc_altivec_stvewx: + VT = MVT::i32; + break; + case Intrinsic::ppc_vsx_stxvd2x: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvstfd: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvstfs: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvstfcd: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvstfcs: + VT = MVT::v2f32; + break; + default: + VT = MVT::v4i32; + break; + } + + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(1); + Info.offset = -VT.getStoreSize()+1; + Info.size = 2*VT.getStoreSize()-1; + Info.align = 1; + Info.vol = false; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case 
Intrinsic::ppc_qpx_qvstfda:
+  case Intrinsic::ppc_qpx_qvstfsa:
+  case Intrinsic::ppc_qpx_qvstfcda:
+  case Intrinsic::ppc_qpx_qvstfcsa:
+  case Intrinsic::ppc_qpx_qvstfiwa: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_qpx_qvstfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcsa:
+      VT = MVT::v2f32;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.size = VT.getStoreSize();
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
+/// getOptimalMemOpType - Returns the target-specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero, the destination alignment can satisfy any
+/// constraint. Similarly, if SrcAlign is zero there is no need to check it
+/// against an alignment requirement, probably because the source does not
+/// need to be loaded. If 'IsMemset' is true, this is expanding a memset;
+/// if 'ZeroMemset' is also true, it is a memset of zero. 'MemcpyStrSrc'
+/// indicates whether the memcpy source is constant so it does not need to
+/// be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
+                                           bool IsMemset, bool ZeroMemset,
+                                           bool MemcpyStrSrc,
+                                           MachineFunction &MF) const {
+  if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
+    const Function *F = MF.getFunction();
+    // When expanding a memset, require at least two QPX instructions to cover
+    // the cost of loading the value to be stored from the constant pool.
+    if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
+        (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+        !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+      return MVT::v4f64;
+    }
+
+    // We should use Altivec/VSX loads and stores when available. For unaligned
+    // addresses, unaligned VSX loads are only fast starting with the P8.
+    if (Subtarget.hasAltivec() && Size >= 16 &&
+        (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
+         ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
+      return MVT::v4i32;
+  }
+
+  if (Subtarget.isPPC64()) {
+    return MVT::i64;
+  }
+
+  return MVT::i32;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
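+/// On PPC, any integer that fits in 64 bits can be materialized in at most a
+/// handful of instructions (e.g. lis/ori for 32 bits, extended with
+/// rldicr/oris/ori for 64 bits), which is generally cheaper than a load from
+/// the constant pool.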
+bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + return !(BitSize == 0 || BitSize > 64); +} + +bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 == 64 && NumBits2 == 32; +} + +bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 == 64 && NumBits2 == 32; +} + +bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + // Generally speaking, zexts are not free, but they are free when they can be + // folded with other operations. + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) { + EVT MemVT = LD->getMemoryVT(); + if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || + (Subtarget.isPPC64() && MemVT == MVT::i32)) && + (LD->getExtensionType() == ISD::NON_EXTLOAD || + LD->getExtensionType() == ISD::ZEXTLOAD)) + return true; + } + + // FIXME: Add other cases... + // - 32-bit shifts with a zext to i64 + // - zext after ctlz, bswap, etc. + // - zext after and by a constant mask + + return TargetLowering::isZExtFree(Val, VT2); +} + +bool PPCTargetLowering::isFPExtFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return true; +} + +bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return isInt<16>(Imm) || isUInt<16>(Imm); +} + +bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { + return isInt<16>(Imm) || isUInt<16>(Imm); +} + +bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { + if (DisablePPCUnaligned) + return false; + + // PowerPC supports unaligned memory access for simple non-vector types. + // Although accessing unaligned addresses is not as efficient as accessing + // aligned addresses, it is generally more efficient than manual expansion, + // and generally only traps for software emulation when crossing page + // boundaries. + + if (!VT.isSimple()) + return false; + + if (VT.getSimpleVT().isVector()) { + if (Subtarget.hasVSX()) { + if (VT != MVT::v2f64 && VT != MVT::v2i64 && + VT != MVT::v4f32 && VT != MVT::v4i32) + return false; + } else { + return false; + } + } + + if (VT == MVT::ppcf128) + return false; + + if (Fast) + *Fast = true; + + return true; +} + +bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +const MCPhysReg * +PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { + // LR is a callee-save register, but we must treat it as clobbered by any call + // site. Hence we include LR in the scratch registers, which are in turn added + // as implicit-defs for stackmaps and patchpoints. The same reasoning applies + // to CTR, which is used by any indirect call. + static const MCPhysReg ScratchRegs[] = { + PPC::X12, PPC::LR8, PPC::CTR8, 0 + }; + + return ScratchRegs; +} + +unsigned PPCTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? 
PPC::X3 : PPC::R3; +} + +unsigned PPCTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; +} + +bool +PPCTargetLowering::shouldExpandBuildVectorWithShuffles( + EVT VT , unsigned DefinedValues) const { + if (VT == MVT::v2i64) + return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves + + if (Subtarget.hasQPX()) { + if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1) + return true; + } + + return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); +} + +Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { + if (DisableILPPref || Subtarget.enableMachineScheduler()) + return TargetLowering::getSchedulingPreference(N); + + return Sched::ILP; +} + +// Create a fast isel object. +FastISel * +PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) const { + return PPC::createFastISel(FuncInfo, LibInfo); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h new file mode 100644 index 0000000..44bcb89 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -0,0 +1,902 @@ +//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that PPC uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H +#define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H + +#include "PPC.h" +#include "PPCInstrInfo.h" +#include "PPCRegisterInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + namespace PPCISD { + enum NodeType : unsigned { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// FSEL - Traditional three-operand fsel node. + /// + FSEL, + + /// FCFID - The FCFID instruction, taking an f64 operand and producing + /// and f64 value containing the FP representation of the integer that + /// was temporarily in the f64 operand. + FCFID, + + /// Newer FCFID[US] integer-to-floating-point conversion instructions for + /// unsigned integers and single-precision outputs. + FCFIDU, FCFIDS, FCFIDUS, + + /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 + /// operand, producing an f64 value containing the integer representation + /// of that FP value. + FCTIDZ, FCTIWZ, + + /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for + /// unsigned integers. + FCTIDUZ, FCTIWUZ, + + /// Reciprocal estimate instructions (unary FP ops). + FRE, FRSQRTE, + + // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking + // three v4f32 operands and producing a v4f32 result. + VMADDFP, VNMSUBFP, + + /// VPERM - The PPC VPERM Instruction. + /// + VPERM, + + /// The CMPB instruction (takes two operands of i32 or i64). + CMPB, + + /// Hi/Lo - These represent the high and low 16-bit parts of a global + /// address respectively. 
These nodes have two operands, the first of + /// which must be a TargetGlobalAddress, and the second of which must be a + /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', + /// though these are usually folded into other nodes. + Hi, Lo, + + /// The following two target-specific nodes are used for calls through + /// function pointers in the 64-bit SVR4 ABI. + + /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an allocation on the stack. + DYNALLOC, + + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an offset from native SP to the address of the most recent + /// dynamic alloca. + DYNAREAOFFSET, + + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// These nodes represent the 32-bit PPC shifts that operate on 6-bit + /// shift amounts. These nodes are generated by the multi-precision shift + /// code. + SRL, SRA, SHL, + + /// The combination of sra[wd]i and addze used to implemented signed + /// integer division by a power of 2. The first operand is the dividend, + /// and the second is the constant shift amount (representing the + /// divisor). + SRA_ADDZE, + + /// CALL - A direct function call. + /// CALL_NOP is a call with the special NOP which follows 64-bit + /// SVR4 calls. + CALL, CALL_NOP, + + /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a + /// MTCTR instruction. + MTCTR, + + /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a + /// BCTRL instruction. + BCTRL, + + /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl + /// instruction and the TOC reload required on SVR4 PPC64. + BCTRL_LOAD_TOC, + + /// Return with a flag operand, matched by 'blr' + RET_FLAG, + + /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction. + /// This copies the bits corresponding to the specified CRREG into the + /// resultant GPR. Bits corresponding to other CR regs are undefined. + MFOCRF, + + /// Direct move from a VSX register to a GPR + MFVSR, + + /// Direct move from a GPR to a VSX register (algebraic) + MTVSRA, + + /// Direct move from a GPR to a VSX register (zero) + MTVSRZ, + + // FIXME: Remove these once the ANDI glue bug is fixed: + /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the + /// eq or gt bit of CR0 after executing andi. x, 1. This is used to + /// implement truncation of i32 or i64 to i1. + ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT, + + // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit + // target (returns (Lo, Hi)). It takes a chain operand. + READ_TIME_BASE, + + // EH_SJLJ_SETJMP - SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + + /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* + /// instructions. For lack of better number, we use the opcode number + /// encoding for the OPC field to identify the compare. For example, 838 + /// is VCMPGTSH. + VCMP, + + /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the + /// altivec VCMP*o instructions. For lack of better number, we use the + /// opcode number encoding for the OPC field to identify the compare. For + /// example, 838 is VCMPGTSH. 
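+    /// Unlike plain VCMP, the recording form also latches a summary of the
+    /// comparison into CR6, which lets branches on vector predicates test
+    /// CR6 directly instead of copying the result out through MFOCRF.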
+ VCMPo, + + /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This + /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the + /// condition register to branch on, OPC is the branch opcode to use (e.g. + /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is + /// an optional input flag argument. + COND_BRANCH, + + /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based + /// loops. + BDNZ, BDZ, + + /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding + /// towards zero. Used only as part of the long double-to-int + /// conversion sequence. + FADDRTZ, + + /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. + MFFS, + + /// TC_RETURN - A tail call return. + /// operand #0 chain + /// operand #1 callee (register or absolute) + /// operand #2 stack adjustment + /// operand #3 optional in flag + TC_RETURN, + + /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls + CR6SET, + CR6UNSET, + + /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS + /// on PPC32. + PPC32_GOT, + + /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and + /// local dynamic TLS on PPC32. + PPC32_PICGOT, + + /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec + /// TLS model, produces an ADDIS8 instruction that adds the GOT + /// base to sym\@got\@tprel\@ha. + ADDIS_GOT_TPREL_HA, + + /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec + /// TLS model, produces a LD instruction with base register G8RReg + /// and offset sym\@got\@tprel\@l. This completes the addition that + /// finds the offset of "sym" relative to the thread pointer. + LD_GOT_TPREL_L, + + /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS + /// model, produces an ADD instruction that adds the contents of + /// G8RReg to the thread pointer. Symbol contains a relocation + /// sym\@tls which is to be replaced by the thread pointer and + /// identifies to the linker that the instruction is part of a + /// TLS sequence. + ADD_TLS, + + /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS + /// model, produces an ADDIS8 instruction that adds the GOT base + /// register to sym\@got\@tlsgd\@ha. + ADDIS_TLSGD_HA, + + /// %X3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by + /// ADDIS_TLSGD_L_ADDR until after register assignment. + ADDI_TLSGD_L, + + /// %X3 = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS + /// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by + /// ADDIS_TLSGD_L_ADDR until after register assignment. + GET_TLS_ADDR, + + /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that + /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following + /// register assignment. + ADDI_TLSGD_L_ADDR, + + /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS + /// model, produces an ADDIS8 instruction that adds the GOT base + /// register to sym\@got\@tlsld\@ha. + ADDIS_TLSLD_HA, + + /// %X3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym\@got\@tlsld\@l and stores the result in X3. Hidden by + /// ADDIS_TLSLD_L_ADDR until after register assignment. + ADDI_TLSLD_L, + + /// %X3 = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS + /// model, produces a call to __tls_get_addr(sym\@tlsld). 
Hidden by + /// ADDIS_TLSLD_L_ADDR until after register assignment. + GET_TLSLD_ADDR, + + /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that + /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion + /// following register assignment. + ADDI_TLSLD_L_ADDR, + + /// G8RC = ADDIS_DTPREL_HA %X3, Symbol - For the local-dynamic TLS + /// model, produces an ADDIS8 instruction that adds X3 to + /// sym\@dtprel\@ha. + ADDIS_DTPREL_HA, + + /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym\@got\@dtprel\@l. + ADDI_DTPREL_L, + + /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded + /// during instruction selection to optimize a BUILD_VECTOR into + /// operations on splats. This is necessary to avoid losing these + /// optimizations due to constant folding. + VADD_SPLAT, + + /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned + /// operand identifies the operating system entry point. + SC, + + /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer. + CLRBHRB, + + /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch + /// history rolling buffer entry. + MFBHRBE, + + /// CHAIN = RFEBB CHAIN, State - Return from event-based branch. + RFEBB, + + /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little + /// endian. Maps to an xxswapd instruction that corrects an lxvd2x + /// or stxvd2x instruction. The chain is necessary because the + /// sequence replaces a load and needs to provide the same number + /// of outputs. + XXSWAPD, + + /// QVFPERM = This corresponds to the QPX qvfperm instruction. + QVFPERM, + + /// QVGPCI = This corresponds to the QPX qvgpci instruction. + QVGPCI, + + /// QVALIGNI = This corresponds to the QPX qvaligni instruction. + QVALIGNI, + + /// QVESPLATI = This corresponds to the QPX qvesplati instruction. + QVESPLATI, + + /// QBFLT = Access the underlying QPX floating-point boolean + /// representation. + QBFLT, + + /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a + /// byte-swapping store instruction. It byte-swaps the low "Type" bits of + /// the GPRC input, then stores it through Ptr. Type can be either i16 or + /// i32. + STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE, + + /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a + /// byte-swapping load instruction. It loads "Type" bits, byte swaps it, + /// then puts it in the bottom bits of the GPRC. TYPE can be either i16 + /// or i32. + LBRX, + + /// STFIWX - The STFIWX instruction. The first operand is an input token + /// chain, then an f64 value to store, then an address to store it to. + STFIWX, + + /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point + /// load which sign-extends from a 32-bit integer value into the + /// destination 64-bit register. + LFIWAX, + + /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point + /// load which zero-extends from a 32-bit integer value into the + /// destination 64-bit register. + LFIWZX, + + /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian. + /// Maps directly to an lxvd2x instruction that will be followed by + /// an xxswapd. + LXVD2X, + + /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. + /// Maps directly to an stxvd2x instruction that will be preceded by + /// an xxswapd. + STXVD2X, + + /// QBRC, CHAIN = QVLFSb CHAIN, Ptr + /// The 4xf32 load used for v4i1 constants. 
+ QVLFSb, + + /// GPRC = TOC_ENTRY GA, TOC + /// Loads the entry for GA from the TOC, where the TOC base is given by + /// the last operand. + TOC_ENTRY + }; + } + + /// Define some predicates that are used for node matching. + namespace PPC { + /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a + /// VPKUHUM instruction. + bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG); + + /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a + /// VPKUWUM instruction. + bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG); + + /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a + /// VPKUDUM instruction. + bool isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG); + + /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for + /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). + bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, + unsigned ShuffleKind, SelectionDAG &DAG); + + /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for + /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). + bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, + unsigned ShuffleKind, SelectionDAG &DAG); + + /// isVMRGEOShuffleMask - Return true if this is a shuffle mask suitable for + /// a VMRGEW or VMRGOW instruction + bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, + unsigned ShuffleKind, SelectionDAG &DAG); + + /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the + /// shift amount, otherwise return -1. + int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG); + + /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a splat of a single element that is suitable for input to + /// VSPLTB/VSPLTH/VSPLTW. + bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize); + + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the + /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. + unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG); + + /// get_VSPLTI_elt - If this is a build_vector of constants which can be + /// formed by using a vspltis[bhw] instruction of the specified element + /// size, return the constant being splatted. The ByteSize field indicates + /// the number of bytes of each element [124] -> [bhw]. + SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); + + /// If this is a qvaligni shuffle mask, return the shift + /// amount, otherwise return -1. + int isQVALIGNIShuffleMask(SDNode *N); + } + + class PPCTargetLowering : public TargetLowering { + const PPCSubtarget &Subtarget; + + public: + explicit PPCTargetLowering(const PPCTargetMachine &TM, + const PPCSubtarget &STI); + + /// getTargetNodeName() - This method returns the name of a target specific + /// DAG node. 
+ const char *getTargetNodeName(unsigned Opcode) const override; + + bool useSoftFloat() const override; + + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } + + bool isCheapToSpeculateCttz() const override { + return true; + } + + bool isCheapToSpeculateCtlz() const override { + return true; + } + + /// getSetCCResultType - Return the ISD::SETCC ValueType + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; + + /// Return true if target always beneficiates from combining into FMA for a + /// given value type. This must typically return false on targets where FMA + /// takes more cycles to execute than FADD. + bool enableAggressiveFMAFusion(EVT VT) const override; + + /// getPreIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if the node's address + /// can be legally represented as pre-indexed load / store address. + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + /// SelectAddressRegReg - Given the specified addressed, check to see if it + /// can be represented as an indexed [r+r] operation. Returns false if it + /// can be more efficiently represented with [r+imm]. + bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index, + SelectionDAG &DAG) const; + + /// SelectAddressRegImm - Returns true if the address N can be represented + /// by a base register plus a signed 16-bit displacement [r+imm], and if it + /// is not better represented as reg+reg. If Aligned is true, only accept + /// displacements suitable for STD and friends, i.e. multiples of 4. + bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, + SelectionDAG &DAG, bool Aligned) const; + + /// SelectAddressRegRegOnly - Given the specified addressed, force it to be + /// represented as an indexed [r+r] operation. + bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index, + SelectionDAG &DAG) const; + + Sched::Preference getSchedulingPreference(SDNode *N) const override; + + /// LowerOperation - Provide custom lowering hooks for some operations. + /// + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + + /// ReplaceNodeResults - Replace the results of node with an illegal result + /// type with new values built out of custom code. 
+ /// + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const override; + + SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + std::vector<SDNode *> *Created) const override; + + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + unsigned getPrefLoopAlignment(MachineLoop *ML) const override; + + Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; + MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned AtomicSize, + unsigned BinOpcode) const; + MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI, + MachineBasicBlock *MBB, + bool is8bit, unsigned Opcode) const; + + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + ConstraintType getConstraintType(StringRef Constraint) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const override; + + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + + /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate + /// function arguments in the caller parameter area. This is the actual + /// alignment, not its logarithm. + unsigned getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const override; + + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. + void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; + + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { + if (ConstraintCode == "es") + return InlineAsm::Constraint_es; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; + else if (ConstraintCode == "Q") + return InlineAsm::Constraint_Q; + else if (ConstraintCode == "Z") + return InlineAsm::Constraint_Z; + else if (ConstraintCode == "Zy") + return InlineAsm::Constraint_Zy; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. 
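+    /// For PPC this roughly means a base register plus either a second
+    /// register or a signed 16-bit displacement (r+r or r+imm), with no
+    /// scaled indices and no global symbol as the base.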
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is a
+    /// legal icmp immediate, that is, the target has icmp instructions which
+    /// can compare a register against the immediate without having to
+    /// materialize the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is a
+    /// legal add immediate, that is, the target has add instructions which
+    /// can add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g., on PPC it's free to truncate an i64 value
+    /// in register X1 to i32 by referencing its sub-register R1.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    bool isFPExtFree(EVT VT) const override;
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                            const CallInst &I,
+                            unsigned Intrinsic) const override;
+
+    /// getOptimalMemOpType - Returns the target-specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero, the destination alignment can satisfy
+    /// any constraint. Similarly, if SrcAlign is zero there is no need to
+    /// check it against an alignment requirement, probably because the source
+    /// does not need to be loaded. If 'IsMemset' is true, this is expanding a
+    /// memset; if 'ZeroMemset' is also true, it is a memset of zero.
+    /// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it
+    /// does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    EVT
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                        bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                        MachineFunction &MF) const override;
+
+    /// Is unaligned memory access allowed for the given type, and is it fast
+    /// relative to software emulation.
+    bool allowsMisalignedMemoryAccesses(EVT VT,
+                                        unsigned AddrSpace,
+                                        unsigned Align = 1,
+                                        bool *Fast = nullptr) const override;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+    // Should we expand the build vector with shuffles?
+    bool
+    shouldExpandBuildVectorWithShuffles(EVT VT,
+                                        unsigned DefinedValues) const override;
+
+    /// createFastISel - This method returns a target-specific FastISel object,
+    /// or null if the target does not support "fast" instruction selection.
+ FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) const override; + + /// \brief Returns true if an argument of type Ty needs to be passed in a + /// contiguous block of registers in calling convention CallConv. + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override { + // We support any array type as "consecutive" block in the parameter + // save area. The element type defines the alignment requirement and + // whether the argument should go in GPRs, FPRs, or VRs if available. + // + // Note that clang uses this capability both to implement the ELFv2 + // homogeneous float/vector aggregate ABI, and to avoid having to use + // "byval" when passing aggregates that might fully fit in registers. + return Ty->isArrayTy(); + } + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + + private: + struct ReuseLoadInfo { + SDValue Ptr; + SDValue Chain; + SDValue ResChain; + MachinePointerInfo MPI; + bool IsInvariant; + unsigned Alignment; + AAMDNodes AAInfo; + const MDNode *Ranges; + + ReuseLoadInfo() : IsInvariant(false), Alignment(0), Ranges(nullptr) {} + }; + + bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, + SelectionDAG &DAG, + ISD::LoadExtType ET = ISD::NON_EXTLOAD) const; + void spliceIntoChain(SDValue ResChain, SDValue NewResChain, + SelectionDAG &DAG) const; + + void LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, + SelectionDAG &DAG, SDLoc dl) const; + SDValue LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG, + SDLoc dl) const; + SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG, + SDLoc dl) const; + + SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; + SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; + + bool + IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + + SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, + int SPDiff, + SDValue Chain, + SDValue &LROpOut, + SDValue &FPOpOut, + bool isDarwinABI, + SDLoc dl) const; + + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget 
&Subtarget) const; + SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDLoc dl) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall, + bool isVarArg, bool IsPatchPoint, bool hasNest, + SelectionDAG &DAG, + SmallVector<std::pair<unsigned, SDValue>, 8> + &RegsToPass, + SDValue InFlag, SDValue Chain, SDValue CallSeqStart, + SDValue &Callee, + int SPDiff, unsigned NumBytes, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<SDValue> &InVals, + ImmutableCallSite *CS) const; + + SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue + LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + bool + CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + + SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc dl, SelectionDAG &DAG) const override; + + SDValue + extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, + SDValue ArgVal, SDLoc dl) const; + + SDValue + LowerFormalArguments_Darwin(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue + LowerFormalArguments_64SVR4(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue + LowerFormalArguments_32SVR4(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> 
&Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue + createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff, + SDValue CallSeqStart, ISD::ArgFlagsTy Flags, + SelectionDAG &DAG, SDLoc dl) const; + + SDValue + LowerCall_Darwin(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, + bool isVarArg, bool isTailCall, bool IsPatchPoint, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, + ImmutableCallSite *CS) const; + SDValue + LowerCall_64SVR4(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, + bool isVarArg, bool isTailCall, bool IsPatchPoint, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, + ImmutableCallSite *CS) const; + SDValue + LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, + bool isVarArg, bool isTailCall, bool IsPatchPoint, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, + ImmutableCallSite *CS) const; + + SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + + SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + unsigned combineRepeatedFPDivisors() const override; + + CCAssignFn *useFastISelCCs(unsigned Flag) const; + }; + + namespace PPC { + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo); + } + + bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); + + bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); + + bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); +} + +#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td new file mode 100644 index 0000000..075e093 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -0,0 +1,1235 @@ +//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PowerPC 64-bit instructions. These patterns are used +// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode. 
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+def s16imm64 : Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS16ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def u16imm64 : Operand<i64> {
+  let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCU16ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<16>";
+}
+def s17imm64 : Operand<i64> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler.  The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def tocentry : Operand<iPTR> {
+  let MIOperandInfo = (ops i64imm:$imm);
+}
+def tlsreg : Operand<i64> {
+  let EncoderMethod = "getTLSRegEncoding";
+  let ParserMatchClass = PPCTLSRegOperand;
+}
+def tlsgd : Operand<i64> {}
+def tlscall : Operand<i64> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+def SHL64 : SDNodeXForm<imm, [{
+  // Transformation function: 63 - imm
+  return getI32Imm(63 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def SRL64 : SDNodeXForm<imm, [{
+  // Transformation function: 64 - imm
+  return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue(), SDLoc(N))
+                           : getI32Imm(0, SDLoc(N));
+}]>;
+
+def HI32_48 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 32), SDLoc(N));
+}]>;
+
+def HI48_64 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 48), SDLoc(N));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
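+// Background (a sketch, not normative): on 64-bit ELF, a call that may
+// cross a TOC boundary is emitted as a "bl" plus a trailing nop (see
+// BL8_NOP below), and the linker may rewrite that nop into a TOC
+// restore, conventionally:
+//
+//   bl callee
+//   nop              // may become: ld 2, 40(1) (the ELFv1 TOC save
+//                    // slot; the exact offset is ABI-dependent)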
+// + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { + let isReturn = 1, Uses = [LR8, RM] in + def BLR8 : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB, + [(retflag)]>, Requires<[In64BitMode]>; + let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in { + def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>, + Requires<[In64BitMode]>; + def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond), + "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB, + []>, + Requires<[In64BitMode]>; + + def BCCTR8 : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi), + "bcctr 12, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi), + "bcctr 4, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + } +} + +let Defs = [LR8] in + def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>, + PPC970_Unit_BRU; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { + let Defs = [CTR8], Uses = [CTR8] in { + def BDZ8 : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz $dst">; + def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz $dst">; + } + + let isReturn = 1, Defs = [CTR8], Uses = [CTR8, LR8, RM] in { + def BDZLR8 : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins), + "bdzlr", IIC_BrB, []>; + def BDNZLR8 : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins), + "bdnzlr", IIC_BrB, []>; + } +} + + + +let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { + // Convenient aliases for call instructions + let Uses = [RM] in { + def BL8 : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. + + def BL8_TLS : IForm<18, 0, 1, (outs), (ins tlscall:$func), + "bl $func", IIC_BrB, []>; + + def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall (i64 imm:$func))]>; + } + let Uses = [RM], isCodeGenOnly = 1 in { + def BL8_NOP : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func), + "bl $func\n\tnop", IIC_BrB, []>; + + def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins tlscall:$func), + "bl $func\n\tnop", IIC_BrB, []>; + + def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24, + (outs), (ins abscalltarget:$func), + "bla $func\n\tnop", IIC_BrB, + [(PPCcall_nop (i64 imm:$func))]>; + } + let Uses = [CTR8, RM] in { + def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl)]>, + Requires<[In64BitMode]>; + + let isCodeGenOnly = 1 in { + def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond), + "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB, + []>, + Requires<[In64BitMode]>; + + def BCCTRL8 : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi), + "bcctrl 12, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi), + "bcctrl 4, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + } + } +} + +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in { + def BCTRL8_LDinto_toc : + XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs), + (ins memrix:$src), + "bctrl\n\tld 2, $src", IIC_BrB, + [(PPCbctrl_load_toc ixaddr:$src)]>, + Requires<[In64BitMode]>; +} + +} // Interpretation64Bit + +// FIXME: Duplicating this for the asm parser should be unnecessary, but the +// previous definition must be marked as CodeGen only to prevent decoding +// 
conflicts.
+let Interpretation64Bit = 1, isAsmParserOnly = 1 in
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in
+def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+                     "bl $func", IIC_BrB, []>;
+
+// Calls
+def : Pat<(PPCcall (i64 tglobaladdr:$dst)),
+          (BL8 tglobaladdr:$dst)>;
+def : Pat<(PPCcall_nop (i64 tglobaladdr:$dst)),
+          (BL8_NOP tglobaladdr:$dst)>;
+
+def : Pat<(PPCcall (i64 texternalsym:$dst)),
+          (BL8 texternalsym:$dst)>;
+def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
+          (BL8_NOP texternalsym:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  let Defs = [CR0] in {
+    def ATOMIC_LOAD_ADD_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
+      [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_SUB_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
+      [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_OR_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
+      [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_XOR_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
+      [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_AND_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_I64",
+      [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_NAND_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
+      [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
+      [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
+
+    def ATOMIC_SWAP_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
+      [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDARX : XForm_1<31,  84, (outs g8rc:$rD), (ins memrr:$ptr),
+                    "ldarx $rD, $ptr", IIC_LdStLDARX, []>;
+
+// Instruction to support lock versions of atomics
+// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
+def LDARXL : XForm_1<31,  84, (outs g8rc:$rD), (ins memrr:$ptr),
+                     "ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isDOT;
+}
+
+let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in
+def STDCX  : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
+                     "stdcx.
$rS, $dst", IIC_LdStSTDCX, []>, isDOT; + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNdi8 :Pseudo< (outs), + (ins calltarget:$dst, i32imm:$offset), + "#TC_RETURNd8 $dst $offset", + []>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), + "#TC_RETURNa8 $func $offset", + [(PPCtc_return (i64 imm:$func), imm:$offset)]>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset), + "#TC_RETURNr8 $dst $offset", + []>; + +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1, + isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in +def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>, + Requires<[In64BitMode]>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILB8 : IForm<18, 0, 0, (outs), (ins calltarget:$dst), + "b $dst", IIC_BrB, + []>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILBA8 : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst), + "ba $dst", IIC_BrB, + []>; +} // Interpretation64Bit + +def : Pat<(PPCtc_return (i64 tglobaladdr:$dst), imm:$imm), + (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm), + (TCRETURNdi8 texternalsym:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm), + (TCRETURNri8 CTRRC8:$dst, imm:$imm)>; + + +// 64-bit CR instructions +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +let hasSideEffects = 0 in { +def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST), + "mtocrf $FXM, $ST", IIC_BrMCRX>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS), + "mtcrf $FXM, $rS", IIC_BrMCRX>, + PPC970_MicroCode, PPC970_Unit_CRU; + +let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM), + "mfocrf $rT, $FXM", IIC_SprMFCRF>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins), + "mfcr $rT", IIC_SprMFCR>, + PPC970_MicroCode, PPC970_Unit_CRU; +} // hasSideEffects = 0 + +let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { + let Defs = [CTR8] in + def EH_SjLj_SetJmp64 : Pseudo<(outs gprc:$dst), (ins memr:$buf), + "#EH_SJLJ_SETJMP64", + [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in + def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf), + "#EH_SJLJ_LONGJMP64", + [(PPCeh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; +} + +def MFSPR8 : XFXForm_1<31, 339, (outs g8rc:$RT), (ins i32imm:$SPR), + "mfspr $RT, $SPR", IIC_SprMFSPR>; +def MTSPR8 : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, g8rc:$RT), + "mtspr $SPR, $RT", IIC_SprMTSPR>; + + +//===----------------------------------------------------------------------===// +// 64-bit SPR manipulation instrs. 
+ +let Uses = [CTR8] in { +def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs g8rc:$rT), (ins), + "mfctr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in { +def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let hasSideEffects = 1, Defs = [CTR8] in { +let Pattern = [(int_ppc_mtctr i64:$rS)] in +def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +let Pattern = [(set i64:$rT, readcyclecounter)] in +def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins), + "mfspr $rT, 268", IIC_SprMFTB>, + PPC970_DGroup_First, PPC970_Unit_FXU; +// Note that encoding mftb using mfspr is now the preferred form, +// and has been since at least ISA v2.03. The mftb instruction has +// now been phased out. Using mfspr, however, is known not to work on +// the POWER3. + +let Defs = [X1], Uses = [X1] in +def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", + [(set i64:$result, + (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", + [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; + +let Defs = [LR8] in { +def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), + "mtlr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Uses = [LR8] in { +def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins), + "mflr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +} // Interpretation64Bit + +//===----------------------------------------------------------------------===// +// Fixed point instructions. +// + +let PPC970_Unit = 1 in { // FXU Operations. +let Interpretation64Bit = 1 in { +let hasSideEffects = 0 in { +let isCodeGenOnly = 1 in { + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { +def LI8 : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm), + "li $rD, $imm", IIC_IntSimple, + [(set i64:$rD, imm64SExt16:$imm)]>; +def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm), + "lis $rD, $imm", IIC_IntSimple, + [(set i64:$rD, imm16ShiftedSExt:$imm)]>; +} + +// Logical ops. 
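+// Each defm below expands to both the plain form and the record form
+// ("." suffix) that also sets CR0; the mnemonic and the operand string
+// are passed separately so the multiclass can splice in the dot.  E.g.
+// AND8 produces both:
+//
+//   and  rA, rS, rB
+//   and. rA, rS, rB    // also compares the result against zero into CR0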
+let isCommutable = 1 in { +defm NAND8: XForm_6r<31, 476, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "nand", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>; +defm AND8 : XForm_6r<31, 28, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "and", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (and i64:$rS, i64:$rB))]>; +} // isCommutable +defm ANDC8: XForm_6r<31, 60, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "andc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>; +let isCommutable = 1 in { +defm OR8 : XForm_6r<31, 444, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "or", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (or i64:$rS, i64:$rB))]>; +defm NOR8 : XForm_6r<31, 124, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "nor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>; +} // isCommutable +defm ORC8 : XForm_6r<31, 412, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "orc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>; +let isCommutable = 1 in { +defm EQV8 : XForm_6r<31, 284, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "eqv", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>; +defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "xor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (xor i64:$rS, i64:$rB))]>; +} // let isCommutable = 1 + +// Logical ops with immediate. +let Defs = [CR0] in { +def ANDIo8 : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "andi. $dst, $src1, $src2", IIC_IntGeneral, + [(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>, + isDOT; +def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "andis. $dst, $src1, $src2", IIC_IntGeneral, + [(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>, + isDOT; +} +def ORI8 : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "ori $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>; +def ORIS8 : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "oris $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>; +def XORI8 : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "xori $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>; +def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "xoris $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>; + +let isCommutable = 1 in +defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "add", "$rT, $rA, $rB", IIC_IntSimple, + [(set i64:$rT, (add i64:$rA, i64:$rB))]>; +// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the +// initial-exec thread-local storage model. 
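+// A sketch of the initial-exec access this enables (the relocation
+// spellings are the conventional ELF ones and are illustrative only):
+//
+//   ld  4, x@got@tprel(2)    // offset of x from the thread pointer
+//   add 3, 4, x@tls          // ADD8TLS below; @tls flags the add for
+//                            // the linker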
+def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB), + "add $rT, $rA, $rB", IIC_IntSimple, + [(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>; + +let isCommutable = 1 in +defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "addc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (addc i64:$rA, i64:$rB))]>, + PPC970_DGroup_Cracked; + +let Defs = [CARRY] in +def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), + "addic $rD, $rA, $imm", IIC_IntGeneral, + [(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>; +def ADDI8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm), + "addi $rD, $rA, $imm", IIC_IntSimple, + [(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>; +def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm), + "addis $rD, $rA, $imm", IIC_IntSimple, + [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>; + +let Defs = [CARRY] in { +def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), + "subfic $rD, $rA, $imm", IIC_IntGeneral, + [(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>; +defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "subfc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (subc i64:$rB, i64:$rA))]>, + PPC970_DGroup_Cracked; +} +defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "subf", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (sub i64:$rB, i64:$rA))]>; +defm NEG8 : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "neg", "$rT, $rA", IIC_IntSimple, + [(set i64:$rT, (ineg i64:$rA))]>; +let Uses = [CARRY] in { +let isCommutable = 1 in +defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "adde", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (adde i64:$rA, i64:$rB))]>; +defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "addme", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (adde i64:$rA, -1))]>; +defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "addze", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (adde i64:$rA, 0))]>; +defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "subfe", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (sube i64:$rB, i64:$rA))]>; +defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "subfme", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (sube -1, i64:$rA))]>; +defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "subfze", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (sube 0, i64:$rA))]>; +} +} // isCodeGenOnly + +// FIXME: Duplicating this for the asm parser should be unnecessary, but the +// previous definition must be marked as CodeGen only to prevent decoding +// conflicts. 
+let isAsmParserOnly = 1 in +def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB), + "add $rT, $rA, $rB", IIC_IntSimple, []>; + +let isCommutable = 1 in { +defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "mulhd", "$rT, $rA, $rB", IIC_IntMulHW, + [(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>; +defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "mulhdu", "$rT, $rA, $rB", IIC_IntMulHWU, + [(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>; +} // isCommutable +} +} // Interpretation64Bit + +let isCompare = 1, hasSideEffects = 0 in { + def CMPD : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB), + "cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64; + def CMPLD : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB), + "cmpld $crD, $rA, $rB", IIC_IntCompare>, isPPC64; + def CMPDI : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm64:$imm), + "cmpdi $crD, $rA, $imm", IIC_IntCompare>, isPPC64; + def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "cmpldi $dst, $src1, $src2", + IIC_IntCompare>, isPPC64; +} + +let hasSideEffects = 0 in { +defm SLD : XForm_6r<31, 27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB), + "sld", "$rA, $rS, $rB", IIC_IntRotateD, + [(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64; +defm SRD : XForm_6r<31, 539, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB), + "srd", "$rA, $rS, $rB", IIC_IntRotateD, + [(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64; +defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB), + "srad", "$rA, $rS, $rB", IIC_IntRotateD, + [(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64; + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +defm CNTLZW8 : XForm_11r<31, 26, (outs g8rc:$rA), (ins g8rc:$rS), + "cntlzw", "$rA, $rS", IIC_IntGeneral, []>; + +defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS), + "extsb", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext_inreg i64:$rS, i8))]>; +defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS), + "extsh", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext_inreg i64:$rS, i16))]>; + +defm SLW8 : XForm_6r<31, 24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "slw", "$rA, $rS, $rB", IIC_IntGeneral, []>; +defm SRW8 : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "srw", "$rA, $rS, $rB", IIC_IntGeneral, []>; +} // Interpretation64Bit + +// For fast-isel: +let isCodeGenOnly = 1 in { +def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS), + "extsb $rA, $rS", IIC_IntSimple, []>, isPPC64; +def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS), + "extsh $rA, $rS", IIC_IntSimple, []>, isPPC64; +} // isCodeGenOnly for fast-isel + +defm EXTSW : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS), + "extsw", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS), + "extsw", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext i32:$rS))]>, isPPC64; + +defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), + "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, + [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; +defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS), + "cntlzd", "$rA, $rS", IIC_IntGeneral, + [(set i64:$rA, (ctlz i64:$rS))]>; +def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS), + "popcntd $rA, $rS", IIC_IntGeneral, 
+ [(set i64:$rA, (ctpop i64:$rS))]>; +def BPERMD : XForm_6<31, 252, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "bpermd $rA, $rS, $rB", IIC_IntGeneral, + [(set i64:$rA, (int_ppc_bpermd g8rc:$rS, g8rc:$rB))]>, + isPPC64, Requires<[HasBPERMD]>; + +let isCodeGenOnly = 1, isCommutable = 1 in +def CMPB8 : XForm_6<31, 508, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "cmpb $rA, $rS, $rB", IIC_IntGeneral, + [(set i64:$rA, (PPCcmpb i64:$rS, i64:$rB))]>; + +// popcntw also does a population count on the high 32 bits (storing the +// results in the high 32-bits of the output). We'll ignore that here (which is +// safe because we never separately use the high part of the 64-bit registers). +def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS), + "popcntw $rA, $rS", IIC_IntGeneral, + [(set i32:$rA, (ctpop i32:$rS))]>; + +defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divd", "$rT, $rA, $rB", IIC_IntDivD, + [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64; +defm DIVDU : XOForm_1rcr<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divdu", "$rT, $rA, $rB", IIC_IntDivD, + [(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64; +def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divde $rT, $rA, $rB", IIC_IntDivD, + [(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>, + isPPC64, Requires<[HasExtDiv]>; +let Defs = [CR0] in +def DIVDEo : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divde. $rT, $rA, $rB", IIC_IntDivD, + []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First, + isPPC64, Requires<[HasExtDiv]>; +def DIVDEU : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divdeu $rT, $rA, $rB", IIC_IntDivD, + [(set i64:$rT, (int_ppc_divdeu g8rc:$rA, g8rc:$rB))]>, + isPPC64, Requires<[HasExtDiv]>; +let Defs = [CR0] in +def DIVDEUo : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divdeu. $rT, $rA, $rB", IIC_IntDivD, + []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First, + isPPC64, Requires<[HasExtDiv]>; +let isCommutable = 1 in +defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "mulld", "$rT, $rA, $rB", IIC_IntMulHD, + [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), + "mulli $rD, $rA, $imm", IIC_IntMulLI, + [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>; +} + +let hasSideEffects = 0 in { +defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA), + (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64, RegConstraint<"$rSi = $rA">, + NoEncode<"$rSi">; + +// Rotate instructions. +defm RLDCL : MDSForm_1r<30, 8, + (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE), + "rldcl", "$rA, $rS, $rB, $MBE", IIC_IntRotateD, + []>, isPPC64; +defm RLDCR : MDSForm_1r<30, 9, + (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE), + "rldcr", "$rA, $rS, $rB, $MBE", IIC_IntRotateD, + []>, isPPC64; +defm RLDICL : MDForm_1r<30, 0, + (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; +// For fast-isel: +let isCodeGenOnly = 1 in +def RLDICL_32_64 : MDForm_1<30, 0, + (outs g8rc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; +// End fast-isel. 
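+// For reference, a hedged C++ model of the rotate-and-mask semantics the
+// doubleword forms rely on (helper name is ours, not an LLVM API; IBM
+// bit numbering, bit 0 = MSB):
+//
+//   static uint64_t RotL64(uint64_t V, unsigned SH) {
+//     return SH ? (V << SH) | (V >> (64 - SH)) : V;
+//   }
+//
+//   rldicl rA, rS, SH, MB == RotL64(rS, SH) with the MB high bits cleared,
+//     so "rldicl rA, rS, 0, 32" zero-extends i32->i64 and
+//     "rldicl rA, rS, 64-n, n" is a 64-bit srl by n (cf. SRL64 above).
+//   rldicr rA, rS, SH, ME == RotL64(rS, SH) keeping only bits 0..ME,
+//     so "rldicr rA, rS, n, 63-n" is a 64-bit shl by n (cf. SHL64 above).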
+defm RLDICR : MDForm_1r<30, 1, + (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; +defm RLDIC : MDForm_1r<30, 2, + (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA), + (ins g8rc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, + []>; + +defm RLWNM8 : MForm_2r<23, (outs g8rc:$rA), + (ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME), + "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral, + []>; + +// RLWIMI can be commuted if the rotate amount is zero. +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA), + (ins g8rc:$rSi, g8rc:$rS, u5imm:$SH, u5imm:$MB, + u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME", + IIC_IntRotate, []>, PPC970_DGroup_Cracked, + RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; + +let isSelect = 1 in +def ISEL8 : AForm_4<31, 15, + (outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond), + "isel $rT, $rA, $rB, $cond", IIC_IntISEL, + []>; +} // Interpretation64Bit +} // hasSideEffects = 0 +} // End FXU Operations. + + +//===----------------------------------------------------------------------===// +// Load/Store instructions. +// + + +// Sign extending loads. +let PPC970_Unit = 2 in { +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src), + "lha $rD, $src", IIC_LdStLHA, + [(set i64:$rD, (sextloadi16 iaddr:$src))]>, + PPC970_DGroup_Cracked; +def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src), + "lwa $rD, $src", IIC_LdStLWA, + [(set i64:$rD, + (aligned4sextloadi32 ixaddr:$src))]>, isPPC64, + PPC970_DGroup_Cracked; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src), + "lhax $rD, $src", IIC_LdStLHA, + [(set i64:$rD, (sextloadi16 xaddr:$src))]>, + PPC970_DGroup_Cracked; +def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src), + "lwax $rD, $src", IIC_LdStLHA, + [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64, + PPC970_DGroup_Cracked; +// For fast-isel: +let isCodeGenOnly = 1, mayLoad = 1 in { +def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src), + "lwa $rD, $src", IIC_LdStLWA, []>, isPPC64, + PPC970_DGroup_Cracked; +def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src), + "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64, + PPC970_DGroup_Cracked; +} // end fast-isel isCodeGenOnly + +// Update forms. +let mayLoad = 1, hasSideEffects = 0 in { +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memri:$addr), + "lhau $rD, $addr", IIC_LdStLHAU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; +// NO LWAU! + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", IIC_LdStLHAUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lwaux $rD, $addr", IIC_LdStLHAUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">, isPPC64; +} +} + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +// Zero extending loads. 
+let PPC970_Unit = 2 in { +def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src), + "lbz $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi8 iaddr:$src))]>; +def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src), + "lhz $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi16 iaddr:$src))]>; +def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src), + "lwz $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64; + +def LBZX8 : XForm_1<31, 87, (outs g8rc:$rD), (ins memrr:$src), + "lbzx $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi8 xaddr:$src))]>; +def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src), + "lhzx $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi16 xaddr:$src))]>; +def LWZX8 : XForm_1<31, 23, (outs g8rc:$rD), (ins memrr:$src), + "lwzx $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi32 xaddr:$src))]>; + + +// Update forms. +let mayLoad = 1, hasSideEffects = 0 in { +def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lbzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; +def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lhzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; +def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lwzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +} +} +} // Interpretation64Bit + + +// Full 8-byte loads. +let PPC970_Unit = 2 in { +def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src), + "ld $rD, $src", IIC_LdStLD, + [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64; +// The following four definitions are selected for small code model only. +// Otherwise, we need to create two instructions to form a 32-bit offset, +// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select(). 
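+// Illustrative asm for the two cases (relocation spellings are the usual
+// ELF ones and are an assumption here).  Small code model reaches the
+// entry with a single load:
+//
+//   ld 3, x@toc(2)
+//
+// Medium/large code models split the 32-bit offset over two instructions
+// (see ADDIStocHA and LDtocL below):
+//
+//   addis 3, 2, x@toc@ha
+//   ld    3, x@toc@l(3)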
+def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtoc",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocJTI",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocCPT",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
+def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocBA",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
+
+def LDX  : XForm_1<31,  21, (outs g8rc:$rD), (ins memrr:$src),
+                   "ldx $rD, $src", IIC_LdStLD,
+                   [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+def LDBRX : XForm_1<31, 532, (outs g8rc:$rD), (ins memrr:$src),
+                   "ldbrx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+
+let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
+def LHBRX8 : XForm_1<31, 790, (outs g8rc:$rD), (ins memrr:$src),
+                     "lhbrx $rD, $src", IIC_LdStLoad, []>;
+def LWBRX8 : XForm_1<31, 534, (outs g8rc:$rD), (ins memrr:$src),
+                     "lwbrx $rD, $src", IIC_LdStLoad, []>;
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDU  : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
+                    "ldu $rD, $addr", IIC_LdStLDU,
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+                    NoEncode<"$ea_result">;
+
+def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "ldux $rD, $addr", IIC_LdStLDUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">, isPPC64;
+}
+}
+
+// Support for medium and large code model.
+let hasSideEffects = 0 in {
+def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+                       "#ADDIStocHA", []>, isPPC64;
+let mayLoad = 1 in
+def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
+                   "#LDtocL", []>, isPPC64;
+def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+                     "#ADDItocL", []>, isPPC64;
+}
+
+// Support for thread-local storage.
+def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                            "#ADDISgotTprelHA",
+                            [(set i64:$rD,
+                               (PPCaddisGotTprelHA i64:$reg,
+                                                   tglobaltlsaddr:$disp))]>,
+                     isPPC64;
+def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
+                        "#LDgotTprelL",
+                        [(set i64:$rD,
+                           (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
+                 isPPC64;
+def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
+          (ADD8TLS $in, tglobaltlsaddr:$g)>;
+def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIStlsgdHA",
+                         [(set i64:$rD,
+                            (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                        "#ADDItlsgdL",
+                        [(set i64:$rD,
+                           (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers.  X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                        "#GETtlsADDR",
+                        [(set i64:$rD,
+                           (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                 isPPC64;
+// Combined op for ADDItlsgdL and GETtlsADDR, late expanded.  X3 and LR8
+// are true defines, while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+    in
+def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD),
+                            (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+                            "#ADDItlsgdLADDR",
+                            [(set i64:$rD,
+                               (PPCaddiTlsgdLAddr i64:$reg,
+                                                  tglobaltlsaddr:$disp,
+                                                  tglobaltlsaddr:$sym))]>,
+                     isPPC64;
+def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIStlsldHA",
+                         [(set i64:$rD,
+                            (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                        "#ADDItlsldL",
+                        [(set i64:$rD,
+                           (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers.  X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                          "#GETtlsldADDR",
+                          [(set i64:$rD,
+                             (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                   isPPC64;
+// Combined op for ADDItlsldL and GETtlsldADDR, late expanded.  X3 and LR8
+// are true defines, while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+    in
+def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD),
+                            (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+                            "#ADDItlsldLADDR",
+                            [(set i64:$rD,
+                               (PPCaddiTlsldLAddr i64:$reg,
+                                                  tglobaltlsaddr:$disp,
+                                                  tglobaltlsaddr:$sym))]>,
+                     isPPC64;
+def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                          "#ADDISdtprelHA",
+                          [(set i64:$rD,
+                             (PPCaddisDtprelHA i64:$reg,
+                                               tglobaltlsaddr:$disp))]>,
+                   isPPC64;
+def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIdtprelL",
+                         [(set i64:$rD,
+                            (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+
+let PPC970_Unit = 2 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Truncating stores.
+def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src),
+                   "stb $rS, $src", IIC_LdStStore,
+                   [(truncstorei8 i64:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
+                   "sth $rS, $src", IIC_LdStStore,
+                   [(truncstorei16 i64:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
+                   "stw $rS, $src", IIC_LdStStore,
+                   [(truncstorei32 i64:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
+                    "stbx $rS, $dst", IIC_LdStStore,
+                    [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+                    PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
+                    "sthx $rS, $dst", IIC_LdStStore,
+                    [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+                    PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
+                    "stwx $rS, $dst", IIC_LdStStore,
+                    [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+                    PPC970_DGroup_Cracked;
+} // Interpretation64Bit
+
+// Normal 8-byte stores.
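+// LD and STD are DS-form: the low two bits of the displacement field
+// encode a sub-opcode, so the byte offset must be a multiple of 4.  That
+// is why these patterns use aligned4load/aligned4store; e.g. "std 3, 8(1)"
+// is encodable, but an offset of 6 has to go through an indexed form such
+// as stdx (see the unaligned r+r patterns near the end of this file).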
+def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst), + "std $rS, $dst", IIC_LdStSTD, + [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64; +def STDX : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst), + "stdx $rS, $dst", IIC_LdStSTD, + [(store i64:$rS, xaddr:$dst)]>, isPPC64, + PPC970_DGroup_Cracked; +def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst), + "stdbrx $rS, $dst", IIC_LdStStore, + [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64, + PPC970_DGroup_Cracked; +} + +// Stores with Update (pre-inc). +let PPC970_Unit = 2, mayStore = 1 in { +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), + "stbu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), + "sthu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), + "stwu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; + +def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "stbux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "sthux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "stwux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +} // Interpretation64Bit + +def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst), + "stdu $rS, $dst", IIC_LdStSTDU, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">, + isPPC64; + +def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "stdux $rS, $dst", IIC_LdStSTDUX, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked, isPPC64; +} + +// Patterns to match the pre-inc stores. We can't put the patterns on +// the instruction definitions directly as ISel wants the address base +// and offset to be separate operands, not a single complex operand. +def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STBU8 $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STHU8 $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STWU8 $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STDU $rS, iaddroff:$ptroff, $ptrreg)>; + +def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STBUX8 $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STHUX8 $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STWUX8 $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STDUX $rS, $ptrreg, $ptroff)>; + + +//===----------------------------------------------------------------------===// +// Floating point instructions. +// + + +let PPC970_Unit = 3, hasSideEffects = 0, + Uses = [RM] in { // FPU Operations. 
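+// These work on i64 bit patterns held in FP registers: fcfid interprets
+// frB as a signed 64-bit integer and converts it to double; fctidz goes
+// the other way, rounding toward zero.  On cores without direct GPR<->FPR
+// moves the i64 is shuttled through memory first -- roughly (illustrative
+// only, <slot> being some scratch stack slot):
+//
+//   std   3, <slot>(1)    // spill the integer bits
+//   lfd   1, <slot>(1)    // reload them into an FPR
+//   fcfid 1, 1            // convert to double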
+defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB), + "fcfid", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64; +defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB), + "fctid", "$frD, $frB", IIC_FPGeneral, + []>, isPPC64; +defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB), + "fctidz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64; + +defm FCFIDU : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB), + "fcfidu", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64; +defm FCFIDS : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB), + "fcfids", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64; +defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB), + "fcfidus", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64; +defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiduz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64; +defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwuz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64; +} + + +//===----------------------------------------------------------------------===// +// Instruction Patterns +// + +// Extensions and truncates to/from 32-bit regs. +def : Pat<(i64 (zext i32:$in)), + (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32), + 0, 32)>; +def : Pat<(i64 (anyext i32:$in)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32)>; +def : Pat<(i32 (trunc i64:$in)), + (EXTRACT_SUBREG $in, sub_32)>; + +// Implement the 'not' operation with the NOR instruction. +// (we could use the default xori pattern, but nor has lower latency on some +// cores (such as the A2)). +def i64not : OutPatFrag<(ops node:$in), + (NOR8 $in, $in)>; +def : Pat<(not i64:$in), + (i64not $in)>; + +// Extending loads with i64 targets. +def : Pat<(zextloadi1 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(zextloadi1 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi1 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(extloadi1 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi8 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(extloadi8 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi16 iaddr:$src), + (LHZ8 iaddr:$src)>; +def : Pat<(extloadi16 xaddr:$src), + (LHZX8 xaddr:$src)>; +def : Pat<(extloadi32 iaddr:$src), + (LWZ8 iaddr:$src)>; +def : Pat<(extloadi32 xaddr:$src), + (LWZX8 xaddr:$src)>; + +// Standard shifts. These are represented separately from the real shifts above +// so that we can distinguish between shifts that allow 6-bit and 7-bit shift +// amounts. +def : Pat<(sra i64:$rS, i32:$rB), + (SRAD $rS, $rB)>; +def : Pat<(srl i64:$rS, i32:$rB), + (SRD $rS, $rB)>; +def : Pat<(shl i64:$rS, i32:$rB), + (SLD $rS, $rB)>; + +// SHL/SRL +def : Pat<(shl i64:$in, (i32 imm:$imm)), + (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>; +def : Pat<(srl i64:$in, (i32 imm:$imm)), + (RLDICL $in, (SRL64 imm:$imm), imm:$imm)>; + +// ROTL +def : Pat<(rotl i64:$in, i32:$sh), + (RLDCL $in, $sh, 0)>; +def : Pat<(rotl i64:$in, (i32 imm:$imm)), + (RLDICL $in, imm:$imm, 0)>; + +// Hi and Lo for Darwin Global Addresses. 
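+// These select immediate-load/add pairs that materialize an address from
+// 16-bit halves (spelled ha16()/lo16() by the Darwin assembler, @ha/@l
+// elsewhere).  The high half is "adjusted": it compensates for the sign
+// extension of the low half, so, illustratively,
+//
+//   lis  3, sym@ha
+//   addi 3, 3, sym@l     // r3 = (sym@ha << 16) + sext(sym@l) = sym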
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>; +def : Pat<(PPClo tglobaladdr:$in, 0), (LI8 tglobaladdr:$in)>; +def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>; +def : Pat<(PPClo tconstpool:$in , 0), (LI8 tconstpool:$in)>; +def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>; +def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>; +def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>; +def : Pat<(PPClo tblockaddress:$in, 0), (LI8 tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, i64:$in), + (ADDIS8 $in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, i64:$in), + (ADDI8 $in, tglobaltlsaddr:$g)>; +def : Pat<(add i64:$in, (PPChi tglobaladdr:$g, 0)), + (ADDIS8 $in, tglobaladdr:$g)>; +def : Pat<(add i64:$in, (PPChi tconstpool:$g, 0)), + (ADDIS8 $in, tconstpool:$g)>; +def : Pat<(add i64:$in, (PPChi tjumptable:$g, 0)), + (ADDIS8 $in, tjumptable:$g)>; +def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)), + (ADDIS8 $in, tblockaddress:$g)>; + +// Patterns to match r+r indexed loads and stores for +// addresses without at least 4-byte alignment. +def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)), + (LWAX xoaddr:$src)>; +def : Pat<(i64 (unaligned4load xoaddr:$src)), + (LDX xoaddr:$src)>; +def : Pat<(unaligned4store i64:$rS, xoaddr:$dst), + (STDX $rS, xoaddr:$dst)>; + +// 64-bits atomic loads and stores +def : Pat<(atomic_load_64 ixaddr:$src), (LD memrix:$src)>; +def : Pat<(atomic_load_64 xaddr:$src), (LDX memrr:$src)>; + +def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>; +def : Pat<(atomic_store_64 xaddr:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td new file mode 100644 index 0000000..cb0271f --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -0,0 +1,1215 @@ +//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Altivec extension to the PowerPC instruction set. +// +//===----------------------------------------------------------------------===// + +// *********************************** NOTE *********************************** +// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing ** +// ** which VMX and VSX instructions are lane-sensitive and which are not. ** +// ** A lane-sensitive instruction relies, implicitly or explicitly, on ** +// ** whether lanes are numbered from left to right. An instruction like ** +// ** VADDFP is not lane-sensitive, because each lane of the result vector ** +// ** relies only on the corresponding lane of the source vectors. However, ** +// ** an instruction like VMULESB is lane-sensitive, because "even" and ** +// ** "odd" lanes are different for big-endian and little-endian numbering. ** +// ** ** +// ** When adding new VMX and VSX instructions, please consider whether they ** +// ** are lane-sensitive. If so, they must be added to a switch statement ** +// ** in PPCVSXSwapRemoval::gatherVectorInstructions(). 
** +// **************************************************************************** + +//===----------------------------------------------------------------------===// +// Altivec transformation functions and pattern fragments. +// + +// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be +// of that type. +def vnot_ppc : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; + +def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG); +}]>; +def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG); +}]>; +def vpkudum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG); +}]>; +def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG); +}]>; +def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG); +}]>; +def vpkudum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG); +}]>; + +// These fragments are provided for little-endian, where the inputs must be +// swapped for correct semantics. +def vpkuhum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG); +}]>; +def vpkuwum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG); +}]>; +def vpkudum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG); +}]>; + +def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG); +}]>; +def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG); +}]>; +def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG); +}]>; +def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG); +}]>; +def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG); +}]>; +def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG); +}]>; + + +def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), 
node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG); +}]>; +def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG); +}]>; +def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG); +}]>; +def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG); +}]>; +def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG); +}]>; +def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG); +}]>; + + +// These fragments are provided for little-endian, where the inputs must be +// swapped for correct semantics. +def vmrglb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG); +}]>; +def vmrglh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG); +}]>; +def vmrglw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG); +}]>; +def vmrghb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG); +}]>; +def vmrghh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG); +}]>; +def vmrghw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG); +}]>; + + +def vmrgew_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 0, *CurDAG); +}]>; +def vmrgow_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 0, *CurDAG); +}]>; +def vmrgew_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 1, *CurDAG); +}]>; +def vmrgow_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 1, *CurDAG); +}]>; +def vmrgew_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 2, *CurDAG); +}]>; +def vmrgow_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 2, *CurDAG); +}]>; + + + +def 
VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, 0, *CurDAG), SDLoc(N)); +}]>; +def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVSLDOIShuffleMask(N, 0, *CurDAG) != -1; +}], VSLDOI_get_imm>; + + +/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into +/// vector_shuffle(X,undef,mask) by the dag combiner. +def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, 1, *CurDAG), SDLoc(N)); +}]>; +def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVSLDOIShuffleMask(N, 1, *CurDAG) != -1; +}], VSLDOI_unary_get_imm>; + + +/// VSLDOI_swapped* - These fragments are provided for little-endian, where +/// the inputs must be swapped for correct semantics. +def VSLDOI_swapped_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, 2, *CurDAG), SDLoc(N)); +}]>; +def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVSLDOIShuffleMask(N, 2, *CurDAG) != -1; +}], VSLDOI_get_imm>; + + +// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. +def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N)); +}]>; +def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1); +}], VSPLTB_get_imm>; +def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N)); +}]>; +def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2); +}], VSPLTH_get_imm>; +def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N)); +}]>; +def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4); +}], VSPLTW_get_imm>; + + +// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm. +def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 1, *CurDAG); +}]>; +def vecspltisb : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != 0; +}], VSPLTISB_get_imm>; + +// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm. +def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 2, *CurDAG); +}]>; +def vecspltish : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != 0; +}], VSPLTISH_get_imm>; + +// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm. +def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 4, *CurDAG); +}]>; +def vecspltisw : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != 0; +}], VSPLTISW_get_imm>; + +//===----------------------------------------------------------------------===// +// Helpers for defining instructions that directly correspond to intrinsics. + +// VA1a_Int_Ty - A VAForm_1a intrinsic definition of specific type. 
+class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty> + : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>; + +// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the +// inputs doesn't match the type of the output. +class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType InTy> + : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP, + [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>; + +// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two +// input types and an output type. +class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType In1Ty, ValueType In2Ty> + : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP, + [(set OutTy:$vD, + (IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>; + +// VX1_Int_Ty - A VXForm_1 intrinsic definition of specific type. +class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>; + +// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the +// inputs doesn't match the type of the output. +class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType InTy> + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, + [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>; + +// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two +// input types and an output type. +class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType In1Ty, ValueType In2Ty> + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, + [(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>; + +// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type. +class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID> + : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB), + !strconcat(opc, " $vD, $vB"), IIC_VecFP, + [(set v4f32:$vD, (IntID v4f32:$vB))]>; + +// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the +// inputs doesn't match the type of the output. +class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType InTy> + : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB), + !strconcat(opc, " $vD, $vB"), IIC_VecFP, + [(set OutTy:$vD, (IntID InTy:$vB))]>; + +class VXBX_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> + : VXForm_BX<xo, (outs vrrc:$vD), (ins vrrc:$vA), + !strconcat(opc, " $vD, $vA"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA))]>; + +class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> + : VXForm_CR<xo, (outs vrrc:$vD), (ins vrrc:$vA, u1imm:$ST, u4imm:$SIX), + !strconcat(opc, " $vD, $vA, $ST, $SIX"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA, imm:$ST, imm:$SIX))]>; + +//===----------------------------------------------------------------------===// +// Instruction Definitions. 
+ +def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">; +let Predicates = [HasAltivec] in { + +def DSS : DSS_Form<0, 822, (outs), (ins u5imm:$STRM), + "dss $STRM", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dss imm:$STRM)]>, + Deprecated<DeprecatedDST> { + let A = 0; + let B = 0; +} + +def DSSALL : DSS_Form<1, 822, (outs), (ins), + "dssall", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dssall)]>, + Deprecated<DeprecatedDST> { + let STRM = 0; + let A = 0; + let B = 0; +} + +def DST : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB), + "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM)]>, + Deprecated<DeprecatedDST>; + +def DSTT : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB), + "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM)]>, + Deprecated<DeprecatedDST>; + +def DSTST : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB), + "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM)]>, + Deprecated<DeprecatedDST>; + +def DSTSTT : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB), + "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM)]>, + Deprecated<DeprecatedDST>; + +let isCodeGenOnly = 1 in { + // The very same instructions as above, but formally matching 64bit registers. + def DST64 : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM)]>, + Deprecated<DeprecatedDST>; + + def DSTT64 : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM)]>, + Deprecated<DeprecatedDST>; + + def DSTST64 : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dstst i64:$rA, i32:$rB, + imm:$STRM)]>, + Deprecated<DeprecatedDST>; + + def DSTSTT64 : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, + [(int_ppc_altivec_dststt i64:$rA, i32:$rB, + imm:$STRM)]>, + Deprecated<DeprecatedDST>; +} + +def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins), + "mfvscr $vD", IIC_LdStStore, + [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; +def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), + "mtvscr $vB", IIC_LdStLoad, + [(int_ppc_altivec_mtvscr v4i32:$vB)]>; + +let PPC970_Unit = 2 in { // Loads. 
+def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src), + "lvebx $vD, $src", IIC_LdStLoad, + [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>; +def LVEHX: XForm_1<31, 39, (outs vrrc:$vD), (ins memrr:$src), + "lvehx $vD, $src", IIC_LdStLoad, + [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>; +def LVEWX: XForm_1<31, 71, (outs vrrc:$vD), (ins memrr:$src), + "lvewx $vD, $src", IIC_LdStLoad, + [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>; +def LVX : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src), + "lvx $vD, $src", IIC_LdStLoad, + [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>; +def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src), + "lvxl $vD, $src", IIC_LdStLoad, + [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>; +} + +def LVSL : XForm_1<31, 6, (outs vrrc:$vD), (ins memrr:$src), + "lvsl $vD, $src", IIC_LdStLoad, + [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>, + PPC970_Unit_LSU; +def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src), + "lvsr $vD, $src", IIC_LdStLoad, + [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>, + PPC970_Unit_LSU; + +let PPC970_Unit = 2 in { // Stores. +def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst), + "stvebx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>; +def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst), + "stvehx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>; +def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst), + "stvewx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>; +def STVX : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst), + "stvx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>; +def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst), + "stvxl $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>; +} + +let PPC970_Unit = 5 in { // VALU Operations. +// VA-Form instructions. 3-input AltiVec ops. +let isCommutable = 1 in { +def VMADDFP : VAForm_1<46, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB), + "vmaddfp $vD, $vA, $vC, $vB", IIC_VecFP, + [(set v4f32:$vD, + (fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>; + +// FIXME: The fma+fneg pattern won't match because fneg is not legal. +def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB), + "vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP, + [(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC, + (fneg v4f32:$vB))))]>; + +def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>; +def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs, + v8i16>; +def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>; +} // isCommutable + +def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm, + v4i32, v4i32, v16i8>; +def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>; + +// Shuffles. +def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH), + "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP, + [(set v16i8:$vD, + (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>; + +// VX-Form instructions. AltiVec arithmetic ops. 
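[Illustrative aside, not part of the original file: with Clang or GCC, -maltivec, and altivec.h, an ordinary vec_add on byte vectors lowers to a plain ISD::ADD on v16i8, which is exactly what the VADDUBM pattern below matches. A minimal sketch under those assumptions:

#include <altivec.h>

// Element-wise modulo-256 addition; selected as "vaddubm" through the
// (add v16i8:$vA, v16i8:$vB) pattern defined below.
vector unsigned char add_bytes(vector unsigned char a,
                               vector unsigned char b) {
  return vec_add(a, b);
}]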
+let isCommutable = 1 in { +def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vaddfp $vD, $vA, $vB", IIC_VecFP, + [(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>; + +def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vaddubm $vD, $vA, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>; +def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vadduhm $vD, $vA, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>; +def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vadduwm $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>; + +def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>; +def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>; +def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>; +def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>; +def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>; +def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>; +def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>; +} // isCommutable + +let isCommutable = 1 in +def VAND : VXForm_1<1028, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vand $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>; +def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vandc $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (and v4i32:$vA, + (vnot_ppc v4i32:$vB)))]>; + +def VCFSX : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vcfsx $vD, $vB, $UIMM", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>; +def VCFUX : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vcfux $vD, $vB, $UIMM", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>; +def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vctsxs $vD, $vB, $UIMM", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>; +def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vctuxs $vD, $vB, $UIMM", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>; + +// Defines with the UIM field set to 0 for floating-point +// to integer (fp_to_sint/fp_to_uint) conversions and integer +// to floating-point (sint_to_fp/uint_to_fp) conversions. 
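[Illustrative sketch, assuming Clang's vector extensions and __builtin_convertvector; not taken from this diff. An element-wise float-to-signed-int conversion becomes a v4f32 fp_to_sint node, which is later selected as the VCTSXS_0 form defined below:

typedef float v4sf __attribute__((vector_size(16)));
typedef int   v4si __attribute__((vector_size(16)));

// fp_to_sint on v4f32 -> "vctsxs $vD, $vB, 0" (the VCTSXS_0 form).
v4si to_int(v4sf x) { return __builtin_convertvector(x, v4si); }]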
+let isCodeGenOnly = 1, VA = 0 in { +def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB), + "vcfsx $vD, $vB, 0", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfsx v4i32:$vB, 0))]>; +def VCTUXS_0 : VXForm_1<906, (outs vrrc:$vD), (ins vrrc:$vB), + "vctuxs $vD, $vB, 0", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctuxs v4f32:$vB, 0))]>; +def VCFUX_0 : VXForm_1<778, (outs vrrc:$vD), (ins vrrc:$vB), + "vcfux $vD, $vB, 0", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfux v4i32:$vB, 0))]>; +def VCTSXS_0 : VXForm_1<970, (outs vrrc:$vD), (ins vrrc:$vB), + "vctsxs $vD, $vB, 0", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctsxs v4f32:$vB, 0))]>; +} +def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>; +def VLOGEFP : VX2_Int_SP<458, "vlogefp", int_ppc_altivec_vlogefp>; + +let isCommutable = 1 in { +def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>; +def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>; +def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>; +def VAVGUB : VX1_Int_Ty<1026, "vavgub", int_ppc_altivec_vavgub, v16i8>; +def VAVGUH : VX1_Int_Ty<1090, "vavguh", int_ppc_altivec_vavguh, v8i16>; +def VAVGUW : VX1_Int_Ty<1154, "vavguw", int_ppc_altivec_vavguw, v4i32>; + +def VMAXFP : VX1_Int_Ty<1034, "vmaxfp", int_ppc_altivec_vmaxfp, v4f32>; +def VMAXSB : VX1_Int_Ty< 258, "vmaxsb", int_ppc_altivec_vmaxsb, v16i8>; +def VMAXSH : VX1_Int_Ty< 322, "vmaxsh", int_ppc_altivec_vmaxsh, v8i16>; +def VMAXSW : VX1_Int_Ty< 386, "vmaxsw", int_ppc_altivec_vmaxsw, v4i32>; +def VMAXUB : VX1_Int_Ty< 2, "vmaxub", int_ppc_altivec_vmaxub, v16i8>; +def VMAXUH : VX1_Int_Ty< 66, "vmaxuh", int_ppc_altivec_vmaxuh, v8i16>; +def VMAXUW : VX1_Int_Ty< 130, "vmaxuw", int_ppc_altivec_vmaxuw, v4i32>; +def VMINFP : VX1_Int_Ty<1098, "vminfp", int_ppc_altivec_vminfp, v4f32>; +def VMINSB : VX1_Int_Ty< 770, "vminsb", int_ppc_altivec_vminsb, v16i8>; +def VMINSH : VX1_Int_Ty< 834, "vminsh", int_ppc_altivec_vminsh, v8i16>; +def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>; +def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>; +def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>; +def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>; +} // isCommutable + +def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrghb $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrghh $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrghw $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrglb $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrglh $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrglw $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>; + +def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm, + v4i32, v16i8, v4i32>; +def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm, + v4i32, v8i16, v4i32>; +def VMSUMSHS : 
VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs, + v4i32, v8i16, v4i32>; +def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm, + v4i32, v16i8, v4i32>; +def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm, + v4i32, v8i16, v4i32>; +def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs, + v4i32, v8i16, v4i32>; + +let isCommutable = 1 in { +def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb, + v8i16, v16i8>; +def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh, + v4i32, v8i16>; +def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub, + v8i16, v16i8>; +def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh, + v4i32, v8i16>; +def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb, + v8i16, v16i8>; +def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh, + v4i32, v8i16>; +def VMULOUB : VX1_Int_Ty2< 8, "vmuloub", int_ppc_altivec_vmuloub, + v8i16, v16i8>; +def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh, + v4i32, v8i16>; +} // isCommutable + +def VREFP : VX2_Int_SP<266, "vrefp", int_ppc_altivec_vrefp>; +def VRFIM : VX2_Int_SP<714, "vrfim", int_ppc_altivec_vrfim>; +def VRFIN : VX2_Int_SP<522, "vrfin", int_ppc_altivec_vrfin>; +def VRFIP : VX2_Int_SP<650, "vrfip", int_ppc_altivec_vrfip>; +def VRFIZ : VX2_Int_SP<586, "vrfiz", int_ppc_altivec_vrfiz>; +def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>; + +def VSUBCUW : VX1_Int_Ty<1408, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>; + +def VSUBFP : VXForm_1<74, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubfp $vD, $vA, $vB", IIC_VecGeneral, + [(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>; +def VSUBUBM : VXForm_1<1024, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsububm $vD, $vA, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>; +def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubuhm $vD, $vA, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>; +def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubuwm $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>; + +def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>; +def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>; +def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>; +def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>; +def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>; +def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>; + +def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>; +def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>; + +def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs, + v4i32, v16i8, v4i32>; +def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs, + v4i32, v8i16, v4i32>; +def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs, + v4i32, v16i8, v4i32>; + +def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vnor $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (vnot_ppc (or v4i32:$vA, + v4i32:$vB)))]>; +let isCommutable = 1 in { +def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vor $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>; +def VXOR : VXForm_1<1220, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vxor 
$vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>; +} // isCommutable + +def VRLB : VX1_Int_Ty< 4, "vrlb", int_ppc_altivec_vrlb, v16i8>; +def VRLH : VX1_Int_Ty< 68, "vrlh", int_ppc_altivec_vrlh, v8i16>; +def VRLW : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>; + +def VSL : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl, v4i32 >; +def VSLO : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>; + +def VSLB : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>; +def VSLH : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>; +def VSLW : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>; + +def VSPLTB : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vspltb $vD, $vB, $UIMM", IIC_VecPerm, + [(set v16i8:$vD, + (vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>; +def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vsplth $vD, $vB, $UIMM", IIC_VecPerm, + [(set v16i8:$vD, + (vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>; +def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vspltw $vD, $vB, $UIMM", IIC_VecPerm, + [(set v16i8:$vD, + (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>; + +def VSR : VX1_Int_Ty< 708, "vsr" , int_ppc_altivec_vsr, v4i32>; +def VSRO : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>; + +def VSRAB : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>; +def VSRAH : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>; +def VSRAW : VX1_Int_Ty< 900, "vsraw", int_ppc_altivec_vsraw, v4i32>; +def VSRB : VX1_Int_Ty< 516, "vsrb" , int_ppc_altivec_vsrb , v16i8>; +def VSRH : VX1_Int_Ty< 580, "vsrh" , int_ppc_altivec_vsrh , v8i16>; +def VSRW : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>; + + +def VSPLTISB : VXForm_3<780, (outs vrrc:$vD), (ins s5imm:$SIMM), + "vspltisb $vD, $SIMM", IIC_VecPerm, + [(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>; +def VSPLTISH : VXForm_3<844, (outs vrrc:$vD), (ins s5imm:$SIMM), + "vspltish $vD, $SIMM", IIC_VecPerm, + [(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>; +def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM), + "vspltisw $vD, $SIMM", IIC_VecPerm, + [(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>; + +// Vector Pack. +def VPKPX : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx, + v8i16, v4i32>; +def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, + v16i8, v8i16>; +def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, + v16i8, v8i16>; +def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, + v16i8, v4i32>; +def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, + v8i16, v4i32>; +def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vpkuhum $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, + (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>; +def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus, + v16i8, v8i16>; +def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vpkuwum $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, + (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>; +def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus, + v8i16, v4i32>; + +// Vector Unpack. 
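[A hedged usage sketch for the unpack forms, assuming altivec.h; illustrative only, not from this diff. vec_unpackh on signed bytes maps onto int_ppc_altivec_vupkhsb and thus onto the VUPKHSB definition below:

#include <altivec.h>

// Sign-extend the high half of a byte vector: v16i8 -> v8i16, emitted
// as "vupkhsb" through the intrinsic pattern defined below.
vector signed short unpack_high(vector signed char v) {
  return vec_unpackh(v);
}]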
+def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx, + v4i32, v8i16>; +def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb, + v8i16, v16i8>; +def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh, + v4i32, v8i16>; +def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx, + v4i32, v8i16>; +def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb, + v8i16, v16i8>; +def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh, + v4i32, v8i16>; + + +// Altivec Comparisons. + +class VCMP<bits<10> xo, string asmstr, ValueType Ty> + : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr, + IIC_VecFPCompare, + [(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>; +class VCMPo<bits<10> xo, string asmstr, ValueType Ty> + : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr, + IIC_VecFPCompare, + [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> { + let Defs = [CR6]; + let RC = 1; +} + +// f32 element comparisons. +def VCMPBFP : VCMP <966, "vcmpbfp $vD, $vA, $vB" , v4f32>; +def VCMPBFPo : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>; +def VCMPEQFP : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>; +def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>; +def VCMPGEFP : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>; +def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>; +def VCMPGTFP : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>; +def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>; + +// i8 element comparisons. +def VCMPEQUB : VCMP < 6, "vcmpequb $vD, $vA, $vB" , v16i8>; +def VCMPEQUBo : VCMPo< 6, "vcmpequb. $vD, $vA, $vB", v16i8>; +def VCMPGTSB : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>; +def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>; +def VCMPGTUB : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>; +def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>; + +// i16 element comparisons. +def VCMPEQUH : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>; +def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>; +def VCMPGTSH : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>; +def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>; +def VCMPGTUH : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>; +def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>; + +// i32 element comparisons. +def VCMPEQUW : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>; +def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>; +def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>; +def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>; +def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>; +def VCMPGTUWo : VCMPo<646, "vcmpgtuw. 
$vD, $vA, $vB", v4i32>; + +let isCodeGenOnly = 1 in { +def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins), + "vxor $vD, $vD, $vD", IIC_VecFP, + [(set v16i8:$vD, (v16i8 immAllZerosV))]>; +def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins), + "vxor $vD, $vD, $vD", IIC_VecFP, + [(set v8i16:$vD, (v8i16 immAllZerosV))]>; +def V_SET0 : VXForm_setzero<1220, (outs vrrc:$vD), (ins), + "vxor $vD, $vD, $vD", IIC_VecFP, + [(set v4i32:$vD, (v4i32 immAllZerosV))]>; + +let IMM=-1 in { +def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins), + "vspltisw $vD, -1", IIC_VecFP, + [(set v16i8:$vD, (v16i8 immAllOnesV))]>; +def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins), + "vspltisw $vD, -1", IIC_VecFP, + [(set v8i16:$vD, (v8i16 immAllOnesV))]>; +def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins), + "vspltisw $vD, -1", IIC_VecFP, + [(set v4i32:$vD, (v4i32 immAllOnesV))]>; +} +} +} // VALU Operations. + +//===----------------------------------------------------------------------===// +// Additional Altivec Patterns +// + +// Loads. +def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>; + +// Stores. +def : Pat<(store v4i32:$rS, xoaddr:$dst), + (STVX $rS, xoaddr:$dst)>; + +// Bit conversions. +def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v1i128 VRRC:$src))), (v16i8 VRRC:$src)>; + +def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v1i128 VRRC:$src))), (v8i16 VRRC:$src)>; + +def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v1i128 VRRC:$src))), (v4i32 VRRC:$src)>; + +def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v1i128 VRRC:$src))), (v4f32 VRRC:$src)>; + +def : Pat<(v2i64 (bitconvert (v16i8 VRRC:$src))), (v2i64 VRRC:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VRRC:$src))), (v2i64 VRRC:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 VRRC:$src))), (v2i64 VRRC:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VRRC:$src))), (v2i64 VRRC:$src)>; +def : Pat<(v2i64 (bitconvert (v1i128 VRRC:$src))), (v2i64 VRRC:$src)>; + +def : Pat<(v1i128 (bitconvert (v16i8 VRRC:$src))), (v1i128 VRRC:$src)>; +def : Pat<(v1i128 (bitconvert (v8i16 VRRC:$src))), (v1i128 VRRC:$src)>; +def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>; +def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>; +def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>; + +// Shuffles. 
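[One illustrative source-level example before the shuffle patterns; a sketch assuming Clang's __builtin_shufflevector and big-endian lane numbering, not part of this diff. The interleave below produces a mask isVMRGHShuffleMask accepts, so it is selected as the vmrghb instruction defined earlier:

typedef unsigned char v16ub __attribute__((vector_size(16)));

// Interleave the first eight lanes of a and b; matches vmrghb_shuffle
// and is emitted as "vmrghb" on a big-endian AltiVec target.
v16ub merge_high(v16ub a, v16ub b) {
  return __builtin_shufflevector(a, b, 0, 16, 1, 17, 2, 18, 3, 19,
                                 4, 20, 5, 21, 6, 22, 7, 23);
}]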
+ +// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x) +def:Pat<(vsldoi_unary_shuffle:$in v16i8:$vA, undef), + (VSLDOI $vA, $vA, (VSLDOI_unary_get_imm $in))>; +def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef), + (VPKUWUM $vA, $vA)>; +def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef), + (VPKUHUM $vA, $vA)>; + +// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands. +// These fragments are matched for little-endian, where the inputs must +// be swapped for correct semantics. +def:Pat<(vsldoi_swapped_shuffle:$in v16i8:$vA, v16i8:$vB), + (VSLDOI $vB, $vA, (VSLDOI_swapped_get_imm $in))>; +def:Pat<(vpkuwum_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VPKUWUM $vB, $vA)>; +def:Pat<(vpkuhum_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VPKUHUM $vB, $vA)>; + +// Match vmrg*(x,x) +def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef), + (VMRGLB $vA, $vA)>; +def:Pat<(vmrglh_unary_shuffle v16i8:$vA, undef), + (VMRGLH $vA, $vA)>; +def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef), + (VMRGLW $vA, $vA)>; +def:Pat<(vmrghb_unary_shuffle v16i8:$vA, undef), + (VMRGHB $vA, $vA)>; +def:Pat<(vmrghh_unary_shuffle v16i8:$vA, undef), + (VMRGHH $vA, $vA)>; +def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef), + (VMRGHW $vA, $vA)>; + +// Match vmrg*(y,x), i.e., swapped operands. These fragments +// are matched for little-endian, where the inputs must be +// swapped for correct semantics. +def:Pat<(vmrglb_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGLB $vB, $vA)>; +def:Pat<(vmrglh_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGLH $vB, $vA)>; +def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGLW $vB, $vA)>; +def:Pat<(vmrghb_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGHB $vB, $vA)>; +def:Pat<(vmrghh_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGHH $vB, $vA)>; +def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGHW $vB, $vA)>; + +// Logical Operations +def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>; + +def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)), + (VNOR $A, $B)>; +def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)), + (VANDC $A, $B)>; + +def : Pat<(fmul v4f32:$vA, v4f32:$vB), + (VMADDFP $vA, $vB, + (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>; + +// Fused multiply add and multiply sub for packed float. 
These are represented +// separately from the real instructions above, for operations that must have +// the additional precision, such as Newton-Raphson (used by divide, sqrt) +def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C), + (VMADDFP $A, $B, $C)>; +def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C), + (VNMSUBFP $A, $B, $C)>; + +def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C), + (VMADDFP $A, $B, $C)>; +def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C), + (VNMSUBFP $A, $B, $C)>; + +def : Pat<(PPCvperm v16i8:$vA, v16i8:$vB, v16i8:$vC), + (VPERM $vA, $vB, $vC)>; + +def : Pat<(PPCfre v4f32:$A), (VREFP $A)>; +def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>; + +// Vector shifts +def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSLB $vA, $vB))>; +def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSLH $vA, $vB))>; +def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSLW $vA, $vB))>; + +def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRB $vA, $vB))>; +def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRH $vA, $vB))>; +def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRW $vA, $vB))>; + +def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRAB $vA, $vB))>; +def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRAH $vA, $vB))>; +def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRAW $vA, $vB))>; + +// Float to integer and integer to float conversions +def : Pat<(v4i32 (fp_to_sint v4f32:$vA)), + (VCTSXS_0 $vA)>; +def : Pat<(v4i32 (fp_to_uint v4f32:$vA)), + (VCTUXS_0 $vA)>; +def : Pat<(v4f32 (sint_to_fp v4i32:$vA)), + (VCFSX_0 $vA)>; +def : Pat<(v4f32 (uint_to_fp v4i32:$vA)), + (VCFUX_0 $vA)>; + +// Floating-point rounding +def : Pat<(v4f32 (ffloor v4f32:$vA)), + (VRFIM $vA)>; +def : Pat<(v4f32 (fceil v4f32:$vA)), + (VRFIP $vA)>; +def : Pat<(v4f32 (ftrunc v4f32:$vA)), + (VRFIZ $vA)>; +def : Pat<(v4f32 (fnearbyint v4f32:$vA)), + (VRFIN $vA)>; + +} // end HasAltivec + +def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">; +def HasP8Crypto : Predicate<"PPCSubTarget->hasP8Crypto()">; +let Predicates = [HasP8Altivec] in { + +let isCommutable = 1 in { +def VMULESW : VX1_Int_Ty2<904, "vmulesw", int_ppc_altivec_vmulesw, + v2i64, v4i32>; +def VMULEUW : VX1_Int_Ty2<648, "vmuleuw", int_ppc_altivec_vmuleuw, + v2i64, v4i32>; +def VMULOSW : VX1_Int_Ty2<392, "vmulosw", int_ppc_altivec_vmulosw, + v2i64, v4i32>; +def VMULOUW : VX1_Int_Ty2<136, "vmulouw", int_ppc_altivec_vmulouw, + v2i64, v4i32>; +def VMULUWM : VXForm_1<137, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmuluwm $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mul v4i32:$vA, v4i32:$vB))]>; +def VMAXSD : VX1_Int_Ty<450, "vmaxsd", int_ppc_altivec_vmaxsd, v2i64>; +def VMAXUD : VX1_Int_Ty<194, "vmaxud", int_ppc_altivec_vmaxud, v2i64>; +def VMINSD : VX1_Int_Ty<962, "vminsd", int_ppc_altivec_vminsd, v2i64>; +def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>; +} // isCommutable + +// Vector merge +def VMRGEW : VXForm_1<1932, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrgew $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrgew_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGOW : VXForm_1<1676, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrgow $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrgow_shuffle v16i8:$vA, v16i8:$vB))]>; + +// Match vmrgew(x,x) and vmrgow(x,x) +def:Pat<(vmrgew_unary_shuffle v16i8:$vA, undef), + (VMRGEW $vA, $vA)>; +def:Pat<(vmrgow_unary_shuffle v16i8:$vA, undef), + (VMRGOW $vA, $vA)>; + +// Match 
vmrgew(y,x) and vmrgow(y,x), i.e., swapped operands. These fragments +// are matched for little-endian, where the inputs must be swapped for correct +// semantics. +def:Pat<(vmrgew_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGEW $vB, $vA)>; +def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGOW $vB, $vA)>; + + +// Vector shifts +def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>; +def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsld $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>; +def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsrd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>; +def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsrad $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>; + +// Vector Integer Arithmetic Instructions +let isCommutable = 1 in { +def VADDUDM : VXForm_1<192, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vaddudm $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (add v2i64:$vA, v2i64:$vB))]>; +def VADDUQM : VXForm_1<256, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vadduqm $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (add v1i128:$vA, v1i128:$vB))]>; +} // isCommutable + +// Vector Quadword Add +def VADDEUQM : VA1a_Int_Ty<60, "vaddeuqm", int_ppc_altivec_vaddeuqm, v1i128>; +def VADDCUQ : VX1_Int_Ty<320, "vaddcuq", int_ppc_altivec_vaddcuq, v1i128>; +def VADDECUQ : VA1a_Int_Ty<61, "vaddecuq", int_ppc_altivec_vaddecuq, v1i128>; + +// Vector Doubleword Subtract +def VSUBUDM : VXForm_1<1216, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubudm $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (sub v2i64:$vA, v2i64:$vB))]>; + +// Vector Quadword Subtract +def VSUBUQM : VXForm_1<1280, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubuqm $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (sub v1i128:$vA, v1i128:$vB))]>; +def VSUBEUQM : VA1a_Int_Ty<62, "vsubeuqm", int_ppc_altivec_vsubeuqm, v1i128>; +def VSUBCUQ : VX1_Int_Ty<1344, "vsubcuq", int_ppc_altivec_vsubcuq, v1i128>; +def VSUBECUQ : VA1a_Int_Ty<63, "vsubecuq", int_ppc_altivec_vsubecuq, v1i128>; + +// Count Leading Zeros +def VCLZB : VXForm_2<1794, (outs vrrc:$vD), (ins vrrc:$vB), + "vclzb $vD, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (ctlz v16i8:$vB))]>; +def VCLZH : VXForm_2<1858, (outs vrrc:$vD), (ins vrrc:$vB), + "vclzh $vD, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (ctlz v8i16:$vB))]>; +def VCLZW : VXForm_2<1922, (outs vrrc:$vD), (ins vrrc:$vB), + "vclzw $vD, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (ctlz v4i32:$vB))]>; +def VCLZD : VXForm_2<1986, (outs vrrc:$vD), (ins vrrc:$vB), + "vclzd $vD, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (ctlz v2i64:$vB))]>; + +// Population Count +def VPOPCNTB : VXForm_2<1795, (outs vrrc:$vD), (ins vrrc:$vB), + "vpopcntb $vD, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (ctpop v16i8:$vB))]>; +def VPOPCNTH : VXForm_2<1859, (outs vrrc:$vD), (ins vrrc:$vB), + "vpopcnth $vD, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (ctpop v8i16:$vB))]>; +def VPOPCNTW : VXForm_2<1923, (outs vrrc:$vD), (ins vrrc:$vB), + "vpopcntw $vD, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (ctpop v4i32:$vB))]>; +def VPOPCNTD : VXForm_2<1987, (outs vrrc:$vD), (ins vrrc:$vB), + "vpopcntd $vD, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (ctpop v2i64:$vB))]>; + +let isCommutable = 1 in { +// FIXME: Use AddedComplexity > 400 to ensure these patterns match before the +// VSX equivalents. We need to fix this up at some point. 
Two possible +// solutions for this problem: +// 1. Disable Altivec patterns that compete with VSX patterns using the +// !HasVSX predicate. This essentially favours VSX over Altivec, in +// hopes of reducing register pressure (larger register set using VSX +// instructions than VMX instructions) +// 2. Employ a more disciplined use of AddedComplexity, which would provide +// more fine-grained control than option 1. This would be beneficial +// if we find situations where Altivec is really preferred over VSX. +def VEQV : VXForm_1<1668, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "veqv $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (vnot_ppc (xor v4i32:$vA, v4i32:$vB)))]>; +def VNAND : VXForm_1<1412, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vnand $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (vnot_ppc (and v4i32:$vA, v4i32:$vB)))]>; +} // isCommutable + +def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vorc $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (or v4i32:$vA, + (vnot_ppc v4i32:$vB)))]>; + +// i64 element comparisons. +def VCMPEQUD : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>; +def VCMPEQUDo : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>; +def VCMPGTSD : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>; +def VCMPGTSDo : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>; +def VCMPGTUD : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>; +def VCMPGTUDo : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>; + +// The cryptography instructions that do not require Category:Vector.Crypto +def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb", + int_ppc_altivec_crypto_vpmsumb, v16i8>; +def VPMSUMH : VX1_Int_Ty<1096, "vpmsumh", + int_ppc_altivec_crypto_vpmsumh, v8i16>; +def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw", + int_ppc_altivec_crypto_vpmsumw, v4i32>; +def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd", + int_ppc_altivec_crypto_vpmsumd, v2i64>; +def VPERMXOR : VA1a_Int_Ty<45, "vpermxor", + int_ppc_altivec_crypto_vpermxor, v16i8>; + +// Vector doubleword integer pack and unpack. +def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss, + v4i32, v2i64>; +def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus, + v4i32, v2i64>; +def VPKUDUM : VXForm_1<1102, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vpkudum $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, + (vpkudum_shuffle v16i8:$vA, v16i8:$vB))]>; +def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus, + v4i32, v2i64>; +def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw, + v2i64, v4i32>; +def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw, + v2i64, v4i32>; + +// Shuffle patterns for unary and swapped (LE) vector pack modulo. 
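[For orientation, a sketch of the two-operand case; assumes Clang vector extensions, big-endian lane numbering, and a POWER8 target, illustrative only. Keeping the low-order word of each doubleword of the concatenated inputs is a mask isVPKUDUMShuffleMask accepts; the Pats below then cover the unary (a == b) and little-endian swapped variants:

typedef unsigned int v4ui __attribute__((vector_size(16)));

// Low word of each doubleword of concat(a, b); matches vpkudum_shuffle
// and is emitted as "vpkudum" on POWER8.
v4ui pack_low_words(v4ui a, v4ui b) {
  return __builtin_shufflevector(a, b, 1, 3, 5, 7);
}]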
+def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef), + (VPKUDUM $vA, $vA)>; +def:Pat<(vpkudum_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VPKUDUM $vB, $vA)>; + +def VGBBD : VX2_Int_Ty2<1292, "vgbbd", int_ppc_altivec_vgbbd, v16i8, v16i8>; +def VBPERMQ : VX1_Int_Ty2<1356, "vbpermq", int_ppc_altivec_vbpermq, + v2i64, v16i8>; +} // end HasP8Altivec + +// Crypto instructions (from builtins) +let Predicates = [HasP8Crypto] in { +def VSHASIGMAW : VXCR_Int_Ty<1666, "vshasigmaw", + int_ppc_altivec_crypto_vshasigmaw, v4i32>; +def VSHASIGMAD : VXCR_Int_Ty<1730, "vshasigmad", + int_ppc_altivec_crypto_vshasigmad, v2i64>; +def VCIPHER : VX1_Int_Ty<1288, "vcipher", int_ppc_altivec_crypto_vcipher, + v2i64>; +def VCIPHERLAST : VX1_Int_Ty<1289, "vcipherlast", + int_ppc_altivec_crypto_vcipherlast, v2i64>; +def VNCIPHER : VX1_Int_Ty<1352, "vncipher", + int_ppc_altivec_crypto_vncipher, v2i64>; +def VNCIPHERLAST : VX1_Int_Ty<1353, "vncipherlast", + int_ppc_altivec_crypto_vncipherlast, v2i64>; +def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>; +} // HasP8Crypto diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h new file mode 100644 index 0000000..cf71b1c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h @@ -0,0 +1,43 @@ +//===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to simplify generating frame and constant pool +// references. +// +// For reference, the order of operands for memory references is: +// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate +// Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H +#define LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H + +#include "llvm/CodeGen/MachineInstrBuilder.h" + +namespace llvm { + +/// addFrameReference - This function is used to add a reference to the base of +/// an abstract object on the stack frame of the current function. This +/// reference has base register as the FrameIndex offset until it is resolved. +/// This allows a constant offset to be specified as well... +/// +static inline const MachineInstrBuilder& +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0, + bool mem = true) { + if (mem) + return MIB.addImm(Offset).addFrameIndex(FI); + else + return MIB.addFrameIndex(FI).addImm(Offset); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td new file mode 100644 index 0000000..4e03ed2 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -0,0 +1,1675 @@ +//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// PowerPC instruction formats + +class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin> + : Instruction { + field bits<32> Inst; + field bits<32> SoftFail = 0; + let Size = 4; + + bit PPC64 = 0; // Default value, override with isPPC64 + + let Namespace = "PPC"; + let Inst{0-5} = opcode; + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Itinerary = itin; + + bits<1> PPC970_First = 0; + bits<1> PPC970_Single = 0; + bits<1> PPC970_Cracked = 0; + bits<3> PPC970_Unit = 0; + + /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to + /// these must be reflected there! See comments there for what these are. + let TSFlags{0} = PPC970_First; + let TSFlags{1} = PPC970_Single; + let TSFlags{2} = PPC970_Cracked; + let TSFlags{5-3} = PPC970_Unit; + + // Fields used for relation models. + string BaseName = ""; + + // For cases where multiple instruction definitions really represent the + // same underlying instruction but with one definition for 64-bit arguments + // and one for 32-bit arguments, this bit breaks the degeneracy between + // the two forms and allows TableGen to generate mapping tables. + bit Interpretation64Bit = 0; +} + +class PPC970_DGroup_First { bits<1> PPC970_First = 1; } +class PPC970_DGroup_Single { bits<1> PPC970_Single = 1; } +class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; } +class PPC970_MicroCode; + +class PPC970_Unit_Pseudo { bits<3> PPC970_Unit = 0; } +class PPC970_Unit_FXU { bits<3> PPC970_Unit = 1; } +class PPC970_Unit_LSU { bits<3> PPC970_Unit = 2; } +class PPC970_Unit_FPU { bits<3> PPC970_Unit = 3; } +class PPC970_Unit_CRU { bits<3> PPC970_Unit = 4; } +class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; } +class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; } +class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; } + +// Two joined instructions; used to emit two adjacent instructions as one. +// The itinerary from the first instruction is used for scheduling and +// classification. +class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : Instruction { + field bits<64> Inst; + field bits<64> SoftFail = 0; + let Size = 8; + + bit PPC64 = 0; // Default value, override with isPPC64 + + let Namespace = "PPC"; + let Inst{0-5} = opcode1; + let Inst{32-37} = opcode2; + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Itinerary = itin; + + bits<1> PPC970_First = 0; + bits<1> PPC970_Single = 0; + bits<1> PPC970_Cracked = 0; + bits<3> PPC970_Unit = 0; + + /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to + /// these must be reflected there! See comments there for what these are. + let TSFlags{0} = PPC970_First; + let TSFlags{1} = PPC970_Single; + let TSFlags{2} = PPC970_Cracked; + let TSFlags{5-3} = PPC970_Unit; + + // Fields used for relation models. 
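+  // (BaseName ties related defs together so TableGen can emit mapping
+  // tables, e.g. from a 32-bit instruction to its 64-bit counterpart.)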
+ string BaseName = ""; + bit Interpretation64Bit = 0; +} + +// 1.7.1 I-Form +class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + let Pattern = pattern; + bits<24> LI; + + let Inst{6-29} = LI; + let Inst{30} = aa; + let Inst{31} = lk; +} + +// 1.7.2 B-Form +class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<7> BIBO; // 2 bits of BI and 5 bits of BO. + bits<3> CR; + bits<14> BD; + + bits<5> BI; + let BI{0-1} = BIBO{5-6}; + let BI{2-4} = CR{0-2}; + + let Inst{6-10} = BIBO{4-0}; + let Inst{11-15} = BI; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL, + string asmstr> + : BForm<opcode, aa, lk, OOL, IOL, asmstr> { + let BIBO{4-0} = bo; + let BIBO{6-5} = 0; + let CR = 0; +} + +class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk, + dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<14> BD; + + let Inst{6-10} = bo; + let Inst{11-15} = bi; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +class BForm_3<bits<6> opcode, bit aa, bit lk, + dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<5> BO; + bits<5> BI; + bits<14> BD; + + let Inst{6-10} = BO; + let Inst{11-15} = BI; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +class BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk, + dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<5> BI; + bits<14> BD; + + let Inst{6-10} = bo; + let Inst{11-15} = BI; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +// 1.7.3 SC-Form +class SCForm<bits<6> opcode, bits<1> xo, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<7> LEV; + + let Pattern = pattern; + + let Inst{20-26} = LEV; + let Inst{30} = xo; +} + +// 1.7.4 D-Form +class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> B; + bits<16> C; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + +class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<21> Addr; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = Addr{20-16}; // Base Reg + let Inst{16-31} = Addr{15-0}; // Displacement +} + +class DForm_1a<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<16> C; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + + +class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern> { + + // Even though ADDICo does not really have an RC bit, provide + // the declaration of one here so that isDOT has something to set. 
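+  // (isDOT is the mix-in class that sets RC = 1 for the record forms,
+  // e.g. addic., which also set CR0.)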
+ bit RC = 0; +} + +class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<16> B; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = 0; + let Inst{16-31} = B; +} + +class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> B; + bits<5> A; + bits<16> C; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + +class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> { + let A = 0; + let Addr = 0; +} + +class DForm_4_fixedreg_zero<bits<6> opcode, bits<5> R, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : DForm_4<opcode, OOL, IOL, asmstr, itin, pattern> { + let A = R; + let B = R; + let C = 0; +} + +class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<21> Addr; + + let Pattern = pattern; + bits<24> LI; + + let Inst{6-29} = LI; + let Inst{30} = aa; + let Inst{31} = lk; + + let Inst{38-42} = A; + let Inst{43-47} = Addr{20-16}; // Base Reg + let Inst{48-63} = Addr{15-0}; // Displacement +} + +// This is used to emit BL8+NOP. +class IForm_and_DForm_4_zero<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : IForm_and_DForm_1<opcode1, aa, lk, opcode2, + OOL, IOL, asmstr, itin, pattern> { + let A = 0; + let Addr = 0; +} + +class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<1> L; + bits<5> RA; + bits<16> I; + + let Inst{6-8} = BF; + let Inst{9} = 0; + let Inst{10} = L; + let Inst{11-15} = RA; + let Inst{16-31} = I; +} + +class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : DForm_5<opcode, OOL, IOL, asmstr, itin> { + let L = PPC64; +} + +class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : DForm_5<opcode, OOL, IOL, asmstr, itin>; + +class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : DForm_6<opcode, OOL, IOL, asmstr, itin> { + let L = PPC64; +} + + +// 1.7.5 DS-Form +class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RST; + bits<19> DS_RA; + + let Pattern = pattern; + + let Inst{6-10} = RST; + let Inst{11-15} = DS_RA{18-14}; // Register # + let Inst{16-29} = DS_RA{13-0}; // Displacement. 
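+  // DS-form displacements must be a multiple of 4; the two instruction bits
+  // freed by the implied zero low bits hold the extended opcode below.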
+ let Inst{30-31} = xo; +} + + +// 1.7.6 X-Form +class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RST; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_tlb<bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> : XForm_base_r3xo<31, xo, OOL, IOL, asmstr, itin, []> { + let RST = 0; +} + +class XForm_attn<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + let Inst{21-30} = xo; +} + +// This is the same as XForm_base_r3xo, but the first two operands are swapped +// when code is emitted. +class XForm_base_r3xo_swapped + <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> RST; + bits<5> B; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + + +class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>; + +class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let RST = 0; +} + +class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let A = 0; + let B = 0; +} + +class XForm_tlbws<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RST; + bits<5> A; + bits<1> WS; + + let Pattern = pattern; + + let Inst{6-10} = RST; + let Inst{11-15} = A; + let Inst{20} = WS; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> { + let Pattern = pattern; +} + +class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>; + +class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> { + let Pattern = pattern; +} + +class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> { + let B = 0; + let Pattern = pattern; +} + +class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<1> L; + bits<5> RA; + bits<5> RB; + + let Inst{6-8} = BF; + let Inst{9} = 0; + let Inst{10} = L; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_icbt<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<4> 
CT; + bits<5> RA; + bits<5> RB; + + let Inst{6} = 0; + let Inst{7-10} = CT; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_sr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RS; + bits<4> SR; + + let Inst{6-10} = RS; + let Inst{12-15} = SR; + let Inst{21-30} = xo; +} + +class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> MO; + + let Inst{6-10} = MO; + let Inst{21-30} = xo; +} + +class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RS; + bits<5> RB; + + let Inst{6-10} = RS; + let Inst{16-20} = RB; + let Inst{21-30} = xo; +} + +class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RS; + bits<1> L; + + let Inst{6-10} = RS; + let Inst{15} = L; + let Inst{21-30} = xo; +} + +class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> { + let L = PPC64; +} + +class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<5> FRA; + bits<5> FRB; + + let Inst{6-8} = BF; + let Inst{9-10} = 0; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +// Used for QPX +class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> FRT; + bits<5> FRA; + bits<5> FRB; + + let Pattern = pattern; + + let Inst{6-10} = FRT; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_19<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_18<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let FRA = 0; +} + +class XForm_20<bits<6> opcode, bits<6> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> FRT; + bits<5> FRA; + bits<5> FRB; + bits<4> tttt; + + let Pattern = pattern; + + let Inst{6-10} = FRT; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-24} = tttt; + let Inst{25-30} = xo; + let Inst{31} = 0; +} + +class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + let Pattern = pattern; + let Inst{6-10} = 31; + let Inst{11-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<2> L; + + let Pattern = pattern; + let Inst{6-8} = 0; + let Inst{9-10} = L; + let Inst{11-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let L = 0; +} + +class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass 
itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { +} + +class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let A = 0; +} + +class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { +} + +// This is used for MFFS, MTFSB0, MTFSB1. 42 is arbitrary; this series of +// numbers presumably relates to some document, but I haven't found it. +class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} +class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let Pattern = pattern; + bits<5> FM; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = FM; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let RST = 0; + let A = 0; + let B = 0; +} + +class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let RST = 0; + let A = 0; +} + +class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bit R; + + bit RC = 1; + + let Inst{6-9} = 0; + let Inst{10} = R; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bit A; + + bit RC = 1; + + let Inst{6} = A; + let Inst{7-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bit L; + + bit RC = 0; // set by isDOT + + let Inst{7-9} = 0; + let Inst{10} = L; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + + bit RC = 0; + + let Inst{6-8} = BF; + let Inst{9-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +// XX*-Form (VSX) +class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = XT{5}; +} + +class XX1_RS6_RD5_XO<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : XX1Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let B = 0; 
+} + +class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = 0; + let Inst{16-20} = XB{4-0}; + let Inst{21-29} = xo; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> CR; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-8} = CR; + let Inst{9-15} = 0; + let Inst{16-20} = XB{4-0}; + let Inst{21-29} = xo; + let Inst{30} = XB{5}; + let Inst{31} = 0; +} + +class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XB; + bits<2> D; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-13} = 0; + let Inst{14-15} = D; + let Inst{16-20} = XB{4-0}; + let Inst{21-29} = xo; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> CR; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-8} = CR; + let Inst{9-10} = 0; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = 0; +} + +class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<2> D; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21} = 0; + let Inst{22-23} = D; + let Inst{24-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21} = RC; + let Inst{22-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<6> XC; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-25} = XC{4-0}; + let Inst{26-27} = xo; + let Inst{28} = XC{5}; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +// DCB_Form - Form X instruction, used for dcb* instructions. 
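+// For example, dcbf shares opcode 31 with the other cache ops and is
+// distinguished by its extended opcode; immfield fills the otherwise-unused
+// RT slot (which the dcbt-style hint forms repurpose, see DCB_Form_hint
+// below).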
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<31, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = immfield; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class DCB_Form_hint<bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<31, OOL, IOL, asmstr, itin> { + bits<5> TH; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = TH; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +// DSS_Form - Form X instruction, used for altivec dss* instructions. +class DSS_Form<bits<1> T, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<31, OOL, IOL, asmstr, itin> { + bits<2> STRM; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6} = T; + let Inst{7-8} = 0; + let Inst{9-10} = STRM; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +// 1.7.7 XL-Form +class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> CRD; + bits<5> CRA; + bits<5> CRB; + + let Pattern = pattern; + + let Inst{6-10} = CRD; + let Inst{11-15} = CRA; + let Inst{16-20} = CRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> CRD; + + let Pattern = pattern; + + let Inst{6-10} = CRD; + let Inst{11-15} = CRD; + let Inst{16-20} = CRD; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> BO; + bits<5> BI; + bits<2> BH; + + let Pattern = pattern; + + let Inst{6-10} = BO; + let Inst{11-15} = BI; + let Inst{16-18} = 0; + let Inst{19-20} = BH; + let Inst{21-30} = xo; + let Inst{31} = lk; +} + +class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { + bits<7> BIBO; // 2 bits of BI and 5 bits of BO. 
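+  // (The predicate operand packs BO together with the bit index within the
+  // CR field; the CR operand selects the field itself. The lets below
+  // reassemble the architectural BO and BI fields from those pieces.)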
+ bits<3> CR; + + let BO = BIBO{4-0}; + let BI{0-1} = BIBO{5-6}; + let BI{2-4} = CR{0-2}; + let BH = 0; +} + +class XLForm_2_br2<bits<6> opcode, bits<10> xo, bits<5> bo, bit lk, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { + let BO = bo; + let BH = 0; +} + +class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo, bits<5> bi, bit lk, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { + let BO = bo; + let BI = bi; + let BH = 0; +} + +class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<3> BFA; + + let Inst{6-8} = BF; + let Inst{9-10} = 0; + let Inst{11-13} = BFA; + let Inst{14-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_4<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bit W; + bits<4> U; + + bit RC = 0; + + let Inst{6-8} = BF; + let Inst{9-10} = 0; + let Inst{11-14} = 0; + let Inst{15} = W; + let Inst{16-19} = U; + let Inst{20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XLForm_S<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<1> S; + + let Pattern = pattern; + + let Inst{6-19} = 0; + let Inst{20} = S; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_2_and_DSForm_1<bits<6> opcode1, bits<10> xo1, bit lk, + bits<6> opcode2, bits<2> xo2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> { + bits<5> BO; + bits<5> BI; + bits<2> BH; + + bits<5> RST; + bits<19> DS_RA; + + let Pattern = pattern; + + let Inst{6-10} = BO; + let Inst{11-15} = BI; + let Inst{16-18} = 0; + let Inst{19-20} = BH; + let Inst{21-30} = xo1; + let Inst{31} = lk; + + let Inst{38-42} = RST; + let Inst{43-47} = DS_RA{18-14}; // Register # + let Inst{48-61} = DS_RA{13-0}; // Displacement. 
+ let Inst{62-63} = xo2; +} + +class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1, + bits<5> bo, bits<5> bi, bit lk, + bits<6> opcode2, bits<2> xo2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XLForm_2_and_DSForm_1<opcode1, xo1, lk, opcode2, xo2, + OOL, IOL, asmstr, itin, pattern> { + let BO = bo; + let BI = bi; + let BH = 0; +} + +// 1.7.8 XFX-Form +class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<10> SPR; + + let Inst{6-10} = RT; + let Inst{11} = SPR{4}; + let Inst{12} = SPR{3}; + let Inst{13} = SPR{2}; + let Inst{14} = SPR{1}; + let Inst{15} = SPR{0}; + let Inst{16} = SPR{9}; + let Inst{17} = SPR{8}; + let Inst{18} = SPR{7}; + let Inst{19} = SPR{6}; + let Inst{20} = SPR{5}; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, + dag OOL, dag IOL, string asmstr, InstrItinClass itin> + : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> { + let SPR = spr; +} + +class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + + let Inst{6-10} = RT; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_3p<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<10> Entry; + let Pattern = pattern; + + let Inst{6-10} = RT; + let Inst{11-20} = Entry; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<8> FXM; + bits<5> rS; + + let Inst{6-10} = rS; + let Inst{11} = 0; + let Inst{12-19} = FXM; + let Inst{20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> ST; + bits<8> FXM; + + let Inst{6-10} = ST; + let Inst{11} = 1; + let Inst{12-19} = FXM; + let Inst{20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>; + +class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, + dag OOL, dag IOL, string asmstr, InstrItinClass itin> + : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> { + let SPR = spr; +} + +// XFL-Form - MTFSF +// This is probably 1.7.9, but I don't have the reference that uses this +// numbering scheme... 
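+// mtfsf copies selected 4-bit fields of an FPR into the FPSCR; the 8-bit FM
+// mask in the class below selects which FPSCR fields are written.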
+class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag>pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<8> FM; + bits<5> rT; + + bit RC = 0; // set by isDOT + let Pattern = pattern; + + let Inst{6} = 0; + let Inst{7-14} = FM; + let Inst{15} = 0; + let Inst{16-20} = rT; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag>pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bit L; + bits<8> FLM; + bit W; + bits<5> FRB; + + bit RC = 0; // set by isDOT + let Pattern = pattern; + + let Inst{6} = L; + let Inst{7-14} = FLM; + let Inst{15} = W; + let Inst{16-20} = FRB; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +// 1.7.10 XS-Form - SRADI. +class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> RS; + bits<6> SH; + + bit RC = 0; // set by isDOT + let Pattern = pattern; + + let Inst{6-10} = RS; + let Inst{11-15} = A; + let Inst{16-20} = SH{4,3,2,1,0}; + let Inst{21-29} = xo; + let Inst{30} = SH{5}; + let Inst{31} = RC; +} + +// 1.7.11 XO-Form +class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<5> RA; + bits<5> RB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RT; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21} = oe; + let Inst{22-30} = xo; + let Inst{31} = RC; +} + +class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> { + let RB = 0; +} + +// 1.7.12 A-Form +class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> FRT; + bits<5> FRA; + bits<5> FRC; + bits<5> FRB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = FRT; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-25} = FRC; + let Inst{26-30} = xo; + let Inst{31} = RC; +} + +class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let FRC = 0; +} + +class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let FRB = 0; +} + +class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<5> RA; + bits<5> RB; + bits<5> COND; + + let Pattern = pattern; + + let Inst{6-10} = RT; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-25} = COND; + let Inst{26-30} = xo; + let Inst{31} = 0; +} + +// Used for QPX +class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let FRA = 0; + let FRC = 0; +} + +// 1.7.13 M-Form +class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<5> RB; + bits<5> MB; + 
bits<5> ME; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-25} = MB; + let Inst{26-30} = ME; + let Inst{31} = RC; +} + +class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> { +} + +// 1.7.14 MD-Form +class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<6> SH; + bits<6> MBE; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = SH{4,3,2,1,0}; + let Inst{21-26} = MBE{4,3,2,1,0,5}; + let Inst{27-29} = xo; + let Inst{30} = SH{5}; + let Inst{31} = RC; +} + +class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<5> RB; + bits<6> MBE; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-26} = MBE{4,3,2,1,0,5}; + let Inst{27-30} = xo; + let Inst{31} = RC; +} + + +// E-1 VA-Form + +// VAForm_1 - DACB ordering. +class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VC; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-25} = VC; + let Inst{26-31} = xo; +} + +// VAForm_1a - DABC ordering. +class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bits<5> VC; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-25} = VC; + let Inst{26-31} = xo; +} + +class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bits<4> SH; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21} = 0; + let Inst{22-25} = SH; + let Inst{26-31} = xo; +} + +// E-2 VX-Form +class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> { + let VA = VD; + let VB = VD; +} + + +class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = 0; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> IMM; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = IMM; + let Inst{16-20} = 0; + let 
Inst{21-31} = xo; +} + +/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr. +class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = 0; + let Inst{16-20} = 0; + let Inst{21-31} = xo; +} + +/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr. +class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = 0; + let Inst{11-15} = 0; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +/// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX" +class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<1> ST; + bits<4> SIX; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16} = ST; + let Inst{17-20} = SIX; + let Inst{21-31} = xo; +} + +/// VXForm_BX - VX crypto instructions with "VRT, VRA, 0 - like vsbox" +class VXForm_BX<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = 0; + let Inst{21-31} = xo; +} + +// E-4 VXR-Form +class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bit RC = 0; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21} = RC; + let Inst{22-31} = xo; +} + +// Z23-Form (used by QPX) +class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> FRT; + bits<5> FRA; + bits<5> FRB; + bits<2> idx; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = FRT; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-22} = idx; + let Inst{23-30} = xo; + let Inst{31} = RC; +} + +class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let FRB = 0; +} + +class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> FRT; + bits<12> idx; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = FRT; + let Inst{11-22} = idx; + let Inst{23-30} = xo; + let Inst{31} = RC; +} + +//===----------------------------------------------------------------------===// +class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern> + : I<0, OOL, IOL, asmstr, NoItinerary> { + let isCodeGenOnly = 1; + let PPC64 = 0; + let Pattern = pattern; + let Inst{31-0} = 0; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td new file mode 100644 index 0000000..6c4e212 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td @@ -0,0 +1,172 @@ +//===-- PPCInstrHTM.td - The PowerPC Hardware Transactional Memory -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is 
distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hardware Transactional Memory extension to the +// PowerPC instruction set. +// +//===----------------------------------------------------------------------===// + + + +def HasHTM : Predicate<"PPCSubTarget->hasHTM()">; + +def HTM_get_imm : SDNodeXForm<imm, [{ + return getI32Imm (N->getZExtValue(), SDLoc(N)); +}]>; + +let hasSideEffects = 1, usesCustomInserter = 1 in { +def TCHECK_RET : Pseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>; +} + + +let Predicates = [HasHTM] in { + +def TBEGIN : XForm_htm0 <31, 654, + (outs crrc0:$ret), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>; + +def TEND : XForm_htm1 <31, 686, + (outs crrc0:$ret), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>; + +def TABORT : XForm_base_r3xo <31, 910, + (outs crrc0:$ret), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR, + []>, isDOT { + let RST = 0; + let B = 0; +} + +def TABORTWC : XForm_base_r3xo <31, 782, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B), + "tabortwc. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TABORTWCI : XForm_base_r3xo <31, 846, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B), + "tabortwci. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TABORTDC : XForm_base_r3xo <31, 814, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B), + "tabortdc. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TABORTDCI : XForm_base_r3xo <31, 878, + (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B), + "tabortdci. $RTS, $A, $B", IIC_SprMTSPR, []>, + isDOT; + +def TSR : XForm_htm2 <31, 750, + (outs crrc0:$ret), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>, + isDOT; + +def TCHECK : XForm_htm3 <31, 718, + (outs), (ins crrc:$BF), "tcheck $BF", IIC_SprMTSPR, []>; + + +def TRECLAIM : XForm_base_r3xo <31, 942, + (outs crrc:$ret), (ins gprc:$A), "treclaim. $A", + IIC_SprMTSPR, []>, + isDOT { + let RST = 0; + let B = 0; +} + +def TRECHKPT : XForm_base_r3xo <31, 1006, + (outs crrc:$ret), (ins), "trechkpt.", IIC_SprMTSPR, []>, + isDOT { + let RST = 0; + let A = 0; + let B = 0; +} + +// Builtins + +// All HTM instructions, with the exception of tcheck, set CR0 with the +// value of the MSR Transaction State (TS) bits that exist before the +// instruction is executed. For tbegin., the EQ bit in CR0 can be used +// to determine whether the transaction was successfully started (0) or +// failed (1). We use an XORI pattern to 'flip' the bit to match the +// tbegin builtin API which defines a return value of 1 as success. 
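+// For example, the int_ppc_tbegin pattern below expands to TBEGIN (which
+// sets CR0), an EXTRACT_SUBREG that extracts CR0's EQ bit, and an XORI of
+// that bit with 1, so the builtin returns 1 exactly when the transaction
+// started.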
+ +def : Pat<(int_ppc_tbegin i32:$R), + (XORI + (EXTRACT_SUBREG ( + TBEGIN (HTM_get_imm imm:$R)), sub_eq), + 1)>; + +def : Pat<(int_ppc_tend i32:$R), + (TEND (HTM_get_imm imm:$R))>; + + +def : Pat<(int_ppc_tabort i32:$R), + (TABORT $R)>; + +def : Pat<(int_ppc_tabortwc i32:$TO, i32:$RA, i32:$RB), + (TABORTWC (HTM_get_imm imm:$TO), $RA, $RB)>; + +def : Pat<(int_ppc_tabortwci i32:$TO, i32:$RA, i32:$SI), + (TABORTWCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>; + +def : Pat<(int_ppc_tabortdc i32:$TO, i32:$RA, i32:$RB), + (TABORTDC (HTM_get_imm imm:$TO), $RA, $RB)>; + +def : Pat<(int_ppc_tabortdci i32:$TO, i32:$RA, i32:$SI), + (TABORTDCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>; + +def : Pat<(int_ppc_tcheck), + (TCHECK_RET)>; + +def : Pat<(int_ppc_treclaim i32:$RA), + (TRECLAIM $RA)>; + +def : Pat<(int_ppc_trechkpt), + (TRECHKPT)>; + +def : Pat<(int_ppc_tsr i32:$L), + (TSR (HTM_get_imm imm:$L))>; + +def : Pat<(int_ppc_get_texasr), + (MFSPR8 130)>; + +def : Pat<(int_ppc_get_texasru), + (MFSPR8 131)>; + +def : Pat<(int_ppc_get_tfhar), + (MFSPR8 128)>; + +def : Pat<(int_ppc_get_tfiar), + (MFSPR8 129)>; + + +def : Pat<(int_ppc_set_texasr i64:$V), + (MTSPR8 130, $V)>; + +def : Pat<(int_ppc_set_texasru i64:$V), + (MTSPR8 131, $V)>; + +def : Pat<(int_ppc_set_tfhar i64:$V), + (MTSPR8 128, $V)>; + +def : Pat<(int_ppc_set_tfiar i64:$V), + (MTSPR8 129, $V)>; + + +// Extended mnemonics +def : Pat<(int_ppc_tendall), + (TEND 1)>; + +def : Pat<(int_ppc_tresume), + (TSR 1)>; + +def : Pat<(int_ppc_tsuspend), + (TSR 0)>; + +def : Pat<(i64 (int_ppc_ttest)), + (RLDICL (i64 (COPY (TABORTWCI 0, ZERO, 0))), 36, 28)>; + +} // [HasHTM] diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp new file mode 100644 index 0000000..c17603a --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -0,0 +1,1828 @@ +//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" +#include "PPCHazardRecognizers.h" +#include "PPCInstrBuilder.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-instr-info" + +#define GET_INSTRMAP_INFO +#define GET_INSTRINFO_CTOR_DTOR +#include "PPCGenInstrInfo.inc" + +static cl:: +opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden, + cl::desc("Disable analysis for CTR loops")); + +static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt", +cl::desc("Disable compare instruction optimization"), cl::Hidden); + +static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy", +cl::desc("Causes the backend to crash instead of generating a nop VSX copy"), +cl::Hidden); + +static cl::opt<bool> +UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden, + cl::desc("Use the old (incorrect) instruction latency calculation")); + +// Pin the vtable to this file. +void PPCInstrInfo::anchor() {} + +PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) + : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), + Subtarget(STI), RI(STI.getTargetMachine()) {} + +/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for +/// this target when scheduling the DAG. +ScheduleHazardRecognizer * +PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const { + unsigned Directive = + static_cast<const PPCSubtarget *>(STI)->getDarwinDirective(); + if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 || + Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) { + const InstrItineraryData *II = + static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData(); + return new ScoreboardHazardRecognizer(II, DAG); + } + + return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); +} + +/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer +/// to use for this target when scheduling the DAG. +ScheduleHazardRecognizer * +PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + unsigned Directive = + DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective(); + + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8) + return new PPCDispatchGroupSBHazardRecognizer(II, DAG); + + // Most subtargets use a PPC970 recognizer. 
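+  // The exceptions are the in-order cores checked below (440, A2, e500mc,
+  // e5500), which use a plain itinerary-driven scoreboard instead.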
+  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+      Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
+    assert(DAG->TII && "No InstrInfo?");
+
+    return new PPCHazardRecognizer970(*DAG);
+  }
+
+  return new ScoreboardHazardRecognizer(II, DAG);
+}
+
+unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                       const MachineInstr *MI,
+                                       unsigned *PredCost) const {
+  if (!ItinData || UseOldLatencyCalc)
+    return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
+
+  // The default implementation of getInstrLatency calls getStageLatency, but
+  // getStageLatency does not do the right thing for us. While we have
+  // itinerary data, most cores are fully pipelined, and so the itineraries
+  // only express the first part of the pipeline, not every stage. Instead,
+  // we need to use the listed output operand cycle number (using operand 0
+  // here, which is an output).
+
+  unsigned Latency = 1;
+  unsigned DefClass = MI->getDesc().getSchedClass();
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
+      continue;
+
+    int Cycle = ItinData->getOperandCycle(DefClass, i);
+    if (Cycle < 0)
+      continue;
+
+    Latency = std::max(Latency, (unsigned) Cycle);
+  }
+
+  return Latency;
+}
+
+int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MachineInstr *DefMI, unsigned DefIdx,
+                                    const MachineInstr *UseMI,
+                                    unsigned UseIdx) const {
+  int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
+                                                   UseMI, UseIdx);
+
+  if (!DefMI->getParent())
+    return Latency;
+
+  const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+  unsigned Reg = DefMO.getReg();
+
+  bool IsRegCR;
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const MachineRegisterInfo *MRI =
+      &DefMI->getParent()->getParent()->getRegInfo();
+    IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
+              MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
+  } else {
+    IsRegCR = PPC::CRRCRegClass.contains(Reg) ||
+              PPC::CRBITRCRegClass.contains(Reg);
+  }
+
+  if (UseMI->isBranch() && IsRegCR) {
+    if (Latency < 0)
+      Latency = getInstrLatency(ItinData, DefMI);
+
+    // On some cores, there is an additional delay between writing to a
+    // condition register and using it from a branch.
+    unsigned Directive = Subtarget.getDarwinDirective();
+    switch (Directive) {
+    default: break;
+    case PPC::DIR_7400:
+    case PPC::DIR_750:
+    case PPC::DIR_970:
+    case PPC::DIR_E5500:
+    case PPC::DIR_PWR4:
+    case PPC::DIR_PWR5:
+    case PPC::DIR_PWR5X:
+    case PPC::DIR_PWR6:
+    case PPC::DIR_PWR6X:
+    case PPC::DIR_PWR7:
+    case PPC::DIR_PWR8:
+      Latency += 2;
+      break;
+    }
+  }
+
+  return Latency;
+}
+
+// This function does not list all associative and commutative operations, but
+// only those worth feeding through the machine combiner in an attempt to
+// reduce the critical path. Mostly, this means floating-point operations,
+// because they have high latencies (compared to other operations, such as
+// and/or, which are also associative and commutative, but have low
+// latencies).
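+// For example, reassociating ((a + b) + c) + d into (a + b) + (c + d)
+// shortens the critical path from three dependent FP adds to two, which
+// matters for multi-cycle FP pipelines but is rarely worth it for
+// single-cycle logical ops.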
+bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + // FP Add: + case PPC::FADD: + case PPC::FADDS: + // FP Multiply: + case PPC::FMUL: + case PPC::FMULS: + // Altivec Add: + case PPC::VADDFP: + // VSX Add: + case PPC::XSADDDP: + case PPC::XVADDDP: + case PPC::XVADDSP: + case PPC::XSADDSP: + // VSX Multiply: + case PPC::XSMULDP: + case PPC::XVMULDP: + case PPC::XVMULSP: + case PPC::XSMULSP: + // QPX Add: + case PPC::QVFADD: + case PPC::QVFADDS: + case PPC::QVFADDSs: + // QPX Multiply: + case PPC::QVFMUL: + case PPC::QVFMULS: + case PPC::QVFMULSs: + return true; + default: + return false; + } +} + +bool PPCInstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Using the machine combiner in this way is potentially expensive, so + // restrict to when aggressive optimizations are desired. + if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive) + return false; + + // FP reassociation is only legal when we don't need strict IEEE semantics. + if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) + return false; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + +// Detect 32 -> 64-bit extensions where we may reuse the low sub-register. +bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: return false; + case PPC::EXTSW: + case PPC::EXTSW_32_64: + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = PPC::sub_32; + return true; + } +} + +unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + // Note: This list must be kept consistent with LoadRegFromStackSlot. + switch (MI->getOpcode()) { + default: break; + case PPC::LD: + case PPC::LWZ: + case PPC::LFS: + case PPC::LFD: + case PPC::RESTORE_CR: + case PPC::RESTORE_CRBIT: + case PPC::LVX: + case PPC::LXVD2X: + case PPC::QVLFDX: + case PPC::QVLFSXs: + case PPC::QVLFDXb: + case PPC::RESTORE_VRSAVE: + // Check for the operands added by addFrameReference (the immediate is the + // offset which defaults to 0). + if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() && + MI->getOperand(2).isFI()) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + // Note: This list must be kept consistent with StoreRegToStackSlot. + switch (MI->getOpcode()) { + default: break; + case PPC::STD: + case PPC::STW: + case PPC::STFS: + case PPC::STFD: + case PPC::SPILL_CR: + case PPC::SPILL_CRBIT: + case PPC::STVX: + case PPC::STXVD2X: + case PPC::QVSTFDX: + case PPC::QVSTFSXs: + case PPC::QVSTFDXb: + case PPC::SPILL_VRSAVE: + // Check for the operands added by addFrameReference (the immediate is the + // offset which defaults to 0). + if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() && + MI->getOperand(2).isFI()) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + MachineFunction &MF = *MI->getParent()->getParent(); + + // Normal instructions can be commuted the obvious way. 
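+  // RLWIMI is the exception: swapping its register operands only preserves
+  // the result if the mask operands are inverted to match, which is handled
+  // below.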
+  if (MI->getOpcode() != PPC::RLWIMI &&
+      MI->getOpcode() != PPC::RLWIMIo)
+    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+  // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
+  // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
+  // changing the relative order of the mask operands might change what
+  // happens to the high-bits of the mask (and, thus, the result).
+
+  // Cannot commute if it has a non-zero rotate count.
+  if (MI->getOperand(3).getImm() != 0)
+    return nullptr;
+
+  // If we have a zero rotate count, we have:
+  //   M = mask(MB,ME)
+  //   Op0 = (Op1 & ~M) | (Op2 & M)
+  // Change this to:
+  //   M = mask((ME+1)&31, (MB-1)&31)
+  //   Op0 = (Op2 & ~M) | (Op1 & M)
+
+  // Swap op1/op2
+  assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
+         "Only the operands 1 and 2 can be swapped in RLWIMI/RLWIMIo.");
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  unsigned Reg2 = MI->getOperand(2).getReg();
+  unsigned SubReg1 = MI->getOperand(1).getSubReg();
+  unsigned SubReg2 = MI->getOperand(2).getSubReg();
+  bool Reg1IsKill = MI->getOperand(1).isKill();
+  bool Reg2IsKill = MI->getOperand(2).isKill();
+  bool ChangeReg0 = false;
+  // If machine instrs are no longer in two-address forms, update the
+  // destination register as well.
+  if (Reg0 == Reg1) {
+    // Must be two address instruction!
+    assert(MI->getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+           "Expecting a two-address instruction!");
+    assert(MI->getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
+    Reg2IsKill = false;
+    ChangeReg0 = true;
+  }
+
+  // Masks.
+  unsigned MB = MI->getOperand(4).getImm();
+  unsigned ME = MI->getOperand(5).getImm();
+
+  // We can't commute a trivial mask (there is no way to represent an all-zero
+  // mask).
+  if (MB == 0 && ME == 31)
+    return nullptr;
+
+  if (NewMI) {
+    // Create a new instruction.
+    unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg();
+    bool Reg0IsDead = MI->getOperand(0).isDead();
+    return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
+      .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+      .addReg(Reg2, getKillRegState(Reg2IsKill))
+      .addReg(Reg1, getKillRegState(Reg1IsKill))
+      .addImm((ME+1) & 31)
+      .addImm((MB-1) & 31);
+  }
+
+  if (ChangeReg0) {
+    MI->getOperand(0).setReg(Reg2);
+    MI->getOperand(0).setSubReg(SubReg2);
+  }
+  MI->getOperand(2).setReg(Reg1);
+  MI->getOperand(1).setReg(Reg2);
+  MI->getOperand(2).setSubReg(SubReg1);
+  MI->getOperand(1).setSubReg(SubReg2);
+  MI->getOperand(2).setIsKill(Reg1IsKill);
+  MI->getOperand(1).setIsKill(Reg2IsKill);
+
+  // Swap the mask around.
+  MI->getOperand(4).setImm((ME+1) & 31);
+  MI->getOperand(5).setImm((MB-1) & 31);
+  return MI;
+}
+
+bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                                         unsigned &SrcOpIdx2) const {
+  // For VSX A-Type FMA instructions, it is the first two operands that can be
+  // commuted; however, because the non-encoded tied input operand is listed
+  // first, the operands to swap are actually the second and third.
+
+  int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+  if (AltOpc == -1)
+    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+
+  // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1
+  // and SrcOpIdx2.
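+  // (For example, XSMADDADP's tied accumulator input is operand 1, so the
+  // two multiplicand operands that may legally be exchanged sit at indices
+  // 2 and 3.)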
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); +} + +void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // This function is used for scheduling, and the nop wanted here is the type + // that terminates dispatch groups on the POWER cores. + unsigned Directive = Subtarget.getDarwinDirective(); + unsigned Opcode; + switch (Directive) { + default: Opcode = PPC::NOP; break; + case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break; + case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break; + case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */ + } + + DebugLoc DL; + BuildMI(MBB, MI, DL, get(Opcode)); +} + +/// getNoopForMachoTarget - Return the noop instruction to use for a noop. +void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(PPC::NOP); +} + +// Branch analysis. +// Note: If the condition register is set to CTR or CTR8 then this is a +// BDNZ (imm == 1) or BDZ (imm == 0) branch. +bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + bool isPPC64 = Subtarget.isPPC64(); + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return false; + + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (LastInst->getOpcode() == PPC::B) { + if (!LastInst->getOperand(0).isMBB()) + return true; + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastInst->getOpcode() == PPC::BCC) { + if (!LastInst->getOperand(2).isMBB()) + return true; + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(2).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); + return false; + } else if (LastInst->getOpcode() == PPC::BC) { + if (!LastInst->getOperand(1).isMBB()) + return true; + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); + Cond.push_back(LastInst->getOperand(0)); + return false; + } else if (LastInst->getOpcode() == PPC::BCn) { + if (!LastInst->getOperand(1).isMBB()) + return true; + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET)); + Cond.push_back(LastInst->getOperand(0)); + return false; + } else if (LastInst->getOpcode() == PPC::BDNZ8 || + LastInst->getOpcode() == PPC::BDNZ) { + if (!LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(1)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + return false; + } else if (LastInst->getOpcode() == PPC::BDZ8 || + LastInst->getOpcode() == PPC::BDZ) { + if (!LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? 
PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    }
+
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with PPC::B and PPC::BCC, handle it.
+  if (SecondLastInst->getOpcode() == PPC::BCC &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(2).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(2).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst->getOpcode() == PPC::BC &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(1).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst->getOpcode() == PPC::BCn &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(1).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDNZ) &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(1));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDZ) &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(0));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two PPC::Bs, handle it. The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == PPC::B &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
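+  // (For reference: when this function does succeed, Cond uses one of three
+  // encodings: { predicate-imm, CR-register } for BCC, { PRED_BIT_SET or
+  // PRED_BIT_UNSET, CR-bit register } for BC/BCn, and { 1 or 0, CTR/CTR8 }
+  // for the counter-decrement branches. InsertBranch and insertSelect below
+  // consume the same encoding.)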
+ return true; +} + +unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return 0; + + if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC && + I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn && + I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ && + I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (I->getOpcode() != PPC::BCC && + I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn && + I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ && + I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +unsigned +PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "PPC branch conditions have two components!"); + + bool isPPC64 = Subtarget.isPPC64(); + + // One-way branch. + if (!FBB) { + if (Cond.empty()) // Unconditional branch + BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB); + else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_SET) + BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET) + BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB); + else // Conditional branch + BuildMI(&MBB, DL, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB); + return 1; + } + + // Two-way Conditional Branch. + if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_SET) + BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET) + BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB); + else + BuildMI(&MBB, DL, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB); + BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB); + return 2; +} + +// Select analysis. +bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, int &FalseCycles) const { + if (!Subtarget.hasISEL()) + return false; + + if (Cond.size() != 2) + return false; + + // If this is really a bdnz-like condition, then it cannot be turned into a + // select. + if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + return false; + + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // isel is for regular integer GPRs only. 
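+  // (An illustrative consequence: a select between two FPR or VR values fails
+  // this check, so the if-converter keeps branches for those cases.)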
+ if (!PPC::GPRCRegClass.hasSubClassEq(RC) && + !PPC::GPRC_NOR0RegClass.hasSubClassEq(RC) && + !PPC::G8RCRegClass.hasSubClassEq(RC) && + !PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) + return false; + + // FIXME: These numbers are for the A2, how well they work for other cores is + // an open question. On the A2, the isel instruction has a 2-cycle latency + // but single-cycle throughput. These numbers are used in combination with + // the MispredictPenalty setting from the active SchedMachineModel. + CondCycles = 1; + TrueCycles = 1; + FalseCycles = 1; + + return true; +} + +void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc dl, + unsigned DestReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const { + assert(Cond.size() == 2 && + "PPC branch conditions have two components!"); + + assert(Subtarget.hasISEL() && + "Cannot insert select on target without ISEL support"); + + // Get the register classes. + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + assert(RC && "TrueReg and FalseReg must have overlapping register classes"); + + bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) || + PPC::G8RC_NOX0RegClass.hasSubClassEq(RC); + assert((Is64Bit || + PPC::GPRCRegClass.hasSubClassEq(RC) || + PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) && + "isel is for regular integer GPRs only"); + + unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL; + unsigned SelectPred = Cond[0].getImm(); + + unsigned SubIdx; + bool SwapOps; + switch (SelectPred) { + default: llvm_unreachable("invalid predicate for isel"); + case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; + case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; + case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; + case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; + case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; + case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; + case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; + case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; + case PPC::PRED_BIT_SET: SubIdx = 0; SwapOps = false; break; + case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break; + } + + unsigned FirstReg = SwapOps ? FalseReg : TrueReg, + SecondReg = SwapOps ? TrueReg : FalseReg; + + // The first input register of isel cannot be r0. If it is a member + // of a register class that can be r0, then copy it first (the + // register allocator should eliminate the copy). + if (MRI.getRegClass(FirstReg)->contains(PPC::R0) || + MRI.getRegClass(FirstReg)->contains(PPC::X0)) { + const TargetRegisterClass *FirstRC = + MRI.getRegClass(FirstReg)->contains(PPC::X0) ? 
+      &PPC::G8RC_NOX0RegClass : &PPC::GPRC_NOR0RegClass;
+    unsigned OldFirstReg = FirstReg;
+    FirstReg = MRI.createVirtualRegister(FirstRC);
+    BuildMI(MBB, MI, dl, get(TargetOpcode::COPY), FirstReg)
+      .addReg(OldFirstReg);
+  }
+
+  BuildMI(MBB, MI, dl, get(OpCode), DestReg)
+    .addReg(FirstReg).addReg(SecondReg)
+    .addReg(Cond[1].getReg(), 0, SubIdx);
+}
+
+static unsigned getCRBitValue(unsigned CRBit) {
+  unsigned Ret = 4;
+  if (CRBit == PPC::CR0LT || CRBit == PPC::CR1LT ||
+      CRBit == PPC::CR2LT || CRBit == PPC::CR3LT ||
+      CRBit == PPC::CR4LT || CRBit == PPC::CR5LT ||
+      CRBit == PPC::CR6LT || CRBit == PPC::CR7LT)
+    Ret = 3;
+  if (CRBit == PPC::CR0GT || CRBit == PPC::CR1GT ||
+      CRBit == PPC::CR2GT || CRBit == PPC::CR3GT ||
+      CRBit == PPC::CR4GT || CRBit == PPC::CR5GT ||
+      CRBit == PPC::CR6GT || CRBit == PPC::CR7GT)
+    Ret = 2;
+  if (CRBit == PPC::CR0EQ || CRBit == PPC::CR1EQ ||
+      CRBit == PPC::CR2EQ || CRBit == PPC::CR3EQ ||
+      CRBit == PPC::CR4EQ || CRBit == PPC::CR5EQ ||
+      CRBit == PPC::CR6EQ || CRBit == PPC::CR7EQ)
+    Ret = 1;
+  if (CRBit == PPC::CR0UN || CRBit == PPC::CR1UN ||
+      CRBit == PPC::CR2UN || CRBit == PPC::CR3UN ||
+      CRBit == PPC::CR4UN || CRBit == PPC::CR5UN ||
+      CRBit == PPC::CR6UN || CRBit == PPC::CR7UN)
+    Ret = 0;
+
+  assert(Ret != 4 && "Invalid CR bit register");
+  return Ret;
+}
+
+void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  // We can end up with self copies and similar things as a result of VSX copy
+  // legalization. Promote them here.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (PPC::F8RCRegClass.contains(DestReg) &&
+      PPC::VSRCRegClass.contains(SrcReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && SrcReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    DestReg = SuperReg;
+  } else if (PPC::VRRCRegClass.contains(DestReg) &&
+             PPC::VSRCRegClass.contains(SrcReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(DestReg, PPC::sub_128, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && SrcReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    DestReg = SuperReg;
+  } else if (PPC::F8RCRegClass.contains(SrcReg) &&
+             PPC::VSRCRegClass.contains(DestReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && DestReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    SrcReg = SuperReg;
+  } else if (PPC::VRRCRegClass.contains(SrcReg) &&
+             PPC::VSRCRegClass.contains(DestReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(SrcReg, PPC::sub_128, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && DestReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    SrcReg = SuperReg;
+  }
+
+  // Different class register copy
+  if (PPC::CRBITRCRegClass.contains(SrcReg) &&
+      PPC::GPRCRegClass.contains(DestReg)) {
+    unsigned CRReg = getCRFromCRBit(SrcReg);
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg)
+      .addReg(CRReg, getKillRegState(KillSrc));
+    // Rotate the CR bit in the CR fields to be the least significant bit and
+    // then mask with 0x1 (MB = ME = 31).
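+    // Illustrative example with an arbitrarily chosen bit: for
+    // SrcReg = CR2EQ, CRReg is CR2 (encoding value 2) and getCRBitValue()
+    // returns 1, so after the mfocrf the EQ bit sits at bit 2*4 + 2 = 10
+    // (IBM numbering, bit 0 is the MSB). Rotating left by 2*4 + (4 - 1) = 11
+    // moves bit 10 to bit 31, and the MB = ME = 31 mask keeps just that bit.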
+    BuildMI(MBB, I, DL, get(PPC::RLWINM), DestReg)
+      .addReg(DestReg, RegState::Kill)
+      .addImm(TRI->getEncodingValue(CRReg) * 4 + (4 - getCRBitValue(SrcReg)))
+      .addImm(31)
+      .addImm(31);
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+             PPC::G8RCRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+             PPC::GPRCRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  unsigned Opc;
+  if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR;
+  else if (PPC::G8RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR8;
+  else if (PPC::F4RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::FMR;
+  else if (PPC::CRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::MCRF;
+  else if (PPC::VRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::VOR;
+  else if (PPC::VSRCRegClass.contains(DestReg, SrcReg))
+    // There are two different ways this can be done:
+    //   1. xxlor : This has lower latency (on the P7), 2 cycles, but can only
+    //      issue in VSU pipeline 0.
+    //   2. xmovdp/xmovsp: This has higher latency (on the P7), 6 cycles, but
+    //      can go to either pipeline.
+    // We'll always use xxlor here, because in practically all cases where
+    // copies are generated, they are close enough to some use that the
+    // lower-latency form is preferable.
+    Opc = PPC::XXLOR;
+  else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) ||
+           PPC::VSSRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::XXLORf;
+  else if (PPC::QFRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMR;
+  else if (PPC::QSRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRs;
+  else if (PPC::QBRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRb;
+  else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::CROR;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  const MCInstrDesc &MCID = get(Opc);
+  if (MCID.getNumOperands() == 3)
+    BuildMI(MBB, I, DL, MCID, DestReg)
+      .addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+// This function returns true if a CR spill is necessary and false otherwise.
+bool
+PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
+                                  unsigned SrcReg, bool isKill,
+                                  int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs,
+                                  bool &NonRI, bool &SpillsVRS) const {
+  // Note: If additional store instructions are added here,
+  // update isStoreToStackSlot.
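+  // (The boolean result matters here: only the SPILL_CR/SPILL_CRBIT cases
+  // below return true, and storeRegToStackSlot uses that to record the CR
+  // spill via PPCFunctionInfo::setSpillsCR().)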
+ + DebugLoc DL; + if (PPC::GPRCRegClass.hasSubClassEq(RC) || + PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::G8RCRegClass.hasSubClassEq(RC) || + PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + return true; + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CRBIT)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + return true; + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXVD2X)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXSDX)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXSSPX)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isDarwin() && + "VRSAVE only needs spill/restore on Darwin"); + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + SpillsVRS = true; + } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDX)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFSXs)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDXb)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else { + llvm_unreachable("Unknown regclass!"); + } + + return false; +} + +void +PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + SmallVector<MachineInstr*, 4> NewMIs; + + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setHasSpills(); + + bool NonRI = false, SpillsVRS = false; + if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs, + NonRI, SpillsVRS)) + FuncInfo->setSpillsCR(); 
+ + if (SpillsVRS) + FuncInfo->setSpillsVRSAVE(); + + if (NonRI) + FuncInfo->setHasNonRISpills(); + + for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) + MBB.insert(MI, NewMIs[i]); + + const MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + NewMIs.back()->addMemOperand(MF, MMO); +} + +bool +PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs, + bool &NonRI, bool &SpillsVRS) const{ + // Note: If additional load instructions are added here, + // update isLoadFromStackSlot. + + if (PPC::GPRCRegClass.hasSubClassEq(RC) || + PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), + DestReg), FrameIdx)); + } else if (PPC::G8RCRegClass.hasSubClassEq(RC) || + PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg), + FrameIdx)); + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg), + FrameIdx)); + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg), + FrameIdx)); + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, + get(PPC::RESTORE_CR), DestReg), + FrameIdx)); + return true; + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, + get(PPC::RESTORE_CRBIT), DestReg), + FrameIdx)); + return true; + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXVD2X), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXSDX), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXSSPX), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isDarwin() && + "VRSAVE only needs spill/restore on Darwin"); + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, + get(PPC::RESTORE_VRSAVE), + DestReg), + FrameIdx)); + SpillsVRS = true; + } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDX), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFSXs), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg), + FrameIdx)); + NonRI = true; + } else { + llvm_unreachable("Unknown regclass!"); + } + + return false; +} + +void +PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + 
SmallVector<MachineInstr*, 4> NewMIs; + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setHasSpills(); + + bool NonRI = false, SpillsVRS = false; + if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs, + NonRI, SpillsVRS)) + FuncInfo->setSpillsCR(); + + if (SpillsVRS) + FuncInfo->setSpillsVRSAVE(); + + if (NonRI) + FuncInfo->setHasNonRISpills(); + + for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) + MBB.insert(MI, NewMIs[i]); + + const MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + NewMIs.back()->addMemOperand(MF, MMO); +} + +bool PPCInstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 2 && "Invalid PPC branch opcode!"); + if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR) + Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0); + else + // Leave the CR# the same, but invert the condition. + Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm())); + return false; +} + +bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + // For some instructions, it is legal to fold ZERO into the RA register field. + // A zero immediate should always be loaded with a single li. + unsigned DefOpc = DefMI->getOpcode(); + if (DefOpc != PPC::LI && DefOpc != PPC::LI8) + return false; + if (!DefMI->getOperand(1).isImm()) + return false; + if (DefMI->getOperand(1).getImm() != 0) + return false; + + // Note that we cannot here invert the arguments of an isel in order to fold + // a ZERO into what is presented as the second argument. All we have here + // is the condition bit, and that might come from a CR-logical bit operation. + + const MCInstrDesc &UseMCID = UseMI->getDesc(); + + // Only fold into real machine instructions. + if (UseMCID.isPseudo()) + return false; + + unsigned UseIdx; + for (UseIdx = 0; UseIdx < UseMI->getNumOperands(); ++UseIdx) + if (UseMI->getOperand(UseIdx).isReg() && + UseMI->getOperand(UseIdx).getReg() == Reg) + break; + + assert(UseIdx < UseMI->getNumOperands() && "Cannot find Reg in UseMI"); + assert(UseIdx < UseMCID.getNumOperands() && "No operand description for Reg"); + + const MCOperandInfo *UseInfo = &UseMCID.OpInfo[UseIdx]; + + // We can fold the zero if this register requires a GPRC_NOR0/G8RC_NOX0 + // register (which might also be specified as a pointer class kind). + if (UseInfo->isLookupPtrRegClass()) { + if (UseInfo->RegClass /* Kind */ != 1) + return false; + } else { + if (UseInfo->RegClass != PPC::GPRC_NOR0RegClassID && + UseInfo->RegClass != PPC::G8RC_NOX0RegClassID) + return false; + } + + // Make sure this is not tied to an output register (or otherwise + // constrained). This is true for ST?UX registers, for example, which + // are tied to their output registers. + if (UseInfo->Constraints != 0) + return false; + + unsigned ZeroReg; + if (UseInfo->isLookupPtrRegClass()) { + bool isPPC64 = Subtarget.isPPC64(); + ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO; + } else { + ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ? 
+      PPC::ZERO8 : PPC::ZERO;
+  }
+
+  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+  UseMI->getOperand(UseIdx).setReg(ZeroReg);
+
+  if (DeleteDef)
+    DefMI->eraseFromParent();
+
+  return true;
+}
+
+static bool MBBDefinesCTR(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+       I != IE; ++I)
+    if (I->definesRegister(PPC::CTR) || I->definesRegister(PPC::CTR8))
+      return true;
+  return false;
+}
+
+// We should make sure that, if we're going to predicate both sides of a
+// condition (a diamond), that both sides don't define the counter register. We
+// can predicate counter-decrement-based branches, but while that predicates
+// the branching, it does not predicate the counter decrement. If we tried to
+// merge the diamond into one predicated block, we'd decrement the counter
+// twice.
+bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                     unsigned NumT, unsigned ExtraT,
+                     MachineBasicBlock &FMBB,
+                     unsigned NumF, unsigned ExtraF,
+                     BranchProbability Probability) const {
+  return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB));
+}
+
+
+bool PPCInstrInfo::isPredicated(const MachineInstr *MI) const {
+  // The predicated branches are identified by their type, not really by the
+  // explicit presence of a predicate. Furthermore, some of them can be
+  // predicated more than once. Because if-conversion won't try to predicate
+  // any instruction which already claims to be predicated (by returning true
+  // here), always return false. In doing so, we let isPredicable() be the
+  // final word on whether or not the instruction can be (further) predicated.
+
+  return false;
+}
+
+bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (!MI->isTerminator())
+    return false;
+
+  // Conditional branch is a special case.
+  if (MI->isBranch() && !MI->isBarrier())
+    return true;
+
+  return !isPredicated(MI);
+}
+
+bool PPCInstrInfo::PredicateInstruction(MachineInstr *MI,
+                                        ArrayRef<MachineOperand> Pred) const {
+  unsigned OpC = MI->getOpcode();
+  if (OpC == PPC::BLR || OpC == PPC::BLR8) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
+      bool isPPC64 = Subtarget.isPPC64();
+      MI->setDesc(get(Pred[0].getImm() ?
+                      (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) :
+                      (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MI->setDesc(get(PPC::BCLR));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MI->setDesc(get(PPC::BCLRn));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+    } else {
+      MI->setDesc(get(PPC::BCCLR));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addImm(Pred[0].getImm())
+        .addReg(Pred[1].getReg());
+    }
+
+    return true;
+  } else if (OpC == PPC::B) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
+      bool isPPC64 = Subtarget.isPPC64();
+      MI->setDesc(get(Pred[0].getImm() ?
+                      (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                      (isPPC64 ?
PPC::BDZ8 : PPC::BDZ)));
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+      MI->RemoveOperand(0);
+
+      MI->setDesc(get(PPC::BC));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg())
+        .addMBB(MBB);
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+      MI->RemoveOperand(0);
+
+      MI->setDesc(get(PPC::BCn));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg())
+        .addMBB(MBB);
+    } else {
+      MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+      MI->RemoveOperand(0);
+
+      MI->setDesc(get(PPC::BCC));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addImm(Pred[0].getImm())
+        .addReg(Pred[1].getReg())
+        .addMBB(MBB);
+    }
+
+    return true;
+  } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 ||
+             OpC == PPC::BCTRL || OpC == PPC::BCTRL8) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR)
+      llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
+
+    bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
+    bool isPPC64 = Subtarget.isPPC64();
+
+    if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) :
+                                (setLR ? PPC::BCCTRL : PPC::BCCTR)));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+      return true;
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) :
+                                (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addReg(Pred[1].getReg());
+      return true;
+    }
+
+    MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) :
+                              (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
+    MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+      .addImm(Pred[0].getImm())
+      .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  return false;
+}
+
+bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                                     ArrayRef<MachineOperand> Pred2) const {
+  assert(Pred1.size() == 2 && "Invalid PPC first predicate");
+  assert(Pred2.size() == 2 && "Invalid PPC second predicate");
+
+  if (Pred1[1].getReg() == PPC::CTR8 || Pred1[1].getReg() == PPC::CTR)
+    return false;
+  if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
+    return false;
+
+  // P1 can only subsume P2 if they test the same condition register.
+  if (Pred1[1].getReg() != Pred2[1].getReg())
+    return false;
+
+  PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
+  PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
+
+  if (P1 == P2)
+    return true;
+
+  // Does P1 subsume P2, e.g. GE subsumes GT.
+  if (P1 == PPC::PRED_LE &&
+      (P2 == PPC::PRED_LT || P2 == PPC::PRED_EQ))
+    return true;
+  if (P1 == PPC::PRED_GE &&
+      (P2 == PPC::PRED_GT || P2 == PPC::PRED_EQ))
+    return true;
+
+  return false;
+}
+
+bool PPCInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  // Note: At the present time, the contents of Pred from this function are
+  // unused by IfConversion. This implementation follows ARM by pushing the
+  // CR-defining operand. Because the 'DZ' and 'DNZ' count as types of
+  // predicate, instructions defining CTR or CTR8 are also included as
+  // predicate-defining instructions.
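+  // (Illustrative example: an mtctr feeding a counter-based loop defines CTR,
+  // so it is reported here as predicate-defining even though it is not a
+  // comparison.)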
+
+  const TargetRegisterClass *RCs[] =
+    { &PPC::CRRCRegClass, &PPC::CRBITRCRegClass,
+      &PPC::CTRRCRegClass, &PPC::CTRRC8RegClass };
+
+  bool Found = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) {
+      const TargetRegisterClass *RC = RCs[c];
+      if (MO.isReg()) {
+        if (MO.isDef() && RC->contains(MO.getReg())) {
+          Pred.push_back(MO);
+          Found = true;
+        }
+      } else if (MO.isRegMask()) {
+        for (TargetRegisterClass::iterator I = RC->begin(),
+             IE = RC->end(); I != IE; ++I)
+          if (MO.clobbersPhysReg(*I)) {
+            Pred.push_back(MO);
+            Found = true;
+          }
+      }
+    }
+  }
+
+  return Found;
+}
+
+bool PPCInstrInfo::isPredicable(MachineInstr *MI) const {
+  unsigned OpC = MI->getOpcode();
+  switch (OpC) {
+  default:
+    return false;
+  case PPC::B:
+  case PPC::BLR:
+  case PPC::BLR8:
+  case PPC::BCTR:
+  case PPC::BCTR8:
+  case PPC::BCTRL:
+  case PPC::BCTRL8:
+    return true;
+  }
+}
+
+bool PPCInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                  unsigned &SrcReg, unsigned &SrcReg2,
+                                  int &Mask, int &Value) const {
+  unsigned Opc = MI->getOpcode();
+
+  switch (Opc) {
+  default: return false;
+  case PPC::CMPWI:
+  case PPC::CMPLWI:
+  case PPC::CMPDI:
+  case PPC::CMPLDI:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    Value = MI->getOperand(2).getImm();
+    Mask = 0xFFFF;
+    return true;
+  case PPC::CMPW:
+  case PPC::CMPLW:
+  case PPC::CMPD:
+  case PPC::CMPLD:
+  case PPC::FCMPUS:
+  case PPC::FCMPUD:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    return true;
+  }
+}
+
+bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
+                                        unsigned SrcReg, unsigned SrcReg2,
+                                        int Mask, int Value,
+                                        const MachineRegisterInfo *MRI) const {
+  if (DisableCmpOpt)
+    return false;
+
+  int OpC = CmpInstr->getOpcode();
+  unsigned CRReg = CmpInstr->getOperand(0).getReg();
+
+  // FP record forms set CR1 based on the exception status bits, not a
+  // comparison with zero.
+  if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
+    return false;
+
+  // The record forms set the condition register based on a signed comparison
+  // with zero (so says the ISA manual). This is not as straightforward as it
+  // seems, however, because this is always a 64-bit comparison on PPC64, even
+  // for instructions that are 32-bit in nature (like slw for example).
+  // So, on PPC32, for unsigned comparisons, we can use the record forms only
+  // for equality checks (as those don't depend on the sign). On PPC64,
+  // we are restricted to equality for unsigned 64-bit comparisons and, for
+  // signed 32-bit comparisons, the applicability is more restricted.
+  bool isPPC64 = Subtarget.isPPC64();
+  bool is32BitSignedCompare   = OpC == PPC::CMPWI || OpC == PPC::CMPW;
+  bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW;
+  bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+  int MIOpC = MI->getOpcode();
+
+  bool equalityOnly = false;
+  bool noSub = false;
+  if (isPPC64) {
+    if (is32BitSignedCompare) {
+      // We can perform this optimization only if MI is sign-extending.
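+      // Illustrative reasoning: extsb, for example, produces a value that is
+      // already sign-extended to 64 bits, so the record form's 64-bit signed
+      // comparison with zero agrees with the 32-bit comparison that CMPWI or
+      // CMPW would perform; a plain (non-extending) add, by contrast, could
+      // leave different high bits and give the wrong CR result.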
+ if (MIOpC == PPC::SRAW || MIOpC == PPC::SRAWo || + MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo || + MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo || + MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo || + MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) { + noSub = true; + } else + return false; + } else if (is32BitUnsignedCompare) { + // We can perform this optimization, equality only, if MI is + // zero-extending. + if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo || + MIOpC == PPC::SLW || MIOpC == PPC::SLWo || + MIOpC == PPC::SRW || MIOpC == PPC::SRWo) { + noSub = true; + equalityOnly = true; + } else + return false; + } else + equalityOnly = is64BitUnsignedCompare; + } else + equalityOnly = is32BitUnsignedCompare; + + if (equalityOnly) { + // We need to check the uses of the condition register in order to reject + // non-equality comparisons. + for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg), + IE = MRI->use_instr_end(); I != IE; ++I) { + MachineInstr *UseMI = &*I; + if (UseMI->getOpcode() == PPC::BCC) { + unsigned Pred = UseMI->getOperand(0).getImm(); + if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE) + return false; + } else if (UseMI->getOpcode() == PPC::ISEL || + UseMI->getOpcode() == PPC::ISEL8) { + unsigned SubIdx = UseMI->getOperand(3).getSubReg(); + if (SubIdx != PPC::sub_eq) + return false; + } else + return false; + } + } + + MachineBasicBlock::iterator I = CmpInstr; + + // Scan forward to find the first use of the compare. + for (MachineBasicBlock::iterator EL = CmpInstr->getParent()->end(); + I != EL; ++I) { + bool FoundUse = false; + for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg), + JE = MRI->use_instr_end(); J != JE; ++J) + if (&*J == &*I) { + FoundUse = true; + break; + } + + if (FoundUse) + break; + } + + // There are two possible candidates which can be changed to set CR[01]. + // One is MI, the other is a SUB instruction. + // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). + MachineInstr *Sub = nullptr; + if (SrcReg2 != 0) + // MI is not a candidate for CMPrr. + MI = nullptr; + // FIXME: Conservatively refuse to convert an instruction which isn't in the + // same BB as the comparison. This is to allow the check below to avoid calls + // (and other explicit clobbers); instead we should really check for these + // more explicitly (in at least a few predecessors). + else if (MI->getParent() != CmpInstr->getParent() || Value != 0) { + // PPC does not have a record-form SUBri. + return false; + } + + // Search for Sub. + const TargetRegisterInfo *TRI = &getRegisterInfo(); + --I; + + // Get ready to iterate backward from CmpInstr. + MachineBasicBlock::iterator E = MI, + B = CmpInstr->getParent()->begin(); + + for (; I != E && !noSub; --I) { + const MachineInstr &Instr = *I; + unsigned IOpC = Instr.getOpcode(); + + if (&*I != CmpInstr && ( + Instr.modifiesRegister(PPC::CR0, TRI) || + Instr.readsRegister(PPC::CR0, TRI))) + // This instruction modifies or uses the record condition register after + // the one we want to change. While we could do this transformation, it + // would likely not be profitable. This transformation removes one + // instruction, and so even forcing RA to generate one move probably + // makes it unprofitable. + return false; + + // Check whether CmpInstr can be made redundant by the current instruction. 
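+    // Illustrative example with arbitrary register names: for
+    // "cmpw r2, r3", a preceding "subf r4, r3, r2" computes r2 - r3, so its
+    // record form sets CR0 from the same comparison; "subf r4, r2, r3"
+    // (r3 - r2) also qualifies, provided the users' predicates are swapped,
+    // which the ShouldSwap logic below arranges.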
+ if ((OpC == PPC::CMPW || OpC == PPC::CMPLW || + OpC == PPC::CMPD || OpC == PPC::CMPLD) && + (IOpC == PPC::SUBF || IOpC == PPC::SUBF8) && + ((Instr.getOperand(1).getReg() == SrcReg && + Instr.getOperand(2).getReg() == SrcReg2) || + (Instr.getOperand(1).getReg() == SrcReg2 && + Instr.getOperand(2).getReg() == SrcReg))) { + Sub = &*I; + break; + } + + if (I == B) + // The 'and' is below the comparison instruction. + return false; + } + + // Return false if no candidates exist. + if (!MI && !Sub) + return false; + + // The single candidate is called MI. + if (!MI) MI = Sub; + + int NewOpC = -1; + MIOpC = MI->getOpcode(); + if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8) + NewOpC = MIOpC; + else { + NewOpC = PPC::getRecordFormOpcode(MIOpC); + if (NewOpC == -1 && PPC::getNonRecordFormOpcode(MIOpC) != -1) + NewOpC = MIOpC; + } + + // FIXME: On the non-embedded POWER architectures, only some of the record + // forms are fast, and we should use only the fast ones. + + // The defining instruction has a record form (or is already a record + // form). It is possible, however, that we'll need to reverse the condition + // code of the users. + if (NewOpC == -1) + return false; + + SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate; + SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate; + + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP + // needs to be updated to be based on SUB. Push the condition code + // operands to OperandsToUpdate. If it is safe to remove CmpInstr, the + // condition code of these operands will be modified. + bool ShouldSwap = false; + if (Sub) { + ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg; + + // The operands to subf are the opposite of sub, so only in the fixed-point + // case, invert the order. + ShouldSwap = !ShouldSwap; + } + + if (ShouldSwap) + for (MachineRegisterInfo::use_instr_iterator + I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end(); + I != IE; ++I) { + MachineInstr *UseMI = &*I; + if (UseMI->getOpcode() == PPC::BCC) { + PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm(); + assert((!equalityOnly || + Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) && + "Invalid predicate for equality-only optimization"); + PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), + PPC::getSwappedPredicate(Pred))); + } else if (UseMI->getOpcode() == PPC::ISEL || + UseMI->getOpcode() == PPC::ISEL8) { + unsigned NewSubReg = UseMI->getOperand(3).getSubReg(); + assert((!equalityOnly || NewSubReg == PPC::sub_eq) && + "Invalid CR bit for equality-only optimization"); + + if (NewSubReg == PPC::sub_lt) + NewSubReg = PPC::sub_gt; + else if (NewSubReg == PPC::sub_gt) + NewSubReg = PPC::sub_lt; + + SubRegsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(3)), + NewSubReg)); + } else // We need to abort on a user we don't understand. + return false; + } + + // Create a new virtual register to hold the value of the CR set by the + // record-form instruction. If the instruction was not previously in + // record form, then set the kill flag on the CR. + CmpInstr->eraseFromParent(); + + MachineBasicBlock::iterator MII = MI; + BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(), + get(TargetOpcode::COPY), CRReg) + .addReg(PPC::CR0, MIOpC != NewOpC ? 
RegState::Kill : 0); + + if (MIOpC != NewOpC) { + // We need to be careful here: we're replacing one instruction with + // another, and we need to make sure that we get all of the right + // implicit uses and defs. On the other hand, the caller may be holding + // an iterator to this instruction, and so we can't delete it (this is + // specifically the case if this is the instruction directly after the + // compare). + + const MCInstrDesc &NewDesc = get(NewOpC); + MI->setDesc(NewDesc); + + if (NewDesc.ImplicitDefs) + for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs(); + *ImpDefs; ++ImpDefs) + if (!MI->definesRegister(*ImpDefs)) + MI->addOperand(*MI->getParent()->getParent(), + MachineOperand::CreateReg(*ImpDefs, true, true)); + if (NewDesc.ImplicitUses) + for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses(); + *ImpUses; ++ImpUses) + if (!MI->readsRegister(*ImpUses)) + MI->addOperand(*MI->getParent()->getParent(), + MachineOperand::CreateReg(*ImpUses, false, true)); + } + + // Modify the condition code of operands in OperandsToUpdate. + // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to + // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + for (unsigned i = 0, e = PredsToUpdate.size(); i < e; i++) + PredsToUpdate[i].first->setImm(PredsToUpdate[i].second); + + for (unsigned i = 0, e = SubRegsToUpdate.size(); i < e; i++) + SubRegsToUpdate[i].first->setSubReg(SubRegsToUpdate[i].second); + + return true; +} + +/// GetInstSize - Return the number of bytes of code the specified +/// instruction may be. This returns the maximum number of bytes. +/// +unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + unsigned Opcode = MI->getOpcode(); + + if (Opcode == PPC::INLINEASM) { + const MachineFunction *MF = MI->getParent()->getParent(); + const char *AsmStr = MI->getOperand(0).getSymbolName(); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + } else if (Opcode == TargetOpcode::STACKMAP) { + return MI->getOperand(1).getImm(); + } else if (Opcode == TargetOpcode::PATCHPOINT) { + PatchPointOpers Opers(MI); + return Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + } else { + const MCInstrDesc &Desc = get(Opcode); + return Desc.getSize(); + } +} + +std::pair<unsigned, unsigned> +PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = PPCII::MO_ACCESS_MASK; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_LO, "ppc-lo"}, + {MO_HA, "ppc-ha"}, + {MO_TPREL_LO, "ppc-tprel-lo"}, + {MO_TPREL_HA, "ppc-tprel-ha"}, + {MO_DTPREL_LO, "ppc-dtprel-lo"}, + {MO_TLSLD_LO, "ppc-tlsld-lo"}, + {MO_TOC_LO, "ppc-toc-lo"}, + {MO_TLS, "ppc-tls"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PLT_OR_STUB, "ppc-plt-or-stub"}, + {MO_PIC_FLAG, "ppc-pic"}, + {MO_NLP_FLAG, "ppc-nlp"}, + {MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}}; + return makeArrayRef(TargetFlags); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h new file mode 100644 index 0000000..c3c3a48 --- /dev/null +++ 
b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -0,0 +1,279 @@ +//===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H +#define LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H + +#include "PPC.h" +#include "PPCRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "PPCGenInstrInfo.inc" + +namespace llvm { + +/// PPCII - This namespace holds all of the PowerPC target-specific +/// per-instruction flags. These must match the corresponding definitions in +/// PPC.td and PPCInstrFormats.td. +namespace PPCII { +enum { + // PPC970 Instruction Flags. These flags describe the characteristics of the + // PowerPC 970 (aka G5) dispatch groups and how they are formed out of + // raw machine instructions. + + /// PPC970_First - This instruction starts a new dispatch group, so it will + /// always be the first one in the group. + PPC970_First = 0x1, + + /// PPC970_Single - This instruction starts a new dispatch group and + /// terminates it, so it will be the sole instruction in the group. + PPC970_Single = 0x2, + + /// PPC970_Cracked - This instruction is cracked into two pieces, requiring + /// two dispatch pipes to be available to issue. + PPC970_Cracked = 0x4, + + /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that + /// an instruction is issued to. + PPC970_Shift = 3, + PPC970_Mask = 0x07 << PPC970_Shift +}; +enum PPC970_Unit { + /// These are the various PPC970 execution unit pipelines. Each instruction + /// is one of these. + PPC970_Pseudo = 0 << PPC970_Shift, // Pseudo instruction + PPC970_FXU = 1 << PPC970_Shift, // Fixed Point (aka Integer/ALU) Unit + PPC970_LSU = 2 << PPC970_Shift, // Load Store Unit + PPC970_FPU = 3 << PPC970_Shift, // Floating Point Unit + PPC970_CRU = 4 << PPC970_Shift, // Control Register Unit + PPC970_VALU = 5 << PPC970_Shift, // Vector ALU + PPC970_VPERM = 6 << PPC970_Shift, // Vector Permute Unit + PPC970_BRU = 7 << PPC970_Shift // Branch Unit +}; +} // end namespace PPCII + +class PPCSubtarget; +class PPCInstrInfo : public PPCGenInstrInfo { + PPCSubtarget &Subtarget; + const PPCRegisterInfo RI; + + bool StoreRegToStackSlot(MachineFunction &MF, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs, + bool &NonRI, bool &SpillsVRS) const; + bool LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs, + bool &NonRI, bool &SpillsVRS) const; + virtual void anchor(); + +protected: + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for + /// non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. 
+ /// + /// For example, we can commute rlwimi instructions, but only if the + /// rotate amt is zero. We also have to munge the immediates a bit. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + +public: + explicit PPCInstrInfo(PPCSubtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const PPCRegisterInfo &getRegisterInfo() const { return RI; } + + ScheduleHazardRecognizer * + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const override; + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const override; + + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const override; + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const override { + return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx, + UseNode, UseIdx); + } + + bool hasLowDefLatency(const TargetSchedModel &SchedModel, + const MachineInstr *DefMI, + unsigned DefIdx) const override { + // Machine LICM should hoist all instructions in low-register-pressure + // situations; none are sufficiently free to justify leaving in a loop + // body. + return false; + } + + bool useMachineCombiner() const override { + return true; + } + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. All potential patterns are + /// output in the <Pattern> array. + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &P) const override; + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + + bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const override; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + + + // Branch analysis. + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + + // Select analysis. 
+ bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, + unsigned, unsigned, int &, int &, int &) const override; + void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const override; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const override; + + // If conversion by predication (only supported by some branch instructions). + // All of the profitability checks always return true; it is always + // profitable to use the predicated branches. + bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + BranchProbability Probability) const override { + return true; + } + + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumT, unsigned ExtraT, + MachineBasicBlock &FMBB, + unsigned NumF, unsigned ExtraF, + BranchProbability Probability) const override; + + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override { + return true; + } + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override { + return false; + } + + // Predication support. + bool isPredicated(const MachineInstr *MI) const override; + + bool isUnpredicatedTerminator(const MachineInstr *MI) const override; + + bool PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Pred) const override; + + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; + + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const override; + + bool isPredicable(MachineInstr *MI) const override; + + // Comparison optimization. + + + bool analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const override; + + bool optimizeCompareInstr(MachineInstr *CmpInstr, + unsigned SrcReg, unsigned SrcReg2, + int Mask, int Value, + const MachineRegisterInfo *MRI) const override; + + /// GetInstSize - Return the number of bytes of code the specified + /// instruction may be. This returns the maximum number of bytes. 
+  ///
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  std::pair<unsigned, unsigned>
+  decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableDirectMachineOperandTargetFlags() const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableBitmaskMachineOperandTargetFlags() const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 0000000..6c4364a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,4127 @@
+//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                         SDTCisVT<1, i32> ]>;
+def SDT_PPCvperm : SDTypeProfile<1, 3, [
+  SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+  SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClbrx : SDTypeProfile<1, 2, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 3, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
+  SDTCisPtrTy<0>, SDTCisVT<1, i32>
+]>;
+
+def tocentry32 : Operand<iPTR> {
+  let MIOperandInfo = (ops i32imm:$imm);
+}
+
+def SDT_PPCqvfperm : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVec<3>
+]>;
+def SDT_PPCqvgpci : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisInt<1>
+]>;
+def SDT_PPCqvaligni : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>
+]>;
+def SDT_PPCqvesplati : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>
+]>;
+
+def SDT_PPCqbflt : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisVec<1>
+]>;
+
+def SDT_PPCqvlfsb : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+// + +def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; +def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; + +def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; +def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; +def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>; +def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>; +def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>; +def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; +def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>; +def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; +def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, + [SDNPHasChain, SDNPMayStore]>; +def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx, + [SDNPHasChain, SDNPMayLoad]>; +def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx, + [SDNPHasChain, SDNPMayLoad]>; + +// Extract FPSCR (not modeled at the DAG level). +def PPCmffs : SDNode<"PPCISD::MFFS", + SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>; + +// Perform FADD in round-to-zero mode. +def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>; + + +def PPCfsel : SDNode<"PPCISD::FSEL", + // Type constraint for fsel. + SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; + +def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; +def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; +def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, + [SDNPMayLoad, SDNPMemOperand]>; +def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>; +def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>; + +def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>; + +def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>; +def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp, + [SDNPMayLoad]>; +def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>; +def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>; +def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>; +def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>; +def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR", + SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; +def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>; +def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>; +def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>; +def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR", + SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; +def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>; +def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; + +def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; + +def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; +def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>; +def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>; +def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>; + +def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>; + +def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb, + [SDNPHasChain, SDNPMayLoad]>; + +def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>; + +// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift +// amounts. These nodes are generated by the multi-precision shift code. 
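+// The native slw/srw/sraw instructions already interpret the amount as six
+// bits (amounts 32-63 yield zero, or all sign bits for sraw), which is what
+// the multi-precision expansion relies on.  A sketch of the high word of a
+// 64-bit shift left by n, 0 <= n < 64, built from these nodes:
+//
+//   hi' = (hi << n) | (lo >> (32 - n)) | (lo << (n - 32))
+//
+// Whenever a term's 6-bit amount is 32 or more it contributes zero, so no
+// compare-and-branch on the shift amount is needed.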
+def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>; +def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>; +def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>; + +// These are target-independent nodes, but have target-specific formats. +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>; +def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC", + SDTypeProfile<0, 1, []>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + +def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def PPCeh_sjlj_setjmp : SDNode<"PPCISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc, + [SDNPHasChain, SDNPSideEffect]>; + +def PPCclrbhrb : SDNode<"PPCISD::CLRBHRB", SDTNone, + [SDNPHasChain, SDNPSideEffect]>; +def PPCmfbhrbe : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>; +def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc, + [SDNPHasChain, SDNPSideEffect]>; + +def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; +def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>; + +def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, + [SDNPHasChain, SDNPOptInGlue]>; + +def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, + [SDNPHasChain, SDNPMayLoad]>; +def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, + [SDNPHasChain, SDNPMayStore]>; + +// Instructions to set/unset CR bit 6 for SVR4 vararg calls +def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +// Instructions to support dynamic alloca. +def SDTDynOp : SDTypeProfile<1, 2, []>; +def SDTDynAreaOp : SDTypeProfile<1, 1, []>; +def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; +def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// PowerPC specific transformation functions and pattern fragments. +// + +def SHL32 : SDNodeXForm<imm, [{ + // Transformation function: 31 - imm + return getI32Imm(31 - N->getZExtValue(), SDLoc(N)); +}]>; + +def SRL32 : SDNodeXForm<imm, [{ + // Transformation function: 32 - imm + return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue(), SDLoc(N)) + : getI32Imm(0, SDLoc(N)); +}]>; + +def LO16 : SDNodeXForm<imm, [{ + // Transformation function: get the low 16 bits. 
+  return getI32Imm((unsigned short)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def HI16  : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned)N->getZExtValue() >> 16, SDLoc(N));
+}]>;
+
+def HA16  : SDNodeXForm<imm, [{
+  // Transformation function: return the high 16 bits, adjusted for the carry
+  // that materializing the low 16 bits as a signed value will produce.
+  // E.g. for 0x12348000, HA16 is 0x1235: (0x1235 << 16) + (-0x8000)
+  // reconstructs the original value.
+  signed int Val = N->getZExtValue();
+  return getI32Imm((Val - (signed short)Val) >> 16, SDLoc(N));
+}]>;
+def MB : SDNodeXForm<imm, [{
+  // Transformation function: get the start bit of a mask.
+  unsigned mb = 0, me;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(mb, SDLoc(N));
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+  // Transformation function: get the end bit of a mask.
+  unsigned mb, me = 0;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(me, SDLoc(N));
+}]>;
+def maskimm32 : PatLeaf<(imm), [{
+  // maskimm32 predicate - True if the immediate is a run of ones.
+  unsigned mb, me;
+  if (N->getValueType(0) == MVT::i32)
+    return isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  else
+    return false;
+}]>;
+
+def imm32SExt16 : Operand<i32>, ImmLeaf<i32, [{
+  // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int32_t)Imm == (short)Imm;
+}]>;
+def imm64SExt16 : Operand<i64>, ImmLeaf<i64, [{
+  // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int64_t)Imm == (short)Imm;
+}]>;
+def immZExt16  : PatLeaf<(imm), [{
+  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+  // field.  Used by instructions like 'ori'.
+  return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero.  There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt.  These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+  // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'xoris'.
+  return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+  // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'addis'.  Identical to
+  // imm16ShiftedZExt in 32-bit mode.
+  if (N->getZExtValue() & 0xFFFF) return false;
+  if (N->getValueType(0) == MVT::i32)
+    return true;
+  // For 64-bit, make sure it is sign-extended correctly.
+  return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
+}], HI16>;
+
+def imm64ZExt32  : Operand<i64>, ImmLeaf<i64, [{
+  // imm64ZExt32 predicate - True if the i64 immediate fits in a 32-bit
+  // zero extended field.
+  return isUInt<32>(Imm);
+}]>;
+
+// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
+// restricted memrix (4-aligned) constants are alignment sensitive.  If these
+// offsets are hidden behind TOC entries then the values of the lower-order
+// bits cannot be checked directly.  As a result, we need to also incorporate
+// an alignment check into the relevant patterns.
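+// For illustration: in a DS-form encoding such as LD/STD the two low bits
+// of the 16-bit displacement field are opcode bits, so only displacements
+// that are multiples of 4 can be encoded ("ld 3, 6(4)" has no encoding).
+// When the displacement is supplied by a TOC entry its value is not visible
+// here, so the fragments below key on the access alignment instead: a
+// 4-byte-aligned access can always be given a 4-aligned displacement.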
+ +def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned4store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned4pre_store : PatFrag< + (ops node:$val, node:$base, node:$offset), + (pre_store node:$val, node:$base, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; + +def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() < 4; +}]>; +def unaligned4store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() < 4; +}]>; +def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() < 4; +}]>; + +//===----------------------------------------------------------------------===// +// PowerPC Flag Definitions. + +class isPPC64 { bit PPC64 = 1; } +class isDOT { bit RC = 1; } + +class RegConstraint<string C> { + string Constraints = C; +} +class NoEncode<string E> { + string DisableEncoding = E; +} + + +//===----------------------------------------------------------------------===// +// PowerPC Operand Definitions. + +// In the default PowerPC assembler syntax, registers are specified simply +// by number, so they cannot be distinguished from immediate values (without +// looking at the opcode). This means that the default operand matching logic +// for the asm parser does not work, and we need to specify custom matchers. +// Since those can only be specified with RegisterOperand classes and not +// directly on the RegisterClass, all instructions patterns used by the asm +// parser need to use a RegisterOperand (instead of a RegisterClass) for +// all their register operands. +// For this purpose, we define one RegisterOperand for each RegisterClass, +// using the same name as the class, just in lower case. 
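+// For example, in "addi 3, 4, 10" the tokens 3 and 4 name registers while
+// 10 is an immediate; only the opcode says which is which.  The
+// PredicateMethod hooks named below (isRegNumber, isCRBitNumber, ...) are
+// implemented in PPCAsmParser.cpp and, roughly, just check that the parsed
+// operand is an integer within the class's register-number range.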
+ +def PPCRegGPRCAsmOperand : AsmOperandClass { + let Name = "RegGPRC"; let PredicateMethod = "isRegNumber"; +} +def gprc : RegisterOperand<GPRC> { + let ParserMatchClass = PPCRegGPRCAsmOperand; +} +def PPCRegG8RCAsmOperand : AsmOperandClass { + let Name = "RegG8RC"; let PredicateMethod = "isRegNumber"; +} +def g8rc : RegisterOperand<G8RC> { + let ParserMatchClass = PPCRegG8RCAsmOperand; +} +def PPCRegGPRCNoR0AsmOperand : AsmOperandClass { + let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber"; +} +def gprc_nor0 : RegisterOperand<GPRC_NOR0> { + let ParserMatchClass = PPCRegGPRCNoR0AsmOperand; +} +def PPCRegG8RCNoX0AsmOperand : AsmOperandClass { + let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber"; +} +def g8rc_nox0 : RegisterOperand<G8RC_NOX0> { + let ParserMatchClass = PPCRegG8RCNoX0AsmOperand; +} +def PPCRegF8RCAsmOperand : AsmOperandClass { + let Name = "RegF8RC"; let PredicateMethod = "isRegNumber"; +} +def f8rc : RegisterOperand<F8RC> { + let ParserMatchClass = PPCRegF8RCAsmOperand; +} +def PPCRegF4RCAsmOperand : AsmOperandClass { + let Name = "RegF4RC"; let PredicateMethod = "isRegNumber"; +} +def f4rc : RegisterOperand<F4RC> { + let ParserMatchClass = PPCRegF4RCAsmOperand; +} +def PPCRegVRRCAsmOperand : AsmOperandClass { + let Name = "RegVRRC"; let PredicateMethod = "isRegNumber"; +} +def vrrc : RegisterOperand<VRRC> { + let ParserMatchClass = PPCRegVRRCAsmOperand; +} +def PPCRegCRBITRCAsmOperand : AsmOperandClass { + let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber"; +} +def crbitrc : RegisterOperand<CRBITRC> { + let ParserMatchClass = PPCRegCRBITRCAsmOperand; +} +def PPCRegCRRCAsmOperand : AsmOperandClass { + let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber"; +} +def crrc : RegisterOperand<CRRC> { + let ParserMatchClass = PPCRegCRRCAsmOperand; +} +def crrc0 : RegisterOperand<CRRC0> { + let ParserMatchClass = PPCRegCRRCAsmOperand; +} + +def PPCU1ImmAsmOperand : AsmOperandClass { + let Name = "U1Imm"; let PredicateMethod = "isU1Imm"; + let RenderMethod = "addImmOperands"; +} +def u1imm : Operand<i32> { + let PrintMethod = "printU1ImmOperand"; + let ParserMatchClass = PPCU1ImmAsmOperand; +} + +def PPCU2ImmAsmOperand : AsmOperandClass { + let Name = "U2Imm"; let PredicateMethod = "isU2Imm"; + let RenderMethod = "addImmOperands"; +} +def u2imm : Operand<i32> { + let PrintMethod = "printU2ImmOperand"; + let ParserMatchClass = PPCU2ImmAsmOperand; +} + +def PPCU3ImmAsmOperand : AsmOperandClass { + let Name = "U3Imm"; let PredicateMethod = "isU3Imm"; + let RenderMethod = "addImmOperands"; +} +def u3imm : Operand<i32> { + let PrintMethod = "printU3ImmOperand"; + let ParserMatchClass = PPCU3ImmAsmOperand; +} + +def PPCU4ImmAsmOperand : AsmOperandClass { + let Name = "U4Imm"; let PredicateMethod = "isU4Imm"; + let RenderMethod = "addImmOperands"; +} +def u4imm : Operand<i32> { + let PrintMethod = "printU4ImmOperand"; + let ParserMatchClass = PPCU4ImmAsmOperand; +} +def PPCS5ImmAsmOperand : AsmOperandClass { + let Name = "S5Imm"; let PredicateMethod = "isS5Imm"; + let RenderMethod = "addImmOperands"; +} +def s5imm : Operand<i32> { + let PrintMethod = "printS5ImmOperand"; + let ParserMatchClass = PPCS5ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<5>"; +} +def PPCU5ImmAsmOperand : AsmOperandClass { + let Name = "U5Imm"; let PredicateMethod = "isU5Imm"; + let RenderMethod = "addImmOperands"; +} +def u5imm : Operand<i32> { + let PrintMethod = "printU5ImmOperand"; + let ParserMatchClass = PPCU5ImmAsmOperand; + let DecoderMethod = 
"decodeUImmOperand<5>"; +} +def PPCU6ImmAsmOperand : AsmOperandClass { + let Name = "U6Imm"; let PredicateMethod = "isU6Imm"; + let RenderMethod = "addImmOperands"; +} +def u6imm : Operand<i32> { + let PrintMethod = "printU6ImmOperand"; + let ParserMatchClass = PPCU6ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<6>"; +} +def PPCU10ImmAsmOperand : AsmOperandClass { + let Name = "U10Imm"; let PredicateMethod = "isU10Imm"; + let RenderMethod = "addImmOperands"; +} +def u10imm : Operand<i32> { + let PrintMethod = "printU10ImmOperand"; + let ParserMatchClass = PPCU10ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<10>"; +} +def PPCU12ImmAsmOperand : AsmOperandClass { + let Name = "U12Imm"; let PredicateMethod = "isU12Imm"; + let RenderMethod = "addImmOperands"; +} +def u12imm : Operand<i32> { + let PrintMethod = "printU12ImmOperand"; + let ParserMatchClass = PPCU12ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<12>"; +} +def PPCS16ImmAsmOperand : AsmOperandClass { + let Name = "S16Imm"; let PredicateMethod = "isS16Imm"; + let RenderMethod = "addS16ImmOperands"; +} +def s16imm : Operand<i32> { + let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCS16ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; +} +def PPCU16ImmAsmOperand : AsmOperandClass { + let Name = "U16Imm"; let PredicateMethod = "isU16Imm"; + let RenderMethod = "addU16ImmOperands"; +} +def u16imm : Operand<i32> { + let PrintMethod = "printU16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCU16ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<16>"; +} +def PPCS17ImmAsmOperand : AsmOperandClass { + let Name = "S17Imm"; let PredicateMethod = "isS17Imm"; + let RenderMethod = "addS16ImmOperands"; +} +def s17imm : Operand<i32> { + // This operand type is used for addis/lis to allow the assembler parser + // to accept immediates in the range -65536..65535 for compatibility with + // the GNU assembler. The operand is treated as 16-bit otherwise. 
+ let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCS17ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; +} +def PPCDirectBrAsmOperand : AsmOperandClass { + let Name = "DirectBr"; let PredicateMethod = "isDirectBr"; + let RenderMethod = "addBranchTargetOperands"; +} +def directbrtarget : Operand<OtherVT> { + let PrintMethod = "printBranchOperand"; + let EncoderMethod = "getDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def absdirectbrtarget : Operand<OtherVT> { + let PrintMethod = "printAbsBranchOperand"; + let EncoderMethod = "getAbsDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def PPCCondBrAsmOperand : AsmOperandClass { + let Name = "CondBr"; let PredicateMethod = "isCondBr"; + let RenderMethod = "addBranchTargetOperands"; +} +def condbrtarget : Operand<OtherVT> { + let PrintMethod = "printBranchOperand"; + let EncoderMethod = "getCondBrEncoding"; + let ParserMatchClass = PPCCondBrAsmOperand; +} +def abscondbrtarget : Operand<OtherVT> { + let PrintMethod = "printAbsBranchOperand"; + let EncoderMethod = "getAbsCondBrEncoding"; + let ParserMatchClass = PPCCondBrAsmOperand; +} +def calltarget : Operand<iPTR> { + let PrintMethod = "printBranchOperand"; + let EncoderMethod = "getDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def abscalltarget : Operand<iPTR> { + let PrintMethod = "printAbsBranchOperand"; + let EncoderMethod = "getAbsDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def PPCCRBitMaskOperand : AsmOperandClass { + let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask"; +} +def crbitm: Operand<i8> { + let PrintMethod = "printcrbitm"; + let EncoderMethod = "get_crbitm_encoding"; + let DecoderMethod = "decodeCRBitMOperand"; + let ParserMatchClass = PPCCRBitMaskOperand; +} +// Address operands +// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode). +def PPCRegGxRCNoR0Operand : AsmOperandClass { + let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber"; +} +def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> { + let ParserMatchClass = PPCRegGxRCNoR0Operand; +} +// A version of ptr_rc usable with the asm parser. 
+def PPCRegGxRCOperand : AsmOperandClass { + let Name = "RegGxRC"; let PredicateMethod = "isRegNumber"; +} +def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> { + let ParserMatchClass = PPCRegGxRCOperand; +} + +def PPCDispRIOperand : AsmOperandClass { + let Name = "DispRI"; let PredicateMethod = "isS16Imm"; + let RenderMethod = "addS16ImmOperands"; +} +def dispRI : Operand<iPTR> { + let ParserMatchClass = PPCDispRIOperand; +} +def PPCDispRIXOperand : AsmOperandClass { + let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4"; + let RenderMethod = "addImmOperands"; +} +def dispRIX : Operand<iPTR> { + let ParserMatchClass = PPCDispRIXOperand; +} +def PPCDispSPE8Operand : AsmOperandClass { + let Name = "DispSPE8"; let PredicateMethod = "isU8ImmX8"; + let RenderMethod = "addImmOperands"; +} +def dispSPE8 : Operand<iPTR> { + let ParserMatchClass = PPCDispSPE8Operand; +} +def PPCDispSPE4Operand : AsmOperandClass { + let Name = "DispSPE4"; let PredicateMethod = "isU7ImmX4"; + let RenderMethod = "addImmOperands"; +} +def dispSPE4 : Operand<iPTR> { + let ParserMatchClass = PPCDispSPE4Operand; +} +def PPCDispSPE2Operand : AsmOperandClass { + let Name = "DispSPE2"; let PredicateMethod = "isU6ImmX2"; + let RenderMethod = "addImmOperands"; +} +def dispSPE2 : Operand<iPTR> { + let ParserMatchClass = PPCDispSPE2Operand; +} + +def memri : Operand<iPTR> { + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg); + let EncoderMethod = "getMemRIEncoding"; + let DecoderMethod = "decodeMemRIOperands"; +} +def memrr : Operand<iPTR> { + let PrintMethod = "printMemRegReg"; + let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg); +} +def memrix : Operand<iPTR> { // memri where the imm is 4-aligned. + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg); + let EncoderMethod = "getMemRIXEncoding"; + let DecoderMethod = "decodeMemRIXOperands"; +} +def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned. + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg); + let EncoderMethod = "getSPE8DisEncoding"; +} +def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned. + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg); + let EncoderMethod = "getSPE4DisEncoding"; +} +def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned. + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg); + let EncoderMethod = "getSPE2DisEncoding"; +} + +// A single-register address. This is used with the SjLj +// pseudo-instructions. +def memr : Operand<iPTR> { + let MIOperandInfo = (ops ptr_rc:$ptrreg); +} +def PPCTLSRegOperand : AsmOperandClass { + let Name = "TLSReg"; let PredicateMethod = "isTLSReg"; + let RenderMethod = "addTLSRegOperands"; +} +def tlsreg32 : Operand<i32> { + let EncoderMethod = "getTLSRegEncoding"; + let ParserMatchClass = PPCTLSRegOperand; +} +def tlsgd32 : Operand<i32> {} +def tlscall32 : Operand<i32> { + let PrintMethod = "printTLSCall"; + let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym); + let EncoderMethod = "getTLSCallEncoding"; +} + +// PowerPC Predicate operand. +def pred : Operand<OtherVT> { + let PrintMethod = "printPredicateOperand"; + let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg); +} + +// Define PowerPC specific addressing mode. 
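+// Each ComplexPattern below names a matcher in PPCISelDAGToDAG.cpp that
+// splits an address into the operand pair a given instruction form takes.
+// Roughly:
+//   iaddr   base + signed 16-bit displacement,  e.g. "lwz 3, 8(4)"  (D-form)
+//   xaddr   base + index register,              e.g. "lwzx 3, 4, 5" (X-form)
+//   xoaddr  register operands only, no folded displacement
+//   ixaddr  base + 4-aligned displacement,      e.g. "ld 3, 8(4)"   (DS-form)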
+def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; +def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; +def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>; +def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" + +// The address in a single register. This is used with the SjLj +// pseudo-instructions. +def addr : ComplexPattern<iPTR, 1, "SelectAddr",[], []>; + +/// This is just the offset part of iaddr, used for preinc. +def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>; + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Predicate Definitions. +def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">; +def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">; +def IsBookE : Predicate<"PPCSubTarget->isBookE()">; +def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">; +def HasOnlyMSYNC : Predicate<"PPCSubTarget->hasOnlyMSYNC()">; +def HasSYNC : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">; +def IsPPC4xx : Predicate<"PPCSubTarget->isPPC4xx()">; +def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">; +def IsE500 : Predicate<"PPCSubTarget->isE500()">; +def HasSPE : Predicate<"PPCSubTarget->HasSPE()">; +def HasICBT : Predicate<"PPCSubTarget->hasICBT()">; +def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">; +def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">; +def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">; +def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">; +def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">; + +//===----------------------------------------------------------------------===// +// PowerPC Multiclass Definitions. + +multiclass XForm_6r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XForm_10<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XForm_10<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_11r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_11<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XForm_11<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +// Multiclass for instructions for which the non record form is not cracked +// and the record form is cracked (i.e. divw, mullw, etc.) +multiclass XOForm_1rcr<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel, PPC970_DGroup_First, + PPC970_DGroup_Cracked; + } +} + +multiclass XOForm_1rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XOForm_3r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XOForm_3rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass MForm_2r<bits<6> opcode, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : MForm_2<opcode, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : MForm_2<opcode, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass MDForm_1r<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : MDForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : MDForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass MDSForm_1r<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : MDSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : MDSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_26<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : XForm_26<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_28<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : XForm_28<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : AForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : AForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass AForm_2r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : AForm_2<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : AForm_2<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : AForm_3<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : AForm_3<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Definitions. 
+ +// Pseudo-instructions: + +let hasCtrlDep = 1 in { +let Defs = [R1], Uses = [R1] in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt", + [(callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", + [(callseq_end timm:$amt1, timm:$amt2)]>; +} + +def UPDATE_VRSAVE : Pseudo<(outs gprc:$rD), (ins gprc:$rS), + "UPDATE_VRSAVE $rD, $rS", []>; +} + +let Defs = [R1], Uses = [R1] in +def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", + [(set i32:$result, + (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", + [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; + +// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after +// instruction selection into a branch sequence. +let usesCustomInserter = 1, // Expanded after instruction selection. + PPC970_Single = 1 in { + // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes + // because either operand might become the first operand in an isel, and + // that operand cannot be r0. + def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond, + gprc_nor0:$T, gprc_nor0:$F, + i32imm:$BROPC), "#SELECT_CC_I4", + []>; + def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond, + g8rc_nox0:$T, g8rc_nox0:$F, + i32imm:$BROPC), "#SELECT_CC_I8", + []>; + def SELECT_CC_F4 : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F, + i32imm:$BROPC), "#SELECT_CC_F4", + []>; + def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F, + i32imm:$BROPC), "#SELECT_CC_F8", + []>; + def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F, + i32imm:$BROPC), "#SELECT_CC_VRRC", + []>; + + // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition + // register bit directly. + def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond, + gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4", + [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>; + def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond, + g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8", + [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>; + def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond, + f4rc:$T, f4rc:$F), "#SELECT_F4", + [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>; + def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond, + f8rc:$T, f8rc:$F), "#SELECT_F8", + [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>; + def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond, + vrrc:$T, vrrc:$F), "#SELECT_VRRC", + [(set v4i32:$dst, + (select i1:$cond, v4i32:$T, v4i32:$F))]>; +} + +// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to +// scavenge a register for it. +let mayStore = 1 in { +def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F), + "#SPILL_CR", []>; +def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F), + "#SPILL_CRBIT", []>; +} + +// RESTORE_CR - Indicate that we're restoring the CR register (previously +// spilled), so we'll need to scavenge a register for it. 
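+// A sketch of the eventual expansion (done during frame-index elimination;
+// the scratch GPR is whatever the register scavenger hands back -- shown
+// here for illustration only):
+//
+//   lwz    rTmp, offset(r1)    ; reload the spilled CR image
+//   mtocrf 0x80, rTmp          ; move the needed field back into CR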
+let mayLoad = 1 in { +def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F), + "#RESTORE_CR", []>; +def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F), + "#RESTORE_CRBIT", []>; +} + +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { + let isReturn = 1, Uses = [LR, RM] in + def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB, + [(retflag)]>, Requires<[In32BitMode]>; + let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in { + def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>; + + let isCodeGenOnly = 1 in { + def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond), + "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB, + []>; + + def BCCTR : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi), + "bcctr 12, $bi, 0", IIC_BrB, []>; + def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi), + "bcctr 4, $bi, 0", IIC_BrB, []>; + } + } +} + +let Defs = [LR] in + def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>, + PPC970_Unit_BRU; +let Defs = [LR] in + def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>, + PPC970_Unit_BRU; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { + let isBarrier = 1 in { + def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst), + "b $dst", IIC_BrB, + [(br bb:$dst)]>; + def BA : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst), + "ba $dst", IIC_BrB, []>; + } + + // BCC represents an arbitrary conditional branch on a predicate. + // FIXME: should be able to write a pattern for PPCcondbranch, but can't use + // a two-value operand where a dag node expects two operands. :( + let isCodeGenOnly = 1 in { + def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), + "b${cond:cc}${cond:pm} ${cond:reg}, $dst" + /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst), + "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">; + + let isReturn = 1, Uses = [LR, RM] in + def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond), + "b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>; + } + + let isCodeGenOnly = 1 in { + let Pattern = [(brcond i1:$bi, bb:$dst)] in + def BC : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst), + "bc 12, $bi, $dst">; + + let Pattern = [(brcond (not i1:$bi), bb:$dst)] in + def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst), + "bc 4, $bi, $dst">; + + let isReturn = 1, Uses = [LR, RM] in + def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi), + "bclr 12, $bi, 0", IIC_BrB, []>; + def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi), + "bclr 4, $bi, 0", IIC_BrB, []>; + } + + let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in { + def BDZLR : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins), + "bdzlr", IIC_BrB, []>; + def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins), + "bdnzlr", IIC_BrB, []>; + def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins), + "bdzlr+", IIC_BrB, []>; + def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins), + "bdnzlr+", IIC_BrB, []>; + def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins), + "bdzlr-", IIC_BrB, []>; + def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins), + "bdnzlr-", IIC_BrB, []>; + } + + let Defs = [CTR], Uses = [CTR] in { + def BDZ : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz $dst">; + def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz $dst">; + def BDZA : BForm_1<16, 18, 1, 0, 
(outs), (ins abscondbrtarget:$dst), + "bdza $dst">; + def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdnza $dst">; + def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz+ $dst">; + def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz+ $dst">; + def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdza+ $dst">; + def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdnza+ $dst">; + def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz- $dst">; + def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz- $dst">; + def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdza- $dst">; + def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdnza- $dst">; + } +} + +// The unconditional BCL used by the SjLj setjmp code. +let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in { + let Defs = [LR], Uses = [RM] in { + def BCLalways : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst), + "bcl 20, 31, $dst">; + } +} + +let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { + // Convenient aliases for call instructions + let Uses = [RM] in { + def BL : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. + def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>; + + let isCodeGenOnly = 1 in { + def BL_TLS : IForm<18, 0, 1, (outs), (ins tlscall32:$func), + "bl $func", IIC_BrB, []>; + def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst), + "b${cond:cc}l${cond:pm} ${cond:reg}, $dst">; + def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst), + "b${cond:cc}la${cond:pm} ${cond:reg}, $dst">; + + def BCL : BForm_4<16, 12, 0, 1, (outs), + (ins crbitrc:$bi, condbrtarget:$dst), + "bcl 12, $bi, $dst">; + def BCLn : BForm_4<16, 4, 0, 1, (outs), + (ins crbitrc:$bi, condbrtarget:$dst), + "bcl 4, $bi, $dst">; + } + } + let Uses = [CTR, RM] in { + def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl)]>, + Requires<[In32BitMode]>; + + let isCodeGenOnly = 1 in { + def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond), + "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB, + []>; + + def BCCTRL : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi), + "bcctrl 12, $bi, 0", IIC_BrB, []>; + def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi), + "bcctrl 4, $bi, 0", IIC_BrB, []>; + } + } + let Uses = [LR, RM] in { + def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins), + "blrl", IIC_BrB, []>; + + let isCodeGenOnly = 1 in { + def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond), + "b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB, + []>; + + def BCLRL : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi), + "bclrl 12, $bi, 0", IIC_BrB, []>; + def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi), + "bclrl 4, $bi, 0", IIC_BrB, []>; + } + } + let Defs = [CTR], Uses = [CTR, RM] in { + def BDZL : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst), + "bdzl $dst">; + def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst), + "bdnzl $dst">; + def BDZLA : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdzla $dst">; + def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdnzla $dst">; + def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst), + "bdzl+ $dst">; + 
def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst), + "bdnzl+ $dst">; + def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdzla+ $dst">; + def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdnzla+ $dst">; + def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst), + "bdzl- $dst">; + def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst), + "bdnzl- $dst">; + def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdzla- $dst">; + def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdnzla- $dst">; + } + let Defs = [CTR], Uses = [CTR, LR, RM] in { + def BDZLRL : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins), + "bdzlrl", IIC_BrB, []>; + def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins), + "bdnzlrl", IIC_BrB, []>; + def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins), + "bdzlrl+", IIC_BrB, []>; + def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins), + "bdnzlrl+", IIC_BrB, []>; + def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins), + "bdzlrl-", IIC_BrB, []>; + def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins), + "bdnzlrl-", IIC_BrB, []>; + } +} + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNdi :Pseudo< (outs), + (ins calltarget:$dst, i32imm:$offset), + "#TC_RETURNd $dst $offset", + []>; + + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), + "#TC_RETURNa $func $offset", + [(PPCtc_return (i32 imm:$func), imm:$offset)]>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset), + "#TC_RETURNr $dst $offset", + []>; + + +let isCodeGenOnly = 1 in { + +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1, + isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in +def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>, Requires<[In32BitMode]>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILB : IForm<18, 0, 0, (outs), (ins calltarget:$dst), + "b $dst", IIC_BrB, + []>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILBA : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst), + "ba $dst", IIC_BrB, + []>; + +} + +let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { + let Defs = [CTR] in + def EH_SjLj_SetJmp32 : Pseudo<(outs gprc:$dst), (ins memr:$buf), + "#EH_SJLJ_SETJMP32", + [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, + Requires<[In32BitMode]>; + let isTerminator = 1 in + def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf), + "#EH_SJLJ_LONGJMP32", + [(PPCeh_sjlj_longjmp addr:$buf)]>, + Requires<[In32BitMode]>; +} + +let isBranch = 1, isTerminator = 1 in { + def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} + +// System call. +let PPC970_Unit = 7 in { + def SC : SCForm<17, 1, (outs), (ins i32imm:$lev), + "sc $lev", IIC_BrB, [(PPCsc (i32 imm:$lev))]>; +} + +// Branch history rolling buffer. 
+def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, + [(PPCclrbhrb)]>, + PPC970_DGroup_Single; +// The $dmy argument used for MFBHRBE is not needed; however, including +// it avoids automatic generation of PPCFastISel::fastEmit_i(), which +// interferes with necessary special handling (see PPCFastISel.cpp). +def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$rD), + (ins u10imm:$imm, u10imm:$dmy), + "mfbhrbe $rD, $imm", IIC_BrB, + [(set i32:$rD, + (PPCmfbhrbe imm:$imm, imm:$dmy))]>, + PPC970_DGroup_First; + +def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$imm), "rfebb $imm", + IIC_BrB, [(PPCrfebb (i32 imm:$imm))]>, + PPC970_DGroup_Single; + +// DCB* instructions. +def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst", + IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBF : DCB_Form<86, 0, (outs), (ins memrr:$dst), "dcbf $dst", + IIC_LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst", + IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst", + IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst", + IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst", + IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>, + PPC970_DGroup_Single; + +let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in { +def DCBT : DCB_Form_hint<278, (outs), (ins u5imm:$TH, memrr:$dst), + "dcbt $dst, $TH", IIC_LdStDCBF, []>, + PPC970_DGroup_Single; +def DCBTST : DCB_Form_hint<246, (outs), (ins u5imm:$TH, memrr:$dst), + "dcbtst $dst, $TH", IIC_LdStDCBF, []>, + PPC970_DGroup_Single; +} // hasSideEffects = 0 + +def ICBT : XForm_icbt<31, 22, (outs), (ins u4imm:$CT, memrr:$src), + "icbt $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>; + +def : Pat<(int_ppc_dcbt xoaddr:$dst), + (DCBT 0, xoaddr:$dst)>; +def : Pat<(int_ppc_dcbtst xoaddr:$dst), + (DCBTST 0, xoaddr:$dst)>; + +def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)), + (DCBT 0, xoaddr:$dst)>; // data prefetch for loads +def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)), + (DCBTST 0, xoaddr:$dst)>; // data prefetch for stores +def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)), + (ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read) + +// Atomic operations +let usesCustomInserter = 1 in { + let Defs = [CR0] in { + def ATOMIC_LOAD_ADD_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8", + [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_SUB_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8", + [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_AND_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8", + [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_OR_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8", + [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_XOR_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8", + [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_NAND_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8", + [(set 
i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_ADD_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16", + [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_SUB_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16", + [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_AND_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16", + [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_OR_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16", + [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_XOR_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16", + [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_NAND_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16", + [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_ADD_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32", + [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_SUB_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32", + [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_AND_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32", + [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_OR_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32", + [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_XOR_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32", + [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_NAND_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32", + [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>; + + def ATOMIC_CMP_SWAP_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8", + [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>; + def ATOMIC_CMP_SWAP_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new", + [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>; + def ATOMIC_CMP_SWAP_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new", + [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>; + + def ATOMIC_SWAP_I8 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8", + [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>; + def ATOMIC_SWAP_I16 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16", + [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>; + def ATOMIC_SWAP_I32 : Pseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32", + [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>; + } +} + +// Instructions to support atomic operations +let mayLoad = 1, hasSideEffects = 0 in { +def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src), + "lbarx $rD, $src", IIC_LdStLWARX, []>, + Requires<[HasPartwordAtomics]>; + +def LHARX : XForm_1<31, 116, (outs 
gprc:$rD), (ins memrr:$src), + "lharx $rD, $src", IIC_LdStLWARX, []>, + Requires<[HasPartwordAtomics]>; + +def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src), + "lwarx $rD, $src", IIC_LdStLWARX, []>; + +// Instructions to support lock versions of atomics +// (EH=1 - see Power ISA 2.07 Book II 4.4.2) +def LBARXL : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src), + "lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT, + Requires<[HasPartwordAtomics]>; + +def LHARXL : XForm_1<31, 116, (outs gprc:$rD), (ins memrr:$src), + "lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT, + Requires<[HasPartwordAtomics]>; + +def LWARXL : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src), + "lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT; +} + +let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in { +def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst), + "stbcx. $rS, $dst", IIC_LdStSTWCX, []>, + isDOT, Requires<[HasPartwordAtomics]>; + +def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst), + "sthcx. $rS, $dst", IIC_LdStSTWCX, []>, + isDOT, Requires<[HasPartwordAtomics]>; + +def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst), + "stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT; +} + +let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in +def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>; + +def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm), + "twi $to, $rA, $imm", IIC_IntTrapW, []>; +def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB), + "tw $to, $rA, $rB", IIC_IntTrapW, []>; +def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm), + "tdi $to, $rA, $imm", IIC_IntTrapD, []>; +def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB), + "td $to, $rA, $rB", IIC_IntTrapD, []>; + +//===----------------------------------------------------------------------===// +// PPC32 Load Instructions. +// + +// Unindexed (r+i) Loads. +let PPC970_Unit = 2 in { +def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src), + "lbz $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi8 iaddr:$src))]>; +def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src), + "lha $rD, $src", IIC_LdStLHA, + [(set i32:$rD, (sextloadi16 iaddr:$src))]>, + PPC970_DGroup_Cracked; +def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src), + "lhz $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi16 iaddr:$src))]>; +def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src), + "lwz $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (load iaddr:$src))]>; + +def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src), + "lfs $rD, $src", IIC_LdStLFD, + [(set f32:$rD, (load iaddr:$src))]>; +def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src), + "lfd $rD, $src", IIC_LdStLFD, + [(set f64:$rD, (load iaddr:$src))]>; + + +// Unindexed (r+i) Loads with Update (preinc). 
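+// For illustration (a sketch with hypothetical registers): an update-form
+// load such as
+//   lwzu r4, 8(r5)
+// loads the word at r5+8 into r4 and then writes the effective address
+// back into r5, which is why each definition below ties $ea_result to the
+// address register via the RegConstraint/NoEncode pair.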
+let mayLoad = 1, hasSideEffects = 0 in { +def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lbzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lhau $rD, $addr", IIC_LdStLHAU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lhzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lwzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lfsu $rD, $addr", IIC_LdStLFDU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lfdu $rD, $addr", IIC_LdStLFDU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + + +// Indexed (r+r) Loads with Update (preinc). +def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", IIC_LdStLHAUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lfsux $rD, $addr", IIC_LdStLFDUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lfdux $rD, $addr", IIC_LdStLFDUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +} +} + +// Indexed (r+r) Loads. 
+// +let PPC970_Unit = 2 in { +def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src), + "lbzx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi8 xaddr:$src))]>; +def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src), + "lhax $rD, $src", IIC_LdStLHA, + [(set i32:$rD, (sextloadi16 xaddr:$src))]>, + PPC970_DGroup_Cracked; +def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src), + "lhzx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi16 xaddr:$src))]>; +def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src), + "lwzx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (load xaddr:$src))]>; + + +def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src), + "lhbrx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>; +def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src), + "lwbrx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>; + +def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src), + "lfsx $frD, $src", IIC_LdStLFD, + [(set f32:$frD, (load xaddr:$src))]>; +def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src), + "lfdx $frD, $src", IIC_LdStLFD, + [(set f64:$frD, (load xaddr:$src))]>; + +def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src), + "lfiwax $frD, $src", IIC_LdStLFD, + [(set f64:$frD, (PPClfiwax xoaddr:$src))]>; +def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src), + "lfiwzx $frD, $src", IIC_LdStLFD, + [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>; +} + +// Load Multiple +def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src), + "lmw $rD, $src", IIC_LdStLMW, []>; + +//===----------------------------------------------------------------------===// +// PPC32 Store Instructions. +// + +// Unindexed (r+i) Stores. +let PPC970_Unit = 2 in { +def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src), + "stb $rS, $src", IIC_LdStStore, + [(truncstorei8 i32:$rS, iaddr:$src)]>; +def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src), + "sth $rS, $src", IIC_LdStStore, + [(truncstorei16 i32:$rS, iaddr:$src)]>; +def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src), + "stw $rS, $src", IIC_LdStStore, + [(store i32:$rS, iaddr:$src)]>; +def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst), + "stfs $rS, $dst", IIC_LdStSTFD, + [(store f32:$rS, iaddr:$dst)]>; +def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst), + "stfd $rS, $dst", IIC_LdStSTFD, + [(store f64:$rS, iaddr:$dst)]>; +} + +// Unindexed (r+i) Stores with Update (preinc). +let PPC970_Unit = 2, mayStore = 1 in { +def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), + "stbu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), + "sthu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), + "stwu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst), + "stfsu $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst), + "stfdu $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +} + +// Patterns to match the pre-inc stores. 
We can't put the patterns on +// the instruction definitions directly as ISel wants the address base +// and offset to be separate operands, not a single complex operand. +def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STBU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STHU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STWU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STFSU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STFDU $rS, iaddroff:$ptroff, $ptrreg)>; + +// Indexed (r+r) Stores. +let PPC970_Unit = 2 in { +def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst), + "stbx $rS, $dst", IIC_LdStStore, + [(truncstorei8 i32:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; +def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst), + "sthx $rS, $dst", IIC_LdStStore, + [(truncstorei16 i32:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; +def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst), + "stwx $rS, $dst", IIC_LdStStore, + [(store i32:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; + +def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst), + "sthbrx $rS, $dst", IIC_LdStStore, + [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>, + PPC970_DGroup_Cracked; +def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst), + "stwbrx $rS, $dst", IIC_LdStStore, + [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>, + PPC970_DGroup_Cracked; + +def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst), + "stfiwx $frS, $dst", IIC_LdStSTFD, + [(PPCstfiwx f64:$frS, xoaddr:$dst)]>; + +def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst), + "stfsx $frS, $dst", IIC_LdStSTFD, + [(store f32:$frS, xaddr:$dst)]>; +def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst), + "stfdx $frS, $dst", IIC_LdStSTFD, + [(store f64:$frS, xaddr:$dst)]>; +} + +// Indexed (r+r) Stores with Update (preinc). +let PPC970_Unit = 2, mayStore = 1 in { +def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), + "stbux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), + "sthux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), + "stwux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst), + "stfsux $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst), + "stfdux $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +} + +// Patterns to match the pre-inc stores. We can't put the patterns on +// the instruction definitions directly as ISel wants the address base +// and offset to be separate operands, not a single complex operand. 
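+// For example (a sketch with hypothetical registers):
+//   stwux r4, r1, r5
+// stores r4 at r1+r5 and then updates r1 to r1+r5; the patterns below
+// rearrange (pre_store $rS, $ptrreg, $ptroff) into
+// (STWUX $rS, $ptrreg, $ptroff) with the base and offset as separate
+// operands.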
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STBUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STHUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STWUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STFSUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STFDUX $rS, $ptrreg, $ptroff)>; + +// Store Multiple +def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst), + "stmw $rS, $dst", IIC_LdStLMW, []>; + +def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L), + "sync $L", IIC_LdStSync, []>; + +let isCodeGenOnly = 1 in { + def MSYNC : XForm_24_sync<31, 598, (outs), (ins), + "msync", IIC_LdStSync, []> { + let L = 0; + } +} + +def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[HasSYNC]>; +def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>; +def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[HasOnlyMSYNC]>; +def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>; + +//===----------------------------------------------------------------------===// +// PPC32 Arithmetic Instructions. +// + +let PPC970_Unit = 1 in { // FXU Operations. +def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm), + "addi $rD, $rA, $imm", IIC_IntSimple, + [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>; +let BaseName = "addic" in { +let Defs = [CARRY] in +def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "addic $rD, $rA, $imm", IIC_IntGeneral, + [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>, + RecFormRel, PPC970_DGroup_Cracked; +let Defs = [CARRY, CR0] in +def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "addic. $rD, $rA, $imm", IIC_IntGeneral, + []>, isDOT, RecFormRel; +} +def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm), + "addis $rD, $rA, $imm", IIC_IntSimple, + [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>; +let isCodeGenOnly = 1 in +def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym), + "la $rD, $sym($rA)", IIC_IntGeneral, + [(set i32:$rD, (add i32:$rA, + (PPClo tglobaladdr:$sym, 0)))]>; +def MULLI : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "mulli $rD, $rA, $imm", IIC_IntMulLI, + [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>; +let Defs = [CARRY] in +def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "subfic $rD, $rA, $imm", IIC_IntGeneral, + [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>; + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { + def LI : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm), + "li $rD, $imm", IIC_IntSimple, + [(set i32:$rD, imm32SExt16:$imm)]>; + def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm), + "lis $rD, $imm", IIC_IntSimple, + [(set i32:$rD, imm16ShiftedSExt:$imm)]>; +} +} + +let PPC970_Unit = 1 in { // FXU Operations. +let Defs = [CR0] in { +def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "andi. $dst, $src1, $src2", IIC_IntGeneral, + [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>, + isDOT; +def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "andis. 
$dst, $src1, $src2", IIC_IntGeneral, + [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>, + isDOT; +} +def ORI : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "ori $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>; +def ORIS : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "oris $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>; +def XORI : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "xori $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>; +def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "xoris $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>; + +def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple, + []>; +let isCodeGenOnly = 1 in { +// The POWER6 and POWER7 have special group-terminating nops. +def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins), + "ori 1, 1, 0", IIC_IntSimple, []>; +def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins), + "ori 2, 2, 0", IIC_IntSimple, []>; +} + +let isCompare = 1, hasSideEffects = 0 in { + def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm), + "cmpwi $crD, $rA, $imm", IIC_IntCompare>; + def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2), + "cmplwi $dst, $src1, $src2", IIC_IntCompare>; +} +} + +let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations. +let isCommutable = 1 in { +defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "nand", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>; +defm AND : XForm_6r<31, 28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "and", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (and i32:$rS, i32:$rB))]>; +} // isCommutable +defm ANDC : XForm_6r<31, 60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "andc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>; +let isCommutable = 1 in { +defm OR : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "or", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (or i32:$rS, i32:$rB))]>; +defm NOR : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "nor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>; +} // isCommutable +defm ORC : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "orc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>; +let isCommutable = 1 in { +defm EQV : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "eqv", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>; +defm XOR : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "xor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (xor i32:$rS, i32:$rB))]>; +} // isCommutable +defm SLW : XForm_6r<31, 24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "slw", "$rA, $rS, $rB", IIC_IntGeneral, + [(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>; +defm SRW : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "srw", "$rA, $rS, $rB", IIC_IntGeneral, + [(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>; +defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "sraw", "$rA, $rS, $rB", IIC_IntShift, + [(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>; +} + +let PPC970_Unit = 1 in { // FXU Operations. 
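+// Note (illustrative, hypothetical registers): sraw above and srawi below
+// also set CA when the source is negative and any 1 bits are shifted out,
+// hence the carry-recording XForm_6rc/XForm_10rc forms.  A common
+// sign-replication idiom:
+//   srawi r4, r3, 31   ; r4 = 0 if r3 >= 0, else -1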
+let hasSideEffects = 0 in { +defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH), + "srawi", "$rA, $rS, $SH", IIC_IntShift, + [(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>; +defm CNTLZW : XForm_11r<31, 26, (outs gprc:$rA), (ins gprc:$rS), + "cntlzw", "$rA, $rS", IIC_IntGeneral, + [(set i32:$rA, (ctlz i32:$rS))]>; +defm EXTSB : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS), + "extsb", "$rA, $rS", IIC_IntSimple, + [(set i32:$rA, (sext_inreg i32:$rS, i8))]>; +defm EXTSH : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS), + "extsh", "$rA, $rS", IIC_IntSimple, + [(set i32:$rA, (sext_inreg i32:$rS, i16))]>; + +let isCommutable = 1 in +def CMPB : XForm_6<31, 508, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "cmpb $rA, $rS, $rB", IIC_IntGeneral, + [(set i32:$rA, (PPCcmpb i32:$rS, i32:$rB))]>; +} +let isCompare = 1, hasSideEffects = 0 in { + def CMPW : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB), + "cmpw $crD, $rA, $rB", IIC_IntCompare>; + def CMPLW : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB), + "cmplw $crD, $rA, $rB", IIC_IntCompare>; +} +} +let PPC970_Unit = 3 in { // FPU Operations. +//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB), +// "fcmpo $crD, $fA, $fB", IIC_FPCompare>; +let isCompare = 1, hasSideEffects = 0 in { + def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB), + "fcmpu $crD, $fA, $fB", IIC_FPCompare>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), + "fcmpu $crD, $fA, $fB", IIC_FPCompare>; +} + +let Uses = [RM] in { + let hasSideEffects = 0 in { + defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiw", "$frD, $frB", IIC_FPGeneral, + []>; + defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctiwz f64:$frB))]>; + + defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB), + "frsp", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (fround f64:$frB))]>; + + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB), + "frin", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (frnd f64:$frB))]>; + defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB), + "frin", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (frnd f32:$frB))]>; + } + + let hasSideEffects = 0 in { + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB), + "frip", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (fceil f64:$frB))]>; + defm FRIPS : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB), + "frip", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (fceil f32:$frB))]>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIZD : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB), + "friz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (ftrunc f64:$frB))]>; + defm FRIZS : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB), + "friz", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (ftrunc f32:$frB))]>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIMD : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB), + "frim", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (ffloor f64:$frB))]>; + defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB), + "frim", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (ffloor f32:$frB))]>; + + defm FSQRT : XForm_26r<63, 22, 
(outs f8rc:$frD), (ins f8rc:$frB),
+                        "fsqrt", "$frD, $frB", IIC_FPSqrtD,
+                        [(set f64:$frD, (fsqrt f64:$frB))]>;
+  defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "fsqrts", "$frD, $frB", IIC_FPSqrtS,
+                          [(set f32:$frD, (fsqrt f32:$frB))]>;
+  }
+  }
+}
+
+/// Note that FMR is defined as a pseudo-op on the PPC970 because such copies
+/// are often coalesced away, and we don't want the dispatch group builder to
+/// think that they will fill slots (which could cause a load that suffered
+/// an LSU reject to sneak into a d-group with a store).
+let hasSideEffects = 0 in
+defm FMR  : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
+                      "fmr", "$frD, $frB", IIC_FPGeneral,
+                      []>,           // (set f32:$frD, f32:$frB)
+            PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3, hasSideEffects = 0 in {  // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+defm FABSS  : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fabs f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FABSD  : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fabs f64:$frB))]>;
+defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fnabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fneg (fabs f32:$frB)))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fnabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fneg (fabs f64:$frB)))]>;
+defm FNEGS  : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fneg", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fneg f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNEGD  : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fneg", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fneg f64:$frB))]>;
+
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+                         "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+                         [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+                         "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+                         [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
+// Reciprocal estimates.
+defm FRE      : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fre", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfre f64:$frB))]>;
+defm FRES     : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "fres", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (PPCfre f32:$frB))]>;
+defm FRSQRTE  : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frsqrte", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frsqrtes", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
+}
+
+// XL-Form instructions.  Condition register logical ops.
+//
+let hasSideEffects = 0 in
+def MCRF  : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
+                     "mcrf $BF, $BFA", IIC_BrMCR>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// FIXME: According to the ISA (section 2.5.1 of version 2.06), the
+// condition-register logical instructions have preferred forms. Specifically,
+// it is preferred that the bit specified by the BT field be in the same
+// condition register as that specified by the bit BB.
We might want to account +// for this via hinting the register allocator and anti-dep breakers, or we +// could constrain the register class to force this constraint and then loosen +// it during register allocation via convertToThreeAddress or some similar +// mechanism. + +let isCommutable = 1 in { +def CRAND : XLForm_1<19, 257, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crand $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>; + +def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crnand $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>; + +def CROR : XLForm_1<19, 449, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "cror $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>; + +def CRXOR : XLForm_1<19, 193, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crxor $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>; + +def CRNOR : XLForm_1<19, 33, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crnor $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>; + +def CREQV : XLForm_1<19, 289, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "creqv $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>; +} // isCommutable + +def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crandc $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>; + +def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crorc $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>; + +let isCodeGenOnly = 1 in { +def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins), + "creqv $dst, $dst, $dst", IIC_BrCR, + [(set i1:$dst, 1)]>; + +def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins), + "crxor $dst, $dst, $dst", IIC_BrCR, + [(set i1:$dst, 0)]>; + +let Defs = [CR1EQ], CRD = 6 in { +def CR6SET : XLForm_1_ext<19, 289, (outs), (ins), + "creqv 6, 6, 6", IIC_BrCR, + [(PPCcr6set)]>; + +def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins), + "crxor 6, 6, 6", IIC_BrCR, + [(PPCcr6unset)]>; +} +} + +// XFX-Form instructions. Instructions that deal with SPRs. +// + +def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR), + "mfspr $RT, $SPR", IIC_SprMFSPR>; +def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT), + "mtspr $SPR, $RT", IIC_SprMTSPR>; + +def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR), + "mftb $RT, $SPR", IIC_SprMFTB>; + +// A pseudo-instruction used to implement the read of the 64-bit cycle counter +// on a 32-bit target. 
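+// The custom inserter is expected to emit the usual time-base retry loop
+// (a sketch only, with hypothetical registers):
+//   1: mftbu r5        ; read the upper half
+//      mftb  r4        ; read the lower half
+//      mftbu r6        ; re-read the upper half
+//      cmpw  r6, r5    ; if the upper half changed, the lower half wrapped:
+//      bne   1b        ; retry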
+let hasSideEffects = 1, usesCustomInserter = 1 in +def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins), + "#ReadTB", []>; + +let Uses = [CTR] in { +def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins), + "mfctr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in { +def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in { +let Pattern = [(int_ppc_mtctr i32:$rS)] in +def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +let Defs = [LR] in { +def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS), + "mtlr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Uses = [LR] in { +def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins), + "mflr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +let isCodeGenOnly = 1 in { + // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed + // like a GPR on the PPC970. As such, copies in and out have the same + // performance characteristics as an OR instruction. + def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS), + "mtspr 256, $rS", IIC_IntGeneral>, + PPC970_DGroup_Single, PPC970_Unit_FXU; + def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins), + "mfspr $rT, 256", IIC_IntGeneral>, + PPC970_DGroup_First, PPC970_Unit_FXU; + + def MTVRSAVEv : XFXForm_7_ext<31, 467, 256, + (outs VRSAVERC:$reg), (ins gprc:$rS), + "mtspr 256, $rS", IIC_IntGeneral>, + PPC970_DGroup_Single, PPC970_Unit_FXU; + def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), + (ins VRSAVERC:$reg), + "mfspr $rT, 256", IIC_IntGeneral>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register, +// so we'll need to scavenge a register for it. +let mayStore = 1 in +def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F), + "#SPILL_VRSAVE", []>; + +// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously +// spilled), so we'll need to scavenge a register for it. +let mayLoad = 1 in +def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), + "#RESTORE_VRSAVE", []>; + +let hasSideEffects = 0 in { +def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST), + "mtocrf $FXM, $ST", IIC_BrMCRX>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS), + "mtcrf $FXM, $rS", IIC_BrMCRX>, + PPC970_MicroCode, PPC970_Unit_CRU; + +let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM), + "mfocrf $rT, $FXM", IIC_SprMFCRF>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins), + "mfcr $rT", IIC_SprMFCR>, + PPC970_MicroCode, PPC970_Unit_CRU; +} // hasSideEffects = 0 + +// Pseudo instruction to perform FADD in round-to-zero mode. +let usesCustomInserter = 1, Uses = [RM] in { + def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "", + [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>; +} + +// The above pseudo gets expanded to make use of the following instructions +// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level. 
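+// Roughly (a sketch, not the literal expansion): save FPSCR with mffs,
+// force round-toward-zero with mtfsb1 31 / mtfsb0 30, perform the fadd,
+// then restore the original rounding mode with mtfsf.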
+let Uses = [RM], Defs = [RM] in { + def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM), + "mtfsb0 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM), + "mtfsb1 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + let isCodeGenOnly = 1 in + def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT), + "mtfsf $FM, $rT", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; +} +let Uses = [RM] in { + def MFFS : XForm_42<63, 583, (outs f8rc:$rT), (ins), + "mffs $rT", IIC_IntMFFS, + [(set f64:$rT, (PPCmffs))]>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + let Defs = [CR1] in + def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins), + "mffs. $rT", IIC_IntMFFS, []>, isDOT; +} + + +let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations. +// XO-Form instructions. Arithmetic instructions that can set overflow bit +let isCommutable = 1 in +defm ADD4 : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "add", "$rT, $rA, $rB", IIC_IntSimple, + [(set i32:$rT, (add i32:$rA, i32:$rB))]>; +let isCodeGenOnly = 1 in +def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB), + "add $rT, $rA, $rB", IIC_IntSimple, + [(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>; +let isCommutable = 1 in +defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "addc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (addc i32:$rA, i32:$rB))]>, + PPC970_DGroup_Cracked; + +defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divw", "$rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>; +defm DIVWU : XOForm_1rcr<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divwu", "$rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (udiv i32:$rA, i32:$rB))]>; +def DIVWE : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divwe $rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (int_ppc_divwe gprc:$rA, gprc:$rB))]>, + Requires<[HasExtDiv]>; +let Defs = [CR0] in +def DIVWEo : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divwe. $rT, $rA, $rB", IIC_IntDivW, + []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First, + Requires<[HasExtDiv]>; +def DIVWEU : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divweu $rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (int_ppc_divweu gprc:$rA, gprc:$rB))]>, + Requires<[HasExtDiv]>; +let Defs = [CR0] in +def DIVWEUo : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divweu. 
$rT, $rA, $rB", IIC_IntDivW, + []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First, + Requires<[HasExtDiv]>; +let isCommutable = 1 in { +defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "mulhw", "$rT, $rA, $rB", IIC_IntMulHW, + [(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>; +defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU, + [(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>; +defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "mullw", "$rT, $rA, $rB", IIC_IntMulHW, + [(set i32:$rT, (mul i32:$rA, i32:$rB))]>; +} // isCommutable +defm SUBF : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "subf", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (sub i32:$rB, i32:$rA))]>; +defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "subfc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (subc i32:$rB, i32:$rA))]>, + PPC970_DGroup_Cracked; +defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA), + "neg", "$rT, $rA", IIC_IntSimple, + [(set i32:$rT, (ineg i32:$rA))]>; +let Uses = [CARRY] in { +let isCommutable = 1 in +defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "adde", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (adde i32:$rA, i32:$rB))]>; +defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA), + "addme", "$rT, $rA", IIC_IntGeneral, + [(set i32:$rT, (adde i32:$rA, -1))]>; +defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA), + "addze", "$rT, $rA", IIC_IntGeneral, + [(set i32:$rT, (adde i32:$rA, 0))]>; +defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "subfe", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (sube i32:$rB, i32:$rA))]>; +defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA), + "subfme", "$rT, $rA", IIC_IntGeneral, + [(set i32:$rT, (sube -1, i32:$rA))]>; +defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA), + "subfze", "$rT, $rA", IIC_IntGeneral, + [(set i32:$rT, (sube 0, i32:$rA))]>; +} +} + +// A-Form instructions. Most of the instructions executed in the FPU are of +// this type. +// +let PPC970_Unit = 3, hasSideEffects = 0 in { // FPU Operations. 
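+// For reference: fmadd computes FRT = (FRA * FRC) + FRB with a single
+// rounding; the fmsub/fnmadd/fnmsub variants below are expressed by
+// negating FRB and/or the final result inside the matched (fma ...) pattern.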
+let Uses = [RM] in {
+let isCommutable = 1 in {
+  defm FMADD : AForm_1r<63, 29,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
+  defm FMADDS : AForm_1r<59, 29,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
+  defm FMSUB : AForm_1r<63, 28,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT,
+                          (fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
+  defm FMSUBS : AForm_1r<59, 28,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT,
+                          (fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
+  defm FNMADD : AForm_1r<63, 31,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT,
+                          (fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
+  defm FNMADDS : AForm_1r<59, 31,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT,
+                          (fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
+  defm FNMSUB : AForm_1r<63, 30,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
+                                                 (fneg f64:$FRB))))]>;
+  defm FNMSUBS : AForm_1r<59, 30,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
+                                                 (fneg f32:$FRB))))]>;
+} // isCommutable
+}
+// FSEL is artificially split into 4 and 8-byte forms for the result and
+// operand types. To avoid having four variants, force the comparison input
+// to always be an 8-byte double (code should use an FMRSD if the input
+// comparison value really wants to be a float).
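+// For example (a sketch): fsel FRT, FRA, FRC, FRB yields FRC when
+// FRA >= 0.0 and FRB otherwise (NaNs select FRB), which is what the
+// (PPCfsel f64:$FRA, ...) patterns below encode.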
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm FSELD : AForm_1r<63, 23, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB), + "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral, + [(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>; +defm FSELS : AForm_1r<63, 23, + (outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB), + "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral, + [(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>; +let Uses = [RM] in { + let isCommutable = 1 in { + defm FADD : AForm_2r<63, 21, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), + "fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub, + [(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>; + defm FADDS : AForm_2r<59, 21, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB), + "fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral, + [(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>; + } // isCommutable + defm FDIV : AForm_2r<63, 18, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), + "fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD, + [(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>; + defm FDIVS : AForm_2r<59, 18, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB), + "fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS, + [(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>; + let isCommutable = 1 in { + defm FMUL : AForm_3r<63, 25, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC), + "fmul", "$FRT, $FRA, $FRC", IIC_FPFused, + [(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>; + defm FMULS : AForm_3r<59, 25, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC), + "fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral, + [(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>; + } // isCommutable + defm FSUB : AForm_2r<63, 20, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), + "fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub, + [(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>; + defm FSUBS : AForm_2r<59, 20, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB), + "fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral, + [(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>; + } +} + +let hasSideEffects = 0 in { +let PPC970_Unit = 1 in { // FXU Operations. + let isSelect = 1 in + def ISEL : AForm_4<31, 15, + (outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond), + "isel $rT, $rA, $rB, $cond", IIC_IntISEL, + []>; +} + +let PPC970_Unit = 1 in { // FXU Operations. +// M-Form instructions. rotate and mask instructions. +// +let isCommutable = 1 in { +// RLWIMI can be commuted if the rotate amount is zero. +defm RLWIMI : MForm_2r<20, (outs gprc:$rA), + (ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB, + u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME", + IIC_IntRotate, []>, PPC970_DGroup_Cracked, + RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; +} +let BaseName = "rlwinm" in { +def RLWINM : MForm_2<21, + (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, + []>, RecFormRel; +let Defs = [CR0] in +def RLWINMo : MForm_2<21, + (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, + []>, isDOT, RecFormRel, PPC970_DGroup_Cracked; +} +defm RLWNM : MForm_2r<23, (outs gprc:$rA), + (ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME), + "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral, + []>; +} +} // hasSideEffects = 0 + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Patterns +// + +// Arbitrary immediate support. Implement in terms of LIS/ORI. 
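+// e.g. materializing the hypothetical constant 0x12345678:
+//   lis r3, 0x1234      ; r3 = 0x12340000
+//   ori r3, r3, 0x5678  ; r3 = 0x12345678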
+def : Pat<(i32 imm:$imm), + (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>; + +// Implement the 'not' operation with the NOR instruction. +def i32not : OutPatFrag<(ops node:$in), + (NOR $in, $in)>; +def : Pat<(not i32:$in), + (i32not $in)>; + +// ADD an arbitrary immediate. +def : Pat<(add i32:$in, imm:$imm), + (ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>; +// OR an arbitrary immediate. +def : Pat<(or i32:$in, imm:$imm), + (ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>; +// XOR an arbitrary immediate. +def : Pat<(xor i32:$in, imm:$imm), + (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>; +// SUBFIC +def : Pat<(sub imm32SExt16:$imm, i32:$in), + (SUBFIC $in, imm:$imm)>; + +// SHL/SRL +def : Pat<(shl i32:$in, (i32 imm:$imm)), + (RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>; +def : Pat<(srl i32:$in, (i32 imm:$imm)), + (RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>; + +// ROTL +def : Pat<(rotl i32:$in, i32:$sh), + (RLWNM $in, $sh, 0, 31)>; +def : Pat<(rotl i32:$in, (i32 imm:$imm)), + (RLWINM $in, imm:$imm, 0, 31)>; + +// RLWNM +def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm), + (RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>; + +// Calls +def : Pat<(PPCcall (i32 tglobaladdr:$dst)), + (BL tglobaladdr:$dst)>; +def : Pat<(PPCcall (i32 texternalsym:$dst)), + (BL texternalsym:$dst)>; + +def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), + (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm), + (TCRETURNdi texternalsym:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm), + (TCRETURNri CTRRC:$dst, imm:$imm)>; + + + +// Hi and Lo for Darwin Global Addresses. +def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>; +def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>; +def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>; +def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>; +def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>; +def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>; +def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>; +def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in), + (ADDIS $in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in), + (ADDI $in, tglobaltlsaddr:$g)>; +def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)), + (ADDIS $in, tglobaladdr:$g)>; +def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)), + (ADDIS $in, tconstpool:$g)>; +def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)), + (ADDIS $in, tjumptable:$g)>; +def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)), + (ADDIS $in, tblockaddress:$g)>; + +// Support for thread-local storage. +def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT", + [(set i32:$rD, (PPCppc32GOT))]>; + +// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode. +// This uses two output registers, the first as the real output, the second as a +// temporary register, used internally in code generation. 
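+// A sketch of the expected sequence (hypothetical registers; the real code
+// is produced late, which is why $rT is NoEncode'd below):
+//   bcl 20, 31, 1f    ; branch-and-link to obtain the current address
+// 1: mflr r30          ; temporary now holds the address of label 1
+//    ... add the offset of _GLOBAL_OFFSET_TABLE_ from label 1 ...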
+def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
+                        []>, NoEncode<"$rT">;
+
+def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
+                          "#LDgotTprelL32",
+                          [(set i32:$rD,
+                            (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
+def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
+          (ADD4TLS $in, tglobaltlsaddr:$g)>;
+
+def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                          "#ADDItlsgdL32",
+                          [(set i32:$rD,
+                            (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers.  R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                          "GETtlsADDR32",
+                          [(set i32:$rD,
+                            (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsgdL32 and GETtlsADDR32, late expanded.  R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD),
+                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+                              "#ADDItlsgdLADDR32",
+                              [(set i32:$rD,
+                                (PPCaddiTlsgdLAddr i32:$reg,
+                                                   tglobaltlsaddr:$disp,
+                                                   tglobaltlsaddr:$sym))]>;
+def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                          "#ADDItlsldL32",
+                          [(set i32:$rD,
+                            (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers.  R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                            "GETtlsldADDR32",
+                            [(set i32:$rD,
+                              (PPCgetTlsldAddr i32:$reg,
+                                               tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsldL32 and GETtlsldADDR32, late expanded.  R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD),
+                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+                              "#ADDItlsldLADDR32",
+                              [(set i32:$rD,
+                                (PPCaddiTlsldLAddr i32:$reg,
+                                                   tglobaltlsaddr:$disp,
+                                                   tglobaltlsaddr:$sym))]>;
+def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                           "#ADDIdtprelL32",
+                           [(set i32:$rD,
+                             (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>;
+def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD),
+                             (ins gprc_nor0:$reg, s16imm:$disp),
+                             "#ADDISdtprelHA32",
+                             [(set i32:$rD,
+                               (PPCaddisDtprelHA i32:$reg,
+                                                 tglobaltlsaddr:$disp))]>;
+
+// Support for Position-independent code
+def LWZtoc : Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
+                    "#LWZtoc",
+                    [(set i32:$rD,
+                      (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+// Get Global (GOT) Base Register offset, from the word immediately preceding
+// the function label.
+def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
+
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
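+// The variable-shift instructions take a 6-bit amount in rB, so a shift by
+// 32-63 yields 0 for slw/srw (sraw instead replicates the sign bit), while
+// the rlwinm-based immediate shifts can only express amounts 0-31.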
+def : Pat<(sra i32:$rS, i32:$rB), + (SRAW $rS, $rB)>; +def : Pat<(srl i32:$rS, i32:$rB), + (SRW $rS, $rB)>; +def : Pat<(shl i32:$rS, i32:$rB), + (SLW $rS, $rB)>; + +def : Pat<(zextloadi1 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(zextloadi1 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi1 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(extloadi1 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi8 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(extloadi8 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi16 iaddr:$src), + (LHZ iaddr:$src)>; +def : Pat<(extloadi16 xaddr:$src), + (LHZX xaddr:$src)>; +def : Pat<(f64 (extloadf32 iaddr:$src)), + (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>; +def : Pat<(f64 (extloadf32 xaddr:$src)), + (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>; + +def : Pat<(f64 (fextend f32:$src)), + (COPY_TO_REGCLASS $src, F8RC)>; + +// Only seq_cst fences require the heavyweight sync (SYNC 0). +// All others can use the lightweight sync (SYNC 1). +// source: http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html +// The rule for seq_cst is duplicated to work with both 64 bits and 32 bits +// versions of Power. +def : Pat<(atomic_fence (i64 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>; +def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>; +def : Pat<(atomic_fence (imm), (imm)), (SYNC 1)>, Requires<[HasSYNC]>; +def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>; + +// Additional FNMSUB patterns: -a*c + b == -(a*c - b) +def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), + (FNMSUB $A, $C, $B)>; +def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), + (FNMSUB $A, $C, $B)>; +def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B), + (FNMSUBS $A, $C, $B)>; +def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B), + (FNMSUBS $A, $C, $B)>; + +// FCOPYSIGN's operand types need not agree. +def : Pat<(fcopysign f64:$frB, f32:$frA), + (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>; +def : Pat<(fcopysign f32:$frB, f64:$frA), + (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>; + +include "PPCInstrAltivec.td" +include "PPCInstrSPE.td" +include "PPCInstr64Bit.td" +include "PPCInstrVSX.td" +include "PPCInstrQPX.td" +include "PPCInstrHTM.td" + +def crnot : OutPatFrag<(ops node:$in), + (CRNOR $in, $in)>; +def : Pat<(not i1:$in), + (crnot $in)>; + +// Patterns for arithmetic i1 operations. +def : Pat<(add i1:$a, i1:$b), + (CRXOR $a, $b)>; +def : Pat<(sub i1:$a, i1:$b), + (CRXOR $a, $b)>; +def : Pat<(mul i1:$a, i1:$b), + (CRAND $a, $b)>; + +// We're sometimes asked to materialize i1 -1, which is just 1 in this case +// (-1 is used to mean all bits set). +def : Pat<(i1 -1), (CRSET)>; + +// i1 extensions, implemented in terms of isel. +def : Pat<(i32 (zext i1:$in)), + (SELECT_I4 $in, (LI 1), (LI 0))>; +def : Pat<(i32 (sext i1:$in)), + (SELECT_I4 $in, (LI -1), (LI 0))>; + +def : Pat<(i64 (zext i1:$in)), + (SELECT_I8 $in, (LI8 1), (LI8 0))>; +def : Pat<(i64 (sext i1:$in)), + (SELECT_I8 $in, (LI8 -1), (LI8 0))>; + +// FIXME: We should choose either a zext or a sext based on other constants +// already around. +def : Pat<(i32 (anyext i1:$in)), + (SELECT_I4 $in, (LI 1), (LI 0))>; +def : Pat<(i64 (anyext i1:$in)), + (SELECT_I8 $in, (LI8 1), (LI8 0))>; + +// match setcc on i1 variables. 
+// CRANDC is: +// 1 1 : F +// 1 0 : T +// 0 1 : F +// 0 0 : F +// +// LT is: +// -1 -1 : F +// -1 0 : T +// 0 -1 : F +// 0 0 : F +// +// ULT is: +// 1 1 : F +// 1 0 : F +// 0 1 : T +// 0 0 : F +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)), + (CRANDC $s1, $s2)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)), + (CRANDC $s2, $s1)>; +// CRORC is: +// 1 1 : T +// 1 0 : T +// 0 1 : F +// 0 0 : T +// +// LE is: +// -1 -1 : T +// -1 0 : T +// 0 -1 : F +// 0 0 : T +// +// ULE is: +// 1 1 : T +// 1 0 : F +// 0 1 : T +// 0 0 : T +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)), + (CRORC $s1, $s2)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)), + (CRORC $s2, $s1)>; + +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)), + (CREQV $s1, $s2)>; + +// GE is: +// -1 -1 : T +// -1 0 : F +// 0 -1 : T +// 0 0 : T +// +// UGE is: +// 1 1 : T +// 1 0 : T +// 0 1 : F +// 0 0 : T +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)), + (CRORC $s2, $s1)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)), + (CRORC $s1, $s2)>; + +// GT is: +// -1 -1 : F +// -1 0 : F +// 0 -1 : T +// 0 0 : F +// +// UGT is: +// 1 1 : F +// 1 0 : T +// 0 1 : F +// 0 0 : F +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)), + (CRANDC $s2, $s1)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)), + (CRANDC $s1, $s2)>; + +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)), + (CRXOR $s1, $s2)>; + +// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE, +// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for +// floating-point types. + +multiclass CRNotPat<dag pattern, dag result> { + def : Pat<pattern, (crnot result)>; + def : Pat<(not pattern), result>; + + // We can also fold the crnot into an extension: + def : Pat<(i32 (zext pattern)), + (SELECT_I4 result, (LI 0), (LI 1))>; + def : Pat<(i32 (sext pattern)), + (SELECT_I4 result, (LI 0), (LI -1))>; + + // We can also fold the crnot into an extension: + def : Pat<(i64 (zext pattern)), + (SELECT_I8 result, (LI8 0), (LI8 1))>; + def : Pat<(i64 (sext pattern)), + (SELECT_I8 result, (LI8 0), (LI8 -1))>; + + // FIXME: We should choose either a zext or a sext based on other constants + // already around. + def : Pat<(i32 (anyext pattern)), + (SELECT_I4 result, (LI 0), (LI 1))>; + + def : Pat<(i64 (anyext pattern)), + (SELECT_I8 result, (LI8 0), (LI8 1))>; +} + +// FIXME: Because of what seems like a bug in TableGen's type-inference code, +// we need to write imm:$imm in the output patterns below, not just $imm, or +// else the resulting matcher will not correctly add the immediate operand +// (making it a register operand instead). + +// extended SETCC. 
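+// e.g. the SETEQ entry below computes (x == 0), as a sketch:
+//   cntlzw r3, r3              ; 32 iff r3 was zero, otherwise <= 31
+//   rlwinm r3, r3, 27, 31, 31  ; rotate left 27 == shift right 5, keep bit 31
+// leaving 1 when x == 0 and 0 otherwise.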
+multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag, + OutPatFrag rfrag, OutPatFrag rfrag8> { + def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))), + (rfrag $s1)>; + def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))), + (rfrag8 $s1)>; + def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>; + def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))), + (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>; + + def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))), + (rfrag $s1)>; + def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))), + (rfrag8 $s1)>; + def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>; + def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))), + (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>; +} + +// Note that we do all inversions below with i(32|64)not, instead of using +// (xori x, 1) because on the A2 nor has single-cycle latency while xori +// has 2-cycle latency. + +defm : ExtSetCCPat<SETEQ, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (CNTLZW $in), 27, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (CNTLZD $in), 58, 63)> >; + +defm : ExtSetCCPat<SETNE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (i64not (CNTLZD $in)), 58, 63)> >; + +defm : ExtSetCCPat<SETLT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM $in, 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL $in, 1, 63)> >; + +defm : ExtSetCCPat<SETGE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (i32not $in), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (i64not $in), 1, 63)> >; + +defm : ExtSetCCPat<SETGT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >; + +defm : ExtSetCCPat<SETLE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >; + +defm : ExtSetCCPat<SETLT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >; + +defm : ExtSetCCPat<SETGE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >; + +defm : ExtSetCCPat<SETGT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (i32not $in), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (i64not $in), 1, 63)> >; + +defm : ExtSetCCPat<SETLE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM $in, 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL $in, 1, 63)> >; + +// SETCC for i32. 
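+// e.g. (setcc i32:$s1, 100, SETLT) becomes, roughly (hypothetical register):
+//   cmpwi cr0, r3, 100   ; compare r3 with 100 into cr0
+// followed by a copy of cr0's LT bit, which is what the sub_lt
+// EXTRACT_SUBREGs below select.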
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+// For equality comparisons against constants too wide for a 16-bit
+// immediate, the default code would materialize the constant and then
+// compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpw cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmplwi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+// SETCC for i64.
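+// These mirror the i32 patterns above using the doubleword compares.  Note
+// that the xoris trick is restricted to imm64ZExt32 constants: the constant
+// must be expressible as (HI16 << 16) | LO16 with a zero high doubleword
+// half, since XORIS8 only flips bits 16-31 and CMPLDI compares all 64 bits.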
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+// For equality comparisons against constants too wide for a 16-bit
+// immediate, the default code would materialize the constant and then
+// compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpd cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmpldi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+// SETCC for f32.
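+// fcmpu sets exactly one of the LT, GT, EQ and UN bits, so the ordered and
+// "don't care" predicates read the same bit directly (e.g. both SETOLT and
+// SETLT use sub_lt), while predicates whose complement is a single bit
+// (SETUGE = !SETOLT, SETO = !SETUO, ...) go through CRNotPat.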
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; + +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; + +// SETCC for f64. +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; + +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; + +// match select on i1 variables: +def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)), + (CROR (CRAND $cond , $tval), + (CRAND (crnot $cond), $fval))>; + +// match selectcc on i1 variables: +// select (lhs == rhs), tval, fval is: +// ((lhs == rhs) & tval) | (!(lhs == rhs) & fval) +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)), + (CROR (CRAND (CRANDC $lhs, $rhs), $tval), + (CRAND (CRORC $rhs, $lhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULT)), + (CROR (CRAND (CRANDC $rhs, $lhs), $tval), + (CRAND (CRORC $lhs, $rhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)), + (CROR (CRAND (CRORC $lhs, $rhs), $tval), + (CRAND (CRANDC $rhs, $lhs), 
$fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULE)), + (CROR (CRAND (CRORC $rhs, $lhs), $tval), + (CRAND (CRANDC $lhs, $rhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)), + (CROR (CRAND (CREQV $lhs, $rhs), $tval), + (CRAND (CRXOR $lhs, $rhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)), + (CROR (CRAND (CRORC $rhs, $lhs), $tval), + (CRAND (CRANDC $lhs, $rhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGE)), + (CROR (CRAND (CRORC $lhs, $rhs), $tval), + (CRAND (CRANDC $rhs, $lhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)), + (CROR (CRAND (CRANDC $rhs, $lhs), $tval), + (CRAND (CRORC $lhs, $rhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGT)), + (CROR (CRAND (CRANDC $lhs, $rhs), $tval), + (CRAND (CRORC $rhs, $lhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)), + (CROR (CRAND (CREQV $lhs, $rhs), $fval), + (CRAND (CRXOR $lhs, $rhs), $tval))>; + +// match selectcc on i1 variables with non-i1 output. +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLT)), + (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULT)), + (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLE)), + (SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULE)), + (SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETEQ)), + (SELECT_I4 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGE)), + (SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGE)), + (SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGT)), + (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGT)), + (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETNE)), + (SELECT_I4 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLT)), + (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULT)), + (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLE)), + (SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULE)), + (SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETEQ)), + (SELECT_I8 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGE)), + (SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGE)), + (SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGT)), + (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGT)), + (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 
(selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)), + (SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)), + (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)), + (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)), + (SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)), + (SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)), + (SELECT_F4 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)), + (SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)), + (SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)), + (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)), + (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)), + (SELECT_F4 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)), + (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)), + (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)), + (SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)), + (SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)), + (SELECT_F8 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)), + (SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)), + (SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)), + (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)), + (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)), + (SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)), + (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULT)), + (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)), + (SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULE)), + (SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)), + (SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)), + (SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGE)), + (SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : 
Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)), + (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)), + (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)), + (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +let usesCustomInserter = 1 in { +def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in), + "#ANDIo_1_EQ_BIT", + [(set i1:$dst, (trunc (not i32:$in)))]>; +def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in), + "#ANDIo_1_GT_BIT", + [(set i1:$dst, (trunc i32:$in))]>; + +def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in), + "#ANDIo_1_EQ_BIT8", + [(set i1:$dst, (trunc (not i64:$in)))]>; +def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in), + "#ANDIo_1_GT_BIT8", + [(set i1:$dst, (trunc i64:$in))]>; +} + +def : Pat<(i1 (not (trunc i32:$in))), + (ANDIo_1_EQ_BIT $in)>; +def : Pat<(i1 (not (trunc i64:$in))), + (ANDIo_1_EQ_BIT8 $in)>; + +//===----------------------------------------------------------------------===// +// PowerPC Instructions used for assembler/disassembler only +// + +// FIXME: For B=0 or B > 8, the registers following RT are used. +// WARNING: Do not add patterns for this instruction without fixing this. +def LSWI : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B), + "lswi $RT, $A, $B", IIC_LdStLoad, []>; + +// FIXME: For B=0 or B > 8, the registers following RT are used. +// WARNING: Do not add patterns for this instruction without fixing this. +def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B), + "stswi $RT, $A, $B", IIC_LdStLoad, []>; + +def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins), + "isync", IIC_SprISYNC, []>; + +def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src), + "icbi $src", IIC_LdStICBI, []>; + +// We used to have EIEIO as value but E[0-9A-Z] is a reserved name +def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins), + "eieio", IIC_LdStLoad, []>; + +def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L), + "wait $L", IIC_LdStLoad, []>; + +def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO), + "mbar $MO", IIC_LdStLoad>, Requires<[IsBookE]>; + +def MTSR: XForm_sr<31, 210, (outs), (ins gprc:$RS, u4imm:$SR), + "mtsr $SR, $RS", IIC_SprMTSR>; + +def MFSR: XForm_sr<31, 595, (outs gprc:$RS), (ins u4imm:$SR), + "mfsr $RS, $SR", IIC_SprMFSR>; + +def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB), + "mtsrin $RS, $RB", IIC_SprMTSR>; + +def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB), + "mfsrin $RS, $RB", IIC_SprMFSR>; + +def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L), + "mtmsr $RS, $L", IIC_SprMTMSR>; + +def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS), + "wrtee $RS", IIC_SprMTMSR>, Requires<[IsBookE]> { + let L = 0; +} + +def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>, + Requires<[IsBookE]> { + bits<1> E; + + let Inst{16} = E; + let Inst{21-30} = 163; +} + +def DCCCI : XForm_tlb<454, (outs), (ins gprc:$A, gprc:$B), + "dccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>; +def ICCCI : XForm_tlb<966, (outs), (ins gprc:$A, gprc:$B), + "iccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>; + +def : InstAlias<"dci 0", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>; +def : InstAlias<"dccci", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>; +def : InstAlias<"ici 0", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>; +def : InstAlias<"iccci", (ICCCI 
R0, R0)>, Requires<[IsPPC4xx]>; + +def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins), + "mfmsr $RT", IIC_SprMFMSR, []>; + +def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L), + "mtmsrd $RS, $L", IIC_SprMTMSRD>; + +def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA), + "mcrfs $BF, $BFA", IIC_BrMCR>; + +def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), + "mtfsfi $BF, $U, $W", IIC_IntMFFS>; + +def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), + "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isDOT; + +def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>; +def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>; + +def MTFSF : XFLForm_1<63, 711, (outs), + (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), + "mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>; +def MTFSFo : XFLForm_1<63, 711, (outs), + (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), + "mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isDOT; + +def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>; +def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>; + +def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB), + "slbie $RB", IIC_SprSLBIE, []>; + +def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB), + "slbmte $RS, $RB", IIC_SprSLBMTE, []>; + +def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB), + "slbmfee $RT, $RB", IIC_SprSLBMFEE, []>; + +def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>; + +def TLBIA : XForm_0<31, 370, (outs), (ins), + "tlbia", IIC_SprTLBIA, []>; + +def TLBSYNC : XForm_0<31, 566, (outs), (ins), + "tlbsync", IIC_SprTLBSYNC, []>; + +def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB), + "tlbiel $RB", IIC_SprTLBIEL, []>; + +def TLBLD : XForm_16b<31, 978, (outs), (ins gprc:$RB), + "tlbld $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>; +def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB), + "tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>; + +def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB), + "tlbie $RB,$RS", IIC_SprTLBIE, []>; + +def TLBSX : XForm_tlb<914, (outs), (ins gprc:$A, gprc:$B), "tlbsx $A, $B", + IIC_LdStLoad>, Requires<[IsBookE]>; + +def TLBIVAX : XForm_tlb<786, (outs), (ins gprc:$A, gprc:$B), "tlbivax $A, $B", + IIC_LdStLoad>, Requires<[IsBookE]>; + +def TLBRE : XForm_24_eieio<31, 946, (outs), (ins), + "tlbre", IIC_LdStLoad, []>, Requires<[IsBookE]>; + +def TLBWE : XForm_24_eieio<31, 978, (outs), (ins), + "tlbwe", IIC_LdStLoad, []>, Requires<[IsBookE]>; + +def TLBRE2 : XForm_tlbws<31, 946, (outs gprc:$RS), (ins gprc:$A, i1imm:$WS), + "tlbre $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>; + +def TLBWE2 : XForm_tlbws<31, 978, (outs), (ins gprc:$RS, gprc:$A, i1imm:$WS), + "tlbwe $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>; + +def TLBSX2 : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), + "tlbsx $RST, $A, $B", IIC_LdStLoad, []>, + Requires<[IsPPC4xx]>; +def TLBSX2D : XForm_base_r3xo<31, 914, (outs), + (ins gprc:$RST, gprc:$A, gprc:$B), + "tlbsx. 
$RST, $A, $B", IIC_LdStLoad, []>, + Requires<[IsPPC4xx]>, isDOT; + +def RFID : XForm_0<19, 18, (outs), (ins), "rfid", IIC_IntRFID, []>; + +def RFI : XForm_0<19, 50, (outs), (ins), "rfi", IIC_SprRFI, []>, + Requires<[IsBookE]>; +def RFCI : XForm_0<19, 51, (outs), (ins), "rfci", IIC_BrB, []>, + Requires<[IsBookE]>; + +def RFDI : XForm_0<19, 39, (outs), (ins), "rfdi", IIC_BrB, []>, + Requires<[IsE500]>; +def RFMCI : XForm_0<19, 38, (outs), (ins), "rfmci", IIC_BrB, []>, + Requires<[IsE500]>; + +def MFDCR : XFXForm_1<31, 323, (outs gprc:$RT), (ins i32imm:$SPR), + "mfdcr $RT, $SPR", IIC_SprMFSPR>, Requires<[IsPPC4xx]>; +def MTDCR : XFXForm_1<31, 451, (outs), (ins gprc:$RT, i32imm:$SPR), + "mtdcr $SPR, $RT", IIC_SprMTSPR>, Requires<[IsPPC4xx]>; + +def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>; + +def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B), + "lbzcix $RST, $A, $B", IIC_LdStLoad, []>; +def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B), + "lhzcix $RST, $A, $B", IIC_LdStLoad, []>; +def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B), + "lwzcix $RST, $A, $B", IIC_LdStLoad, []>; +def LDCIX : XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B), + "ldcix $RST, $A, $B", IIC_LdStLoad, []>; + +def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), + "stbcix $RST, $A, $B", IIC_LdStLoad, []>; +def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), + "sthcix $RST, $A, $B", IIC_LdStLoad, []>; +def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), + "stwcix $RST, $A, $B", IIC_LdStLoad, []>; +def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), + "stdcix $RST, $A, $B", IIC_LdStLoad, []>; + +//===----------------------------------------------------------------------===// +// PowerPC Assembler Instruction Aliases +// + +// Pseudo-instructions for alternate assembly syntax (never used by codegen). +// These are aliases that require C++ handling to convert to the target +// instruction, while InstAliases can be handled directly by tblgen. 
+class PPCAsmPseudo<string asm, dag iops> + : Instruction { + let Namespace = "PPC"; + bit PPC64 = 0; // Default value, override with isPPC64 + + let OutOperandList = (outs); + let InOperandList = iops; + let Pattern = []; + let AsmString = asm; + let isAsmParserOnly = 1; + let isPseudo = 1; +} + +def : InstAlias<"sc", (SC 0)>; + +def : InstAlias<"sync", (SYNC 0)>, Requires<[HasSYNC]>; +def : InstAlias<"msync", (SYNC 0), 0>, Requires<[HasSYNC]>; +def : InstAlias<"lwsync", (SYNC 1)>, Requires<[HasSYNC]>; +def : InstAlias<"ptesync", (SYNC 2)>, Requires<[HasSYNC]>; + +def : InstAlias<"wait", (WAIT 0)>; +def : InstAlias<"waitrsv", (WAIT 1)>; +def : InstAlias<"waitimpl", (WAIT 2)>; + +def : InstAlias<"mbar", (MBAR 0)>, Requires<[IsBookE]>; + +def DCBTx : PPCAsmPseudo<"dcbt $dst", (ins memrr:$dst)>; +def DCBTSTx : PPCAsmPseudo<"dcbtst $dst", (ins memrr:$dst)>; + +def DCBTCT : PPCAsmPseudo<"dcbtct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>; +def DCBTDS : PPCAsmPseudo<"dcbtds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>; +def DCBTT : PPCAsmPseudo<"dcbtt $dst", (ins memrr:$dst)>; + +def DCBTSTCT : PPCAsmPseudo<"dcbtstct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>; +def DCBTSTDS : PPCAsmPseudo<"dcbtstds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>; +def DCBTSTT : PPCAsmPseudo<"dcbtstt $dst", (ins memrr:$dst)>; + +def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>; +def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>; +def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>; +def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>; + +def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>; +def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>; + +def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>; +def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>; + +def : InstAlias<"mtdscr $Rx", (MTSPR 17, gprc:$Rx)>; +def : InstAlias<"mfdscr $Rx", (MFSPR gprc:$Rx, 17)>; + +def : InstAlias<"mtdsisr $Rx", (MTSPR 18, gprc:$Rx)>; +def : InstAlias<"mfdsisr $Rx", (MFSPR gprc:$Rx, 18)>; + +def : InstAlias<"mtdar $Rx", (MTSPR 19, gprc:$Rx)>; +def : InstAlias<"mfdar $Rx", (MFSPR gprc:$Rx, 19)>; + +def : InstAlias<"mtdec $Rx", (MTSPR 22, gprc:$Rx)>; +def : InstAlias<"mfdec $Rx", (MFSPR gprc:$Rx, 22)>; + +def : InstAlias<"mtsdr1 $Rx", (MTSPR 25, gprc:$Rx)>; +def : InstAlias<"mfsdr1 $Rx", (MFSPR gprc:$Rx, 25)>; + +def : InstAlias<"mtsrr0 $Rx", (MTSPR 26, gprc:$Rx)>; +def : InstAlias<"mfsrr0 $Rx", (MFSPR gprc:$Rx, 26)>; + +def : InstAlias<"mtsrr1 $Rx", (MTSPR 27, gprc:$Rx)>; +def : InstAlias<"mfsrr1 $Rx", (MFSPR gprc:$Rx, 27)>; + +def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>, Requires<[IsPPC4xx]>; + +def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>, Requires<[IsPPC4xx]>; + +def : InstAlias<"mtcfar $Rx", (MTSPR 28, gprc:$Rx)>; +def : InstAlias<"mfcfar $Rx", (MFSPR gprc:$Rx, 28)>; + +def : InstAlias<"mtamr $Rx", (MTSPR 29, gprc:$Rx)>; +def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>; + +def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>; +def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>; + +def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>; +def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>; +def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>; + +def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>; +def : InstAlias<"mttbu $Rx", (MTSPR 285, 
gprc:$Rx)>; + +def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>; + +def : InstAlias<"xnop", (XORI R0, R0, 0)>; + +def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>; +def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>; + +def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>; +def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>; + +def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>; + +foreach BATR = 0-3 in { + def : InstAlias<"mtdbatu "#BATR#", $Rx", + (MTSPR !add(BATR, !add(BATR, 536)), gprc:$Rx)>, + Requires<[IsPPC6xx]>; + def : InstAlias<"mfdbatu $Rx, "#BATR, + (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 536)))>, + Requires<[IsPPC6xx]>; + def : InstAlias<"mtdbatl "#BATR#", $Rx", + (MTSPR !add(BATR, !add(BATR, 537)), gprc:$Rx)>, + Requires<[IsPPC6xx]>; + def : InstAlias<"mfdbatl $Rx, "#BATR, + (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 537)))>, + Requires<[IsPPC6xx]>; + def : InstAlias<"mtibatu "#BATR#", $Rx", + (MTSPR !add(BATR, !add(BATR, 528)), gprc:$Rx)>, + Requires<[IsPPC6xx]>; + def : InstAlias<"mfibatu $Rx, "#BATR, + (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 528)))>, + Requires<[IsPPC6xx]>; + def : InstAlias<"mtibatl "#BATR#", $Rx", + (MTSPR !add(BATR, !add(BATR, 529)), gprc:$Rx)>, + Requires<[IsPPC6xx]>; + def : InstAlias<"mfibatl $Rx, "#BATR, + (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 529)))>, + Requires<[IsPPC6xx]>; +} + +foreach BR = 0-7 in { + def : InstAlias<"mfbr"#BR#" $Rx", + (MFDCR gprc:$Rx, !add(BR, 0x80))>, + Requires<[IsPPC4xx]>; + def : InstAlias<"mtbr"#BR#" $Rx", + (MTDCR gprc:$Rx, !add(BR, 0x80))>, + Requires<[IsPPC4xx]>; +} + +def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>, Requires<[IsPPC4xx]>; + +def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>; + +def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>, Requires<[IsPPC4xx]>; + +def : InstAlias<"mtesr $Rx", (MTSPR 980, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mfesr $Rx", (MFSPR gprc:$Rx, 980)>, Requires<[IsPPC4xx]>; + +def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>; +def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>; + +def : InstAlias<"mttcr $Rx", (MTSPR 986, gprc:$Rx)>, Requires<[IsPPC4xx]>; +def : InstAlias<"mftcr $Rx", (MFSPR gprc:$Rx, 986)>, Requires<[IsPPC4xx]>; + +def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>; + +def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; +def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; +def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; +def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; + +def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"subc. 
$rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>; + +def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>; +def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>; + +def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>; +def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>; + +foreach SPRG = 0-3 in { + def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 272))>; + def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 272))>; + def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>; + def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>; +} +foreach SPRG = 4-7 in { + def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>, + Requires<[IsBookE]>; + def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 256))>, + Requires<[IsBookE]>; + def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>, + Requires<[IsBookE]>; + def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>, + Requires<[IsBookE]>; +} + +def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>; + +def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>; +def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>; + +def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>; + +def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>; +def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>; + +def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>; +def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>; +def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>; +def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>; + +def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>; + +def : InstAlias<"tlbrehi $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 0)>, + Requires<[IsPPC4xx]>; +def : InstAlias<"tlbrelo $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 1)>, + Requires<[IsPPC4xx]>; +def : InstAlias<"tlbwehi $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 0)>, + Requires<[IsPPC4xx]>; +def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>, + Requires<[IsPPC4xx]>; + +def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def CLRRWIo : PPCAsmPseudo<"clrrwi. 
$rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>; +def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>; + +def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>; +def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>; +def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>; +def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>; +def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; +def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; + +def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +// The POWER variant +def : MnemonicAlias<"cntlz", "cntlzw">; +def : MnemonicAlias<"cntlz.", "cntlzw.">; + +def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SLDIo : PPCAsmPseudo<"sldi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; +def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; + +def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; +def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; +def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; +def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; +def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; + +def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; +def RLWINMobm : PPCAsmPseudo<"rlwinm. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; +def RLWIMIbm : PPCAsmPseudo<"rlwimi $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; +def RLWIMIobm : PPCAsmPseudo<"rlwimi. 
$rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; +def RLWNMbm : PPCAsmPseudo<"rlwnm $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; +def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; + +// These generic branch instruction forms are used for the assembler parser only. +// Defs and Uses are conservative, since we don't know the BO value. +let PPC970_Unit = 7 in { + let Defs = [CTR], Uses = [CTR, RM] in { + def gBC : BForm_3<16, 0, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst), + "bc $bo, $bi, $dst">; + def gBCA : BForm_3<16, 1, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst), + "bca $bo, $bi, $dst">; + } + let Defs = [LR, CTR], Uses = [CTR, RM] in { + def gBCL : BForm_3<16, 0, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst), + "bcl $bo, $bi, $dst">; + def gBCLA : BForm_3<16, 1, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst), + "bcla $bo, $bi, $dst">; + } + let Defs = [CTR], Uses = [CTR, LR, RM] in + def gBCLR : XLForm_2<19, 16, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bclr $bo, $bi, $bh", IIC_BrB, []>; + let Defs = [LR, CTR], Uses = [CTR, LR, RM] in + def gBCLRL : XLForm_2<19, 16, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bclrl $bo, $bi, $bh", IIC_BrB, []>; + let Defs = [CTR], Uses = [CTR, LR, RM] in + def gBCCTR : XLForm_2<19, 528, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bcctr $bo, $bi, $bh", IIC_BrB, []>; + let Defs = [LR, CTR], Uses = [CTR, LR, RM] in + def gBCCTRL : XLForm_2<19, 528, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bcctrl $bo, $bi, $bh", IIC_BrB, []>; +} +def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>; +def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>; +def : InstAlias<"bcctr $bo, $bi", (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>; +def : InstAlias<"bcctrl $bo, $bi", (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>; + +multiclass BranchSimpleMnemonic1<string name, string pm, int bo> { + def : InstAlias<"b"#name#pm#" $bi, $dst", (gBC bo, crbitrc:$bi, condbrtarget:$dst)>; + def : InstAlias<"b"#name#"a"#pm#" $bi, $dst", (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"lr"#pm#" $bi", (gBCLR bo, crbitrc:$bi, 0)>; + def : InstAlias<"b"#name#"l"#pm#" $bi, $dst", (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>; + def : InstAlias<"b"#name#"la"#pm#" $bi, $dst", (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"lrl"#pm#" $bi", (gBCLRL bo, crbitrc:$bi, 0)>; +} +multiclass BranchSimpleMnemonic2<string name, string pm, int bo> + : BranchSimpleMnemonic1<name, pm, bo> { + def : InstAlias<"b"#name#"ctr"#pm#" $bi", (gBCCTR bo, crbitrc:$bi, 0)>; + def : InstAlias<"b"#name#"ctrl"#pm#" $bi", (gBCCTRL bo, crbitrc:$bi, 0)>; +} +defm : BranchSimpleMnemonic2<"t", "", 12>; +defm : BranchSimpleMnemonic2<"f", "", 4>; +defm : BranchSimpleMnemonic2<"t", "-", 14>; +defm : BranchSimpleMnemonic2<"f", "-", 6>; +defm : BranchSimpleMnemonic2<"t", "+", 15>; +defm : BranchSimpleMnemonic2<"f", "+", 7>; +defm : BranchSimpleMnemonic1<"dnzt", "", 8>; +defm : BranchSimpleMnemonic1<"dnzf", "", 0>; +defm : BranchSimpleMnemonic1<"dzt", "", 10>; +defm : BranchSimpleMnemonic1<"dzf", "", 2>; + +multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> { + def : InstAlias<"b"#name#pm#" $cc, $dst", + (BCC bibo, crrc:$cc, condbrtarget:$dst)>; + def : InstAlias<"b"#name#pm#" $dst", + (BCC bibo, CR0, condbrtarget:$dst)>; + + def : 
InstAlias<"b"#name#"a"#pm#" $cc, $dst", + (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"a"#pm#" $dst", + (BCCA bibo, CR0, abscondbrtarget:$dst)>; + + def : InstAlias<"b"#name#"lr"#pm#" $cc", + (BCCLR bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"lr"#pm, + (BCCLR bibo, CR0)>; + + def : InstAlias<"b"#name#"ctr"#pm#" $cc", + (BCCCTR bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"ctr"#pm, + (BCCCTR bibo, CR0)>; + + def : InstAlias<"b"#name#"l"#pm#" $cc, $dst", + (BCCL bibo, crrc:$cc, condbrtarget:$dst)>; + def : InstAlias<"b"#name#"l"#pm#" $dst", + (BCCL bibo, CR0, condbrtarget:$dst)>; + + def : InstAlias<"b"#name#"la"#pm#" $cc, $dst", + (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"la"#pm#" $dst", + (BCCLA bibo, CR0, abscondbrtarget:$dst)>; + + def : InstAlias<"b"#name#"lrl"#pm#" $cc", + (BCCLRL bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"lrl"#pm, + (BCCLRL bibo, CR0)>; + + def : InstAlias<"b"#name#"ctrl"#pm#" $cc", + (BCCCTRL bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"ctrl"#pm, + (BCCCTRL bibo, CR0)>; +} +multiclass BranchExtendedMnemonic<string name, int bibo> { + defm : BranchExtendedMnemonicPM<name, "", bibo>; + defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>; + defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>; +} +defm : BranchExtendedMnemonic<"lt", 12>; +defm : BranchExtendedMnemonic<"gt", 44>; +defm : BranchExtendedMnemonic<"eq", 76>; +defm : BranchExtendedMnemonic<"un", 108>; +defm : BranchExtendedMnemonic<"so", 108>; +defm : BranchExtendedMnemonic<"ge", 4>; +defm : BranchExtendedMnemonic<"nl", 4>; +defm : BranchExtendedMnemonic<"le", 36>; +defm : BranchExtendedMnemonic<"ng", 36>; +defm : BranchExtendedMnemonic<"ne", 68>; +defm : BranchExtendedMnemonic<"nu", 100>; +defm : BranchExtendedMnemonic<"ns", 100>; + +def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>; +def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>; +def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>; +def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>; +def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>; +def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>; + +def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>; +def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>; +def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>; +def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>; +def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>; +def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>; + +multiclass TrapExtendedMnemonic<string name, int to> { + def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>; + def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>; + def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>; + def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>; +} +defm : TrapExtendedMnemonic<"lt", 16>; +defm : TrapExtendedMnemonic<"le", 20>; +defm : TrapExtendedMnemonic<"eq", 4>; +defm : 
TrapExtendedMnemonic<"ge", 12>; +defm : TrapExtendedMnemonic<"gt", 8>; +defm : TrapExtendedMnemonic<"nl", 12>; +defm : TrapExtendedMnemonic<"ne", 24>; +defm : TrapExtendedMnemonic<"ng", 20>; +defm : TrapExtendedMnemonic<"llt", 2>; +defm : TrapExtendedMnemonic<"lle", 6>; +defm : TrapExtendedMnemonic<"lge", 5>; +defm : TrapExtendedMnemonic<"lgt", 1>; +defm : TrapExtendedMnemonic<"lnl", 5>; +defm : TrapExtendedMnemonic<"lng", 6>; +defm : TrapExtendedMnemonic<"u", 31>; + +// Atomic loads +def : Pat<(atomic_load_8 iaddr:$src), (LBZ memri:$src)>; +def : Pat<(atomic_load_16 iaddr:$src), (LHZ memri:$src)>; +def : Pat<(atomic_load_32 iaddr:$src), (LWZ memri:$src)>; +def : Pat<(atomic_load_8 xaddr:$src), (LBZX memrr:$src)>; +def : Pat<(atomic_load_16 xaddr:$src), (LHZX memrr:$src)>; +def : Pat<(atomic_load_32 xaddr:$src), (LWZX memrr:$src)>; + +// Atomic stores +def : Pat<(atomic_store_8 iaddr:$ptr, i32:$val), (STB gprc:$val, memri:$ptr)>; +def : Pat<(atomic_store_16 iaddr:$ptr, i32:$val), (STH gprc:$val, memri:$ptr)>; +def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW gprc:$val, memri:$ptr)>; +def : Pat<(atomic_store_8 xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>; +def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>; +def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td new file mode 100644 index 0000000..4312007 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td @@ -0,0 +1,1216 @@ +//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the QPX extension to the PowerPC instruction set. +// Reference: +// Book Q: QPX Architecture Definition. IBM (as updated in) 2011. +// +//===----------------------------------------------------------------------===// + +def PPCRegQFRCAsmOperand : AsmOperandClass { + let Name = "RegQFRC"; let PredicateMethod = "isRegNumber"; +} +def qfrc : RegisterOperand<QFRC> { + let ParserMatchClass = PPCRegQFRCAsmOperand; +} +def PPCRegQSRCAsmOperand : AsmOperandClass { + let Name = "RegQSRC"; let PredicateMethod = "isRegNumber"; +} +def qsrc : RegisterOperand<QSRC> { + let ParserMatchClass = PPCRegQSRCAsmOperand; +} +def PPCRegQBRCAsmOperand : AsmOperandClass { + let Name = "RegQBRC"; let PredicateMethod = "isRegNumber"; +} +def qbrc : RegisterOperand<QBRC> { + let ParserMatchClass = PPCRegQBRCAsmOperand; +} + +//===----------------------------------------------------------------------===// +// Helpers for defining instructions that directly correspond to intrinsics. + +// QPXA1_Int - A AForm_1 intrinsic definition. +class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> + : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_FPFused, + [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; +// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions). 
+class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> + : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_VecPerm, + [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; +// QPXA2_Int - A AForm_2 intrinsic definition. +class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> + : AForm_2<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPGeneral, + [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>; +// QPXA3_Int - A AForm_3 intrinsic definition. +class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> + : AForm_3<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), + !strconcat(opc, " $FRT, $FRA, $FRC"), IIC_FPGeneral, + [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRC))]>; +// QPXA4_Int - A AForm_4a intrinsic definition. +class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> + : AForm_4a<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB), + !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral, + [(set v4f64:$FRT, (IntID v4f64:$FRB))]>; +// QPXX18_Int - A XForm_18 intrinsic definition. +class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID> + : XForm_18<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPCompare, + [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>; +// QPXX19_Int - A XForm_19 intrinsic definition. +class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID> + : XForm_19<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB), + !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral, + [(set v4f64:$FRT, (IntID v4f64:$FRB))]>; + +//===----------------------------------------------------------------------===// +// Pattern Frags. + +def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v4f32; +}]>; + +def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32; +}]>; +def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncst node:$val, + node:$base, node:$offset), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32; +}]>; + +def fround_inexact : PatFrag<(ops node:$val), (fround node:$val), [{ + return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0; +}]>; + +def fround_exact : PatFrag<(ops node:$val), (fround node:$val), [{ + return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1; +}]>; + +let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs. + def u12 : ImmLeaf<i32, [{ return (Imm & 0xFFF) == Imm; }]>; + +//===----------------------------------------------------------------------===// +// Instruction Definitions. + +def HasQPX : Predicate<"PPCSubTarget->hasQPX()">; +let Predicates = [HasQPX] in { +let DecoderNamespace = "QPX" in { +let hasSideEffects = 0 in { // QPX instructions don't have side effects. 
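+// The rounding-mode register RM is modeled as an implicit use so that the
+// arithmetic instructions below are not freely reordered across changes to
+// FPSCR[RN] (e.g. across an mtfsf).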
+let Uses = [RM] in { + // Add Instructions + let isCommutable = 1 in { + def QVFADD : AForm_2<4, 21, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>; + def QVFADDSs : AForm_2<0, 21, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>; + } + def QVFSUB : AForm_2<4, 20, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>; + def QVFSUBSs : AForm_2<0, 20, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>; + + // Estimate Instructions + def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfre $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>; + def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>; + let isCodeGenOnly = 1 in + def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfres $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>; + + def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrsqrte $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>; + def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>; + let isCodeGenOnly = 1 in + def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>; + + // Multiply Instructions + let isCommutable = 1 in { + def QVFMUL : AForm_3<4, 25, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), + "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral, + [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>; + let isCodeGenOnly = 1 in + def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>; + def QVFMULSs : AForm_3<0, 25, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC), + "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral, + [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>; + } + def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>; + def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>; + + // Multiply-add instructions + def QVFMADD : AForm_1<4, 29, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>; + def QVFMADDSs : AForm_1<0, 29, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC), + "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>; + def QVFNMADD : AForm_1<4, 31, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, + v4f64:$FRB)))]>; + let isCodeGenOnly = 1 in + def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>; + def QVFNMADDSs : AForm_1<0, 31, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC), + "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, + v4f32:$FRB)))]>; + def QVFMSUB : AForm_1<4, 
28, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, + (fneg v4f64:$FRB)))]>; + let isCodeGenOnly = 1 in + def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>; + def QVFMSUBSs : AForm_1<0, 28, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC), + "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, + (fneg v4f32:$FRB)))]>; + def QVFNMSUB : AForm_1<4, 30, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, + (fneg v4f64:$FRB))))]>; + let isCodeGenOnly = 1 in + def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>; + def QVFNMSUBSs : AForm_1<0, 30, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC), + "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, + (fneg v4f32:$FRB))))]>; + def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>; + def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>; + def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>; + def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>; + def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>; + def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>; + def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>; + def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>; + + // Select Instruction + let isCodeGenOnly = 1 in + def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>; + def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT), + (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (vselect v4i1:$FRA, + v4f64:$FRC, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT), + (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC), + "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (vselect v4i1:$FRA, + v4f32:$FRC, v4f32:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT), + (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC), + "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, + [(set v4i1:$FRT, (vselect v4i1:$FRA, + v4i1:$FRC, v4i1:$FRB))]>; + + // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after + // instruction selection into a branch sequence. + let usesCustomInserter = 1 in { + def SELECT_CC_QFRC: Pseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, + i32imm:$BROPC), "#SELECT_CC_QFRC", + []>; + def SELECT_CC_QSRC: Pseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, + i32imm:$BROPC), "#SELECT_CC_QSRC", + []>; + def SELECT_CC_QBRC: Pseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, + i32imm:$BROPC), "#SELECT_CC_QBRC", + []>; + + // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition + // register bit directly. 
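+  // Like SELECT_CC_*, these are expanded by the custom inserter into a
+  // conditional-branch diamond whose two arms feed a phi; the difference is
+  // only that the condition arrives as a single CR bit rather than as a CR
+  // field plus a branch opcode.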
+ def SELECT_QFRC: Pseudo<(outs qfrc:$dst), (ins crbitrc:$cond, + qfrc:$T, qfrc:$F), "#SELECT_QFRC", + [(set v4f64:$dst, + (select i1:$cond, v4f64:$T, v4f64:$F))]>; + def SELECT_QSRC: Pseudo<(outs qsrc:$dst), (ins crbitrc:$cond, + qsrc:$T, qsrc:$F), "#SELECT_QSRC", + [(set v4f32:$dst, + (select i1:$cond, v4f32:$T, v4f32:$F))]>; + def SELECT_QBRC: Pseudo<(outs qbrc:$dst), (ins crbitrc:$cond, + qbrc:$T, qbrc:$F), "#SELECT_QBRC", + [(set v4i1:$dst, + (select i1:$cond, v4i1:$T, v4i1:$F))]>; + } + + // Convert and Round Instructions + def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>; + let isCodeGenOnly = 1 in + def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB), + "qvfctid $FRT, $FRB", IIC_FPGeneral, []>; + + def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>; + def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>; + def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>; + def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>; + def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>; + def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>; + def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>; + def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>; + let isCodeGenOnly = 1 in + def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB), + "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>; + + def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>; + def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>; + def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>; + + let isCodeGenOnly = 1 in + def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>; + def QVFRSPs : XForm_19<4, 12, + (outs qsrc:$FRT), (ins qfrc:$FRB), + "qvfrsp $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>; + + def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfriz $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfriz $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>; + + def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrin $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (frnd v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrin $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (frnd v4f32:$FRB))]>; + + def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrip $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (fceil v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrip $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fceil v4f32:$FRB))]>; + + def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrim $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrim $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>; + + // Move Instructions + def QVFMR : XForm_19<4, 72, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfmr $FRT, $FRB", IIC_VecPerm, + [/* (set v4f64:$FRT, v4f64:$FRB) */]>; + let isCodeGenOnly = 1 in { + def QVFMRs : XForm_19<4, 72, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfmr $FRT, $FRB", IIC_VecPerm, + [/* (set v4f32:$FRT, 
v4f32:$FRB) */]>; + def QVFMRb : XForm_19<4, 72, + (outs qbrc:$FRT), (ins qbrc:$FRB), + "qvfmr $FRT, $FRB", IIC_VecPerm, + [/* (set v4i1:$FRT, v4i1:$FRB) */]>; + } + def QVFNEG : XForm_19<4, 40, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfneg $FRT, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fneg v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFNEGs : XForm_19<4, 40, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfneg $FRT, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fneg v4f32:$FRB))]>; + def QVFABS : XForm_19<4, 264, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfabs $FRT, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fabs v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFABSs : XForm_19<4, 264, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfabs $FRT, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fabs v4f32:$FRB))]>; + def QVFNABS : XForm_19<4, 136, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfnabs $FRT, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>; + let isCodeGenOnly = 1 in + def QVFNABSs : XForm_19<4, 136, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfnabs $FRT, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>; + def QVFCPSGN : XForm_18<4, 8, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>; + let isCodeGenOnly = 1 in + def QVFCPSGNs : XForm_18<4, 8, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>; + + def QVALIGNI : Z23Form_1<4, 5, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx), + "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, + [(set v4f64:$FRT, + (PPCqvaligni v4f64:$FRA, v4f64:$FRB, + (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVALIGNIs : Z23Form_1<4, 5, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx), + "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, + [(set v4f32:$FRT, + (PPCqvaligni v4f32:$FRA, v4f32:$FRB, + (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVALIGNIb : Z23Form_1<4, 5, + (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx), + "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, + [(set v4i1:$FRT, + (PPCqvaligni v4i1:$FRA, v4i1:$FRB, + (i32 imm:$idx)))]>; + + def QVESPLATI : Z23Form_2<4, 37, + (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx), + "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, + [(set v4f64:$FRT, + (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVESPLATIs : Z23Form_2<4, 37, + (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx), + "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, + [(set v4f32:$FRT, + (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVESPLATIb : Z23Form_2<4, 37, + (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx), + "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, + [(set v4i1:$FRT, + (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>; + + def QVFPERM : AForm_1<4, 6, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, + [(set v4f64:$FRT, + (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; + let isCodeGenOnly = 1 in + def QVFPERMs : AForm_1<4, 6, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC), + "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, + [(set v4f32:$FRT, + (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>; + + let isReMaterializable = 1, isAsCheapAsAMove = 1 in + def QVGPCI : Z23Form_3<4, 133, + (outs qfrc:$FRT), (ins u12imm:$idx), + "qvgpci $FRT, $idx", IIC_VecPerm, + [(set v4f64:$FRT, 
(PPCqvgpci (u12:$idx)))]>; + + // Compare Instruction + let isCodeGenOnly = 1 in + def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>; + def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>; + let isCodeGenOnly = 1 in + def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>; + let isCodeGenOnly = 1 in + def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>; + def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>; + let isCodeGenOnly = 1 in + def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>; + let isCodeGenOnly = 1 in + def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>; + def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>; + let isCodeGenOnly = 1 in + def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>; + let isCodeGenOnly = 1 in + def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>; + def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>; + let isCodeGenOnly = 1 in + def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>; + + let isCodeGenOnly = 1 in + def QVFLOGICAL : XForm_20<4, 4, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt), + "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; + def QVFLOGICALb : XForm_20<4, 4, + (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), + "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; + let isCodeGenOnly = 1 in + def QVFLOGICALs : XForm_20<4, 4, + (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), + "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; + + // Load indexed instructions + let mayLoad = 1 in { + def QVLFDX : XForm_1<31, 583, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfdx $FRT, $src", IIC_LdStLFD, + [(set v4f64:$FRT, (load xoaddr:$src))]>; + let isCodeGenOnly = 1 in + def QVLFDXb : XForm_1<31, 583, + (outs qbrc:$FRT), (ins memrr:$src), + "qvlfdx $FRT, $src", IIC_LdStLFD, []>; + + let RC = 1 in + def QVLFDXA : XForm_1<31, 583, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfdxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFDUX : XForm_1<31, 615, + (outs qfrc:$FRT, ptr_rc_nor0:$ea_result), + (ins memrr:$src), + "qvlfdux $FRT, $src", IIC_LdStLFDU, []>, + RegConstraint<"$src.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + let RC = 1 in + def QVLFDUXA : XForm_1<31, 615, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfduxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFSX : XForm_1<31, 519, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfsx $FRT, $src", IIC_LdStLFD, + [(set v4f64:$FRT, 
(extloadv4f32 xoaddr:$src))]>; + + let isCodeGenOnly = 1 in + def QVLFSXb : XForm_1<31, 519, + (outs qbrc:$FRT), (ins memrr:$src), + "qvlfsx $FRT, $src", IIC_LdStLFD, + [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>; + let isCodeGenOnly = 1 in + def QVLFSXs : XForm_1<31, 519, + (outs qsrc:$FRT), (ins memrr:$src), + "qvlfsx $FRT, $src", IIC_LdStLFD, + [(set v4f32:$FRT, (load xoaddr:$src))]>; + + let RC = 1 in + def QVLFSXA : XForm_1<31, 519, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfsxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFSUX : XForm_1<31, 551, + (outs qsrc:$FRT, ptr_rc_nor0:$ea_result), + (ins memrr:$src), + "qvlfsux $FRT, $src", IIC_LdStLFDU, []>, + RegConstraint<"$src.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + + let RC = 1 in + def QVLFSUXA : XForm_1<31, 551, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCDX : XForm_1<31, 71, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcdx $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFCDXA : XForm_1<31, 71, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCDUX : XForm_1<31, 103, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcdux $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFCDUXA : XForm_1<31, 103, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCSX : XForm_1<31, 7, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; + let isCodeGenOnly = 1 in + def QVLFCSXs : XForm_1<31, 7, + (outs qsrc:$FRT), (ins memrr:$src), + "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; + + let RC = 1 in + def QVLFCSXA : XForm_1<31, 7, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCSUX : XForm_1<31, 39, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsux $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFCSUXA : XForm_1<31, 39, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFIWAX : XForm_1<31, 871, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwax $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFIWAXA : XForm_1<31, 871, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFIWZX : XForm_1<31, 839, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFIWZXA : XForm_1<31, 839, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>; + } + + + def QVLPCLDX : XForm_1<31, 582, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpcldx $FRT, $src", IIC_LdStLFD, []>; + def QVLPCLSX : XForm_1<31, 518, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpclsx $FRT, $src", IIC_LdStLFD, []>; + let isCodeGenOnly = 1 in + def QVLPCLSXint : XForm_11<31, 518, + (outs qfrc:$FRT), (ins G8RC:$src), + "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>; + def QVLPCRDX : XForm_1<31, 70, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>; + def QVLPCRSX : XForm_1<31, 6, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>; + + // Store indexed instructions + let mayStore = 1 in { + def QVSTFDX : XForm_8<31, 711, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfdx $FRT, $dst", IIC_LdStSTFD, + [(store qfrc:$FRT, xoaddr:$dst)]>; + let isCodeGenOnly = 1 in + def QVSTFDXb : XForm_8<31, 711, + (outs), (ins qbrc:$FRT, memrr:$dst), + "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>; + + let RC = 1 in + def QVSTFDXA : XForm_8<31, 711, + (outs), (ins qfrc:$FRT, memrr:$dst), + 
"qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res), + (ins qfrc:$FRT, memrr:$dst), + "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, + NoEncode<"$ea_res">; + + let RC = 1 in + def QVSTFDUXA : XForm_8<31, 743, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFDXI : XForm_8<31, 709, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFDXIA : XForm_8<31, 709, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFDUXI : XForm_8<31, 741, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFDUXIA : XForm_8<31, 741, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSX : XForm_8<31, 647, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsx $FRT, $dst", IIC_LdStSTFD, + [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>; + let isCodeGenOnly = 1 in + def QVSTFSXs : XForm_8<31, 647, + (outs), (ins qsrc:$FRT, memrr:$dst), + "qvstfsx $FRT, $dst", IIC_LdStSTFD, + [(store qsrc:$FRT, xoaddr:$dst)]>; + + let RC = 1 in + def QVSTFSXA : XForm_8<31, 647, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), + (ins qsrc:$FRT, memrr:$dst), + "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, + NoEncode<"$ea_res">; + let isCodeGenOnly = 1 in + def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), + (ins qfrc:$FRT, memrr:$dst), + "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, + NoEncode<"$ea_res">; + + let RC = 1 in + def QVSTFSUXA : XForm_8<31, 679, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSXI : XForm_8<31, 645, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFSXIA : XForm_8<31, 645, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSUXI : XForm_8<31, 677, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFSUXIA : XForm_8<31, 677, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCDX : XForm_8<31, 199, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCDXA : XForm_8<31, 199, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCSX : XForm_8<31, 135, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; + let isCodeGenOnly = 1 in + def QVSTFCSXs : XForm_8<31, 135, + (outs), (ins qsrc:$FRT, memrr:$dst), + "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; + + let RC = 1 in + def QVSTFCSXA : XForm_8<31, 135, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCDUX : XForm_8<31, 231, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCDUXA : XForm_8<31, 231, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCSUX : XForm_8<31, 167, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsux $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCSUXA : XForm_8<31, 167, + (outs), (ins 
qfrc:$FRT, memrr:$dst),
+                            "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+  def QVSTFCDXI : XForm_8<31, 197,
+                          (outs), (ins qfrc:$FRT, memrr:$dst),
+                          "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>;
+  let RC = 1 in
+  def QVSTFCDXIA : XForm_8<31, 197,
+                           (outs), (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+  def QVSTFCSXI : XForm_8<31, 133,
+                          (outs), (ins qfrc:$FRT, memrr:$dst),
+                          "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>;
+  let RC = 1 in
+  def QVSTFCSXIA : XForm_8<31, 133,
+                           (outs), (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+  def QVSTFCDUXI : XForm_8<31, 229,
+                           (outs), (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>;
+  let RC = 1 in
+  def QVSTFCDUXIA : XForm_8<31, 229,
+                            (outs), (ins qfrc:$FRT, memrr:$dst),
+                            "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+  def QVSTFCSUXI : XForm_8<31, 165,
+                           (outs), (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>;
+  let RC = 1 in
+  def QVSTFCSUXIA : XForm_8<31, 165,
+                            (outs), (ins qfrc:$FRT, memrr:$dst),
+                            "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+  def QVSTFIWX : XForm_8<31, 967,
+                         (outs), (ins qfrc:$FRT, memrr:$dst),
+                         "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>;
+  let RC = 1 in
+  def QVSTFIWXA : XForm_8<31, 967,
+                          (outs), (ins qfrc:$FRT, memrr:$dst),
+                          "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>;
+  }
+}
+
+} // hasSideEffects = 0
+}
+
+def : InstAlias<"qvfclr $FRT",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>;
+def : InstAlias<"qvfand $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>;
+def : InstAlias<"qvfandc $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>;
+def : InstAlias<"qvfctfb $FRT, $FRA",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>;
+def : InstAlias<"qvfxor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>;
+def : InstAlias<"qvfor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>;
+def : InstAlias<"qvfnor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>;
+def : InstAlias<"qvfequ $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>;
+def : InstAlias<"qvfnot $FRT, $FRA",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>;
+def : InstAlias<"qvforc $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>;
+def : InstAlias<"qvfnand $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>;
+def : InstAlias<"qvfset $FRT",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>;
+
+//===----------------------------------------------------------------------===//
+// Additional QPX Patterns
+//
+
+def : Pat<(v4f64 (scalar_to_vector f64:$A)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>;
+def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>;
+
+def : Pat<(f64 (extractelt v4f64:$S, 0)),
+          (EXTRACT_SUBREG $S, sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 0)),
+          (EXTRACT_SUBREG $S, sub_64)>;
+
+def : Pat<(f64 (extractelt v4f64:$S, 1)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>;
+def : Pat<(f64 (extractelt v4f64:$S, 2)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>;
+def : Pat<(f64 (extractelt v4f64:$S, 3)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>;
+
+def : Pat<(f32 (extractelt v4f32:$S, 1)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 2)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 3)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 3),
sub_64)>;
+
+def : Pat<(f64 (extractelt v4f64:$S, i64:$F)),
+          (EXTRACT_SUBREG (QVFPERM $S, $S,
+                           (QVLPCLSXint (RLDICR $F, 2,
+                                        /* 63-2 = */ 61))),
+          sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, i64:$F)),
+          (EXTRACT_SUBREG (QVFPERMs $S, $S,
+                           (QVLPCLSXint (RLDICR $F, 2,
+                                        /* 63-2 = */ 61))),
+          sub_64)>;
+
+def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFPERM $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B),
+          (QVFCPSGN $A, $B)>;
+
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign v4f64:$frB, v4f32:$frA),
+          (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>;
+def : Pat<(fcopysign QSRC:$frB, QFRC:$frA),
+          (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>;
+
+def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>;
+def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>;
+def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>;
+
+def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>;
+def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>;
+def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>;
+def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>;
+
+def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>;
+def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>;
+
+def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B),
+          (QVFADD $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B),
+          (QVFSUB $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B),
+          (QVFMUL $A, $B)>;
+
+// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B),
+          (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B),
+          (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+          (QVFNMSUBSs $A, $B, $C)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+          (QVFNMSUBSs $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFNMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFMSUB $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFNMSUB $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src),
+          (QVLFDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
+          (QVLFDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src),
+          (QVLFSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
+          (QVLFSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src),
+          (QVLFCDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src),
+          (QVLFCDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src),
+          (QVLFCSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src),
+          (QVLFCSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src),
+          (QVLFIWAXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src),
+          (QVLFIWAX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src),
+          (QVLFIWZXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src),
+          (QVLFIWZX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src),
+          (QVLPCLDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src),
+          (QVLPCLSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src),
+          (QVLPCRDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src),
+          (QVLPCRSX xoaddr:$src)>;
+
+def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst),
+
(QVSTFDX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst), + (QVSTFSX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst), + (QVSTFCDXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst), + (QVSTFCDX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst), + (QVSTFCSXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst), + (QVSTFCSX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst), + (QVSTFDXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst), + (QVSTFIWXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst), + (QVSTFIWX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst), + (QVSTFSXA $T, xoaddr:$dst)>; + +def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (QVSTFDUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (QVSTFSUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (QVSTFSUXs $rS, $ptrreg, $ptroff)>; + +def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)), + (QVFLOGICAL $A, $B, imm:$idx)>; +def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)), + (QVGPCI imm:$idx)>; + +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE), + (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE), + (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE), + (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ), + (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT), + (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFCMPLTb $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT), + (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFCMPGTb $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFCMPEQb $FRA, $FRB), (i32 13))>; + +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ), + (QVFCMPEQb $FRA, $FRB)>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT), + (QVFCMPGTb $FRA, $FRB)>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE), + (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), + (QVFCMPLTb $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT), + (QVFCMPLTb $FRA, $FRB)>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE), + (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), + (QVFCMPGTb $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE), + (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), + (QVFCMPEQb $FRA, $FRB), (i32 10))>; + +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE), + (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE), + (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE), + (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), + 
(QVFTSTNANbs $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ), + (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT), + (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFCMPLTbs $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT), + (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFCMPGTbs $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFCMPEQbs $FRA, $FRB), (i32 13))>; + +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ), + (QVFCMPEQbs $FRA, $FRB)>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT), + (QVFCMPGTbs $FRA, $FRB)>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE), + (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), + (QVFCMPLTbs $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT), + (QVFCMPLTbs $FRA, $FRB)>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE), + (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), + (QVFCMPGTbs $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE), + (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), + (QVFCMPEQbs $FRA, $FRB), (i32 10))>; + +def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 4))>; +def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 8))>; +def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 9))>; +def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 13))>; +def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 14))>; + +def : Pat<(and v4i1:$FRA, v4i1:$FRB), + (QVFLOGICALb $FRA, $FRB, (i32 1))>; +def : Pat<(or v4i1:$FRA, v4i1:$FRB), + (QVFLOGICALb $FRA, $FRB, (i32 7))>; +def : Pat<(xor v4i1:$FRA, v4i1:$FRB), + (QVFLOGICALb $FRA, $FRB, (i32 6))>; +def : Pat<(not v4i1:$FRA), + (QVFLOGICALb $FRA, $FRA, (i32 10))>; + +def : Pat<(v4f64 (fextend v4f32:$src)), + (COPY_TO_REGCLASS $src, QFRC)>; + +def : Pat<(v4f32 (fround_exact v4f64:$src)), + (COPY_TO_REGCLASS $src, QSRC)>; + +// Extract the underlying floating-point values from the +// QPX (-1.0, 1.0) boolean representation. 
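+// For example, a v4i1 value <0, 1, 1, 0> is held in a QPX register as the
+// vector <-1.0, 1.0, 1.0, -1.0>, so the pattern below only needs a
+// register-class copy, not a conversion instruction.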
+def : Pat<(v4f64 (PPCqbflt v4i1:$src)), + (COPY_TO_REGCLASS $src, QFRC)>; + +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)), + (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)), + (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)), + (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)), + (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)), + (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)), + (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)), + (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)), + (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)), + (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)), + (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)), + (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)), + (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)), + (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)), + (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)), + (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)), + (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)), + (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)), + (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)), + (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)), + (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)), + (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)), + (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)), + (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)), + (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)), + (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)), + (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, 
i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)),
+          (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)),
+          (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)),
+          (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)),
+          (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+} // end HasQPX
+
+let Predicates = [HasQPX, NoNaNsFPMath] in {
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>;
+}
+
+let Predicates = [HasQPX, NaNsFPMath] in {
+// When either of these operands is NaN, we should return the other operand.
+// QVFCMPLT/QVFCMPGT return false if either operand is NaN, which means we need
+// to explicitly or with a NaN test on the second operand.
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+                    $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+                    $FRB, $FRA)>;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
new file mode 100644
index 0000000..cc3a4d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -0,0 +1,447 @@
+//=======-- PPCInstrSPE.td - The PowerPC SPE Extension -*- tablegen -*-=======//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Signal Processing Engine extension to
+// the PowerPC instruction set.
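+// SPE treats the 64-bit GPRs of e500-class cores as vectors of two 32-bit
+// elements; the ev* definitions below carry no selection patterns and are
+// used only by the assembler (isAsmParserOnly).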
+//
+//===----------------------------------------------------------------------===//
+
+class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = [];
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-31} = xo;
+}
+
+class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> : EVXForm_1<xo, OOL, IOL, asmstr, itin> {
+  let RB = 0;
+}
+
+class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<3> crD;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = [];
+
+  let Inst{6-8}   = crD;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-31} = xo;
+}
+
+class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<21> D;
+
+  let Pattern = [];
+
+  let Inst{6-10}  = RT;
+  let Inst{20}    = D{0};
+  let Inst{19}    = D{1};
+  let Inst{18}    = D{2};
+  let Inst{17}    = D{3};
+  let Inst{16}    = D{4};
+  let Inst{15}    = D{5};
+  let Inst{14}    = D{6};
+  let Inst{13}    = D{7};
+  let Inst{12}    = D{8};
+  let Inst{11}    = D{9};
+  let Inst{21-31} = xo;
+}
+
+let Predicates = [HasSPE], isAsmParserOnly = 1 in {
+
+def EVLDD : EVXForm_D<769, (outs gprc:$RT), (ins spe8dis:$dst),
+                      "evldd $RT, $dst", IIC_VecFP>;
+def EVLDW : EVXForm_D<771, (outs gprc:$RT), (ins spe8dis:$dst),
+                      "evldw $RT, $dst", IIC_VecFP>;
+def EVLDH : EVXForm_D<773, (outs gprc:$RT), (ins spe8dis:$dst),
+                      "evldh $RT, $dst", IIC_VecFP>;
+def EVLHHESPLAT : EVXForm_D<777, (outs gprc:$RT), (ins spe2dis:$dst),
+                            "evlhhesplat $RT, $dst", IIC_VecFP>;
+def EVLHHOUSPLAT : EVXForm_D<781, (outs gprc:$RT), (ins spe2dis:$dst),
+                             "evlhhousplat $RT, $dst", IIC_VecFP>;
+def EVLHHOSSPLAT : EVXForm_D<783, (outs gprc:$RT), (ins spe2dis:$dst),
+                             "evlhhossplat $RT, $dst", IIC_VecFP>;
+def EVLWHE : EVXForm_D<785, (outs gprc:$RT), (ins spe4dis:$dst),
+                       "evlwhe $RT, $dst", IIC_VecFP>;
+def EVLWHOU : EVXForm_D<789, (outs gprc:$RT), (ins spe4dis:$dst),
+                        "evlwhou $RT, $dst", IIC_VecFP>;
+def EVLWHOS : EVXForm_D<791, (outs gprc:$RT), (ins spe4dis:$dst),
+                        "evlwhos $RT, $dst", IIC_VecFP>;
+def EVLWWSPLAT : EVXForm_D<793, (outs gprc:$RT), (ins spe4dis:$dst),
+                           "evlwwsplat $RT, $dst", IIC_VecFP>;
+def EVLWHSPLAT : EVXForm_D<797, (outs gprc:$RT), (ins spe4dis:$dst),
+                           "evlwhsplat $RT, $dst", IIC_VecFP>;
+
+def EVSTDD : EVXForm_D<801, (outs), (ins gprc:$RT, spe8dis:$dst),
+                       "evstdd $RT, $dst", IIC_VecFP>;
+def EVSTDH : EVXForm_D<805, (outs), (ins gprc:$RT, spe8dis:$dst),
+                       "evstdh $RT, $dst", IIC_VecFP>;
+def EVSTDW : EVXForm_D<803, (outs), (ins gprc:$RT, spe8dis:$dst),
+                       "evstdw $RT, $dst", IIC_VecFP>;
+def EVSTWHE : EVXForm_D<817, (outs), (ins gprc:$RT, spe4dis:$dst),
+                        "evstwhe $RT, $dst", IIC_VecFP>;
+def EVSTWHO : EVXForm_D<821, (outs), (ins gprc:$RT, spe4dis:$dst),
+                        "evstwho $RT, $dst", IIC_VecFP>;
+def EVSTWWE : EVXForm_D<825, (outs), (ins gprc:$RT, spe4dis:$dst),
+                        "evstwwe $RT, $dst", IIC_VecFP>;
+def EVSTWWO : EVXForm_D<829, (outs), (ins gprc:$RT, spe4dis:$dst),
+                        "evstwwo $RT, $dst", IIC_VecFP>;
+
+def EVMRA : EVXForm_1<1220, (outs gprc:$RT), (ins gprc:$RA),
+                      "evmra $RT, $RA", IIC_VecFP> {
+  let RB = 0;
+}
+
+def BRINC : EVXForm_1<527, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                      "brinc $RT, $RA, $RB", IIC_VecFP>;
+def EVABS : EVXForm_2<520, (outs gprc:$RT), (ins gprc:$RA),
+                      "evabs $RT, $RA", IIC_VecFP>;
+
+def
EVADDIW : EVXForm_1<514, (outs gprc:$RT), (ins gprc:$RB, u5imm:$RA),
+                    "evaddiw $RT, $RB, $RA", IIC_VecFP>;
+def EVADDSMIAAW : EVXForm_2<1225, (outs gprc:$RT), (ins gprc:$RA),
+                            "evaddsmiaaw $RT, $RA", IIC_VecFP>;
+def EVADDSSIAAW : EVXForm_2<1217, (outs gprc:$RT), (ins gprc:$RA),
+                            "evaddssiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUSIAAW : EVXForm_2<1216, (outs gprc:$RT), (ins gprc:$RA),
+                            "evaddusiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUMIAAW : EVXForm_2<1224, (outs gprc:$RT), (ins gprc:$RA),
+                            "evaddumiaaw $RT, $RA", IIC_VecFP>;
+def EVADDW : EVXForm_1<512, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                       "evaddw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVAND : EVXForm_1<529, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                      "evand $RT, $RA, $RB", IIC_VecFP>;
+def EVANDC : EVXForm_1<530, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                       "evandc $RT, $RA, $RB", IIC_VecFP>;
+
+def EVCMPEQ : EVXForm_3<564, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                        "evcmpeq $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTS : EVXForm_3<561, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                         "evcmpgts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTU : EVXForm_3<560, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                         "evcmpgtu $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTS : EVXForm_3<563, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                         "evcmplts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTU : EVXForm_3<562, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                         "evcmpltu $crD, $RA, $RB", IIC_VecFP>;
+
+def EVCNTLSW : EVXForm_2<526, (outs gprc:$RT), (ins gprc:$RA),
+                         "evcntlsw $RT, $RA", IIC_VecFP>;
+def EVCNTLZW : EVXForm_2<525, (outs gprc:$RT), (ins gprc:$RA),
+                         "evcntlzw $RT, $RA", IIC_VecFP>;
+
+def EVDIVWS : EVXForm_1<1222, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                        "evdivws $RT, $RA, $RB", IIC_VecFP>;
+def EVDIVWU : EVXForm_1<1223, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                        "evdivwu $RT, $RA, $RB", IIC_VecFP>;
+
+def EVEQV : EVXForm_1<537, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                      "eveqv $RT, $RA, $RB", IIC_VecFP>;
+
+def EVEXTSB : EVXForm_2<522, (outs gprc:$RT), (ins gprc:$RA),
+                        "evextsb $RT, $RA", IIC_VecFP>;
+def EVEXTSH : EVXForm_2<523, (outs gprc:$RT), (ins gprc:$RA),
+                        "evextsh $RT, $RA", IIC_VecFP>;
+
+def EVLDDX : EVXForm_1<768, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                       "evlddx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDWX : EVXForm_1<770, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                       "evldwx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDHX : EVXForm_1<772, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                       "evldhx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHESPLATX : EVXForm_1<776, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                             "evlhhesplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOUSPLATX : EVXForm_1<780, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                              "evlhhousplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOSSPLATX : EVXForm_1<782, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                              "evlhhossplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHEX : EVXForm_1<784, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                        "evlwhex $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOUX : EVXForm_1<788, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                         "evlwhoux $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOSX : EVXForm_1<790, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                         "evlwhosx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWWSPLATX : EVXForm_1<792, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                            "evlwwsplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHSPLATX : EVXForm_1<796, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                            "evlwhsplatx $RT, $RA, $RB", IIC_VecFP>;
+
+def EVMERGEHI : EVXForm_1<556, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                          "evmergehi $RT,
$RA, $RB", IIC_VecFP>; +def EVMERGELO : EVXForm_1<557, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmergelo $RT, $RA, $RB", IIC_VecFP>; +def EVMERGEHILO : EVXForm_1<558, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmergehilo $RT, $RA, $RB", IIC_VecFP>; +def EVMERGELOHI : EVXForm_1<559, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmergelohi $RT, $RA, $RB", IIC_VecFP>; + +def EVMHEGSMFAA : EVXForm_1<1323, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhegsmfaa $RT, $RA, $RB", IIC_VecFP>; +def EVMHEGSMFAN : EVXForm_1<1451, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhegsmfan $RT, $RA, $RB", IIC_VecFP>; +def EVMHEGSMIAA : EVXForm_1<1321, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhegsmiaa $RT, $RA, $RB", IIC_VecFP>; +def EVMHEGSMIAN : EVXForm_1<1449, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhegsmian $RT, $RA, $RB", IIC_VecFP>; +def EVMHEGUMIAA : EVXForm_1<1320, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhegumiaa $RT, $RA, $RB", IIC_VecFP>; +def EVMHEGUMIAN : EVXForm_1<1448, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhegumian $RT, $RA, $RB", IIC_VecFP>; + +def EVMHESMF : EVXForm_1<1035, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmf $RT, $RA, $RB", IIC_VecFP>; +def EVMHESMFA : EVXForm_1<1067, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmfa $RT, $RA, $RB", IIC_VecFP>; +def EVMHESMFAAW : EVXForm_1<1291, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmfaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHESMFANW : EVXForm_1<1419, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmfanw $RT, $RA, $RB", IIC_VecFP>; +def EVMHESMI : EVXForm_1<1033, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmi $RT, $RA, $RB", IIC_VecFP>; +def EVMHESMIA : EVXForm_1<1065, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmia $RT, $RA, $RB", IIC_VecFP>; +def EVMHESMIAAW : EVXForm_1<1289, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHESMIANW : EVXForm_1<1417, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhesmianw $RT, $RA, $RB", IIC_VecFP>; +def EVMHESSF : EVXForm_1<1027, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhessf $RT, $RA, $RB", IIC_VecFP>; +def EVMHESSFA : EVXForm_1<1059, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhessfa $RT, $RA, $RB", IIC_VecFP>; +def EVMHESSFAAW : EVXForm_1<1283, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhessfaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHESSFANW : EVXForm_1<1411, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhessfanw $RT, $RA, $RB", IIC_VecFP>; +def EVMHESSIAAW : EVXForm_1<1281, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhessiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHESSIANW : EVXForm_1<1409, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhessianw $RT, $RA, $RB", IIC_VecFP>; +def EVMHEUMI : EVXForm_1<1032, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmheumi $RT, $RA, $RB", IIC_VecFP>; +def EVMHEUMIA : EVXForm_1<1064, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmheumia $RT, $RA, $RB", IIC_VecFP>; +def EVMHEUMIAAW : EVXForm_1<1288, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmheumiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHEUMIANW : EVXForm_1<1416, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmheumianw $RT, $RA, $RB", IIC_VecFP>; +def EVMHEUSIAAW : EVXForm_1<1280, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmheusiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHEUSIANW : EVXForm_1<1408, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmheusianw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOGSMFAA : EVXForm_1<1327, (outs gprc:$RT), (ins 
gprc:$RA, gprc:$RB), + "evmhogsmfaa $RT, $RA, $RB", IIC_VecFP>; +def EVMHOGSMFAN : EVXForm_1<1455, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhogsmfan $RT, $RA, $RB", IIC_VecFP>; +def EVMHOGSMIAA : EVXForm_1<1325, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhogsmiaa $RT, $RA, $RB", IIC_VecFP>; +def EVMHOGSMIAN : EVXForm_1<1453, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhogsmian $RT, $RA, $RB", IIC_VecFP>; +def EVMHOGUMIAA : EVXForm_1<1324, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhogumiaa $RT, $RA, $RB", IIC_VecFP>; +def EVMHOGUMIAN : EVXForm_1<1452, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhogumian $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMF : EVXForm_1<1039, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmf $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMFA : EVXForm_1<1071, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmfa $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMFAAW : EVXForm_1<1295, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmfaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMFANW : EVXForm_1<1423, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmfanw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMI : EVXForm_1<1037, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmi $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMIA : EVXForm_1<1069, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmia $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMIAAW : EVXForm_1<1293, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSMIANW : EVXForm_1<1421, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhosmianw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSSF : EVXForm_1<1031, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhossf $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSSFA : EVXForm_1<1063, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhossfa $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSSFAAW : EVXForm_1<1287, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhossfaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSSFANW : EVXForm_1<1415, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhossfanw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSSIAAW : EVXForm_1<1285, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhossiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOSSIANW : EVXForm_1<1413, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhossianw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOUMI : EVXForm_1<1036, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhoumi $RT, $RA, $RB", IIC_VecFP>; +def EVMHOUMIA : EVXForm_1<1068, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhoumia $RT, $RA, $RB", IIC_VecFP>; +def EVMHOUMIAAW : EVXForm_1<1292, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhoumiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOUMIANW : EVXForm_1<1420, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhoumianw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOUSIAAW : EVXForm_1<1284, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhousiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMHOUSIANW : EVXForm_1<1412, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmhousianw $RT, $RA, $RB", IIC_VecFP>; + + +def EVMWHSMF : EVXForm_1<1103, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhsmf $RT, $RA, $RB", IIC_VecFP>; +def EVMWHSMFA : EVXForm_1<1135, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhsmfa $RT, $RA, $RB", IIC_VecFP>; +def EVMWHSMI : EVXForm_1<1101, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhsmi $RT, $RA, $RB", IIC_VecFP>; +def EVMWHSMIA : EVXForm_1<1133, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhsmia $RT, $RA, $RB", IIC_VecFP>; +def EVMWHSSF : EVXForm_1<1095, (outs 
gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhssf $RT, $RA, $RB", IIC_VecFP>; +def EVMWHSSFA : EVXForm_1<1127, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhssfa $RT, $RA, $RB", IIC_VecFP>; +def EVMWHUMI : EVXForm_1<1100, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhumi $RT, $RA, $RB", IIC_VecFP>; +def EVMWHUMIA : EVXForm_1<1132, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwhumia $RT, $RA, $RB", IIC_VecFP>; +def EVMWLSMIAAW : EVXForm_1<1353, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlsmiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMWLSMIANW : EVXForm_1<1481, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlsmianw $RT, $RA, $RB", IIC_VecFP>; +def EVMWLSSIAAW : EVXForm_1<1345, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlssiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMWLSSIANW : EVXForm_1<1473, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlssianw $RT, $RA, $RB", IIC_VecFP>; +def EVMWLUMI : EVXForm_1<1096, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlumi $RT, $RA, $RB", IIC_VecFP>; +def EVMWLUMIA : EVXForm_1<1128, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlumia $RT, $RA, $RB", IIC_VecFP>; +def EVMWLUMIAAW : EVXForm_1<1352, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlumiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMWLUMIANW : EVXForm_1<1480, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlumianw $RT, $RA, $RB", IIC_VecFP>; +def EVMWLUSIAAW : EVXForm_1<1344, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlusiaaw $RT, $RA, $RB", IIC_VecFP>; +def EVMWLUSIANW : EVXForm_1<1472, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwlusianw $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMF : EVXForm_1<1115, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmf $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMFA : EVXForm_1<1147, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmfa $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMFAA : EVXForm_1<1371, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmfaa $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMFAN : EVXForm_1<1499, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmfan $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMI : EVXForm_1<1113, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmi $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMIA : EVXForm_1<1145, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmia $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMIAA : EVXForm_1<1369, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmiaa $RT, $RA, $RB", IIC_VecFP>; +def EVMWSMIAN : EVXForm_1<1497, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwsmian $RT, $RA, $RB", IIC_VecFP>; +def EVMWSSF : EVXForm_1<1107, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwssf $RT, $RA, $RB", IIC_VecFP>; +def EVMWSSFA : EVXForm_1<1139, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwssfa $RT, $RA, $RB", IIC_VecFP>; +def EVMWSSFAA : EVXForm_1<1363, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwssfaa $RT, $RA, $RB", IIC_VecFP>; +def EVMWSSFAN : EVXForm_1<1491, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwssfan $RT, $RA, $RB", IIC_VecFP>; +def EVMWUMI : EVXForm_1<1112, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwumi $RT, $RA, $RB", IIC_VecFP>; +def EVMWUMIA : EVXForm_1<1144, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwumia $RT, $RA, $RB", IIC_VecFP>; +def EVMWUMIAA : EVXForm_1<1368, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwumiaa $RT, $RA, $RB", IIC_VecFP>; +def EVMWUMIAN : EVXForm_1<1496, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evmwumian $RT, $RA, $RB", IIC_VecFP>; + + +def EVNAND : EVXForm_1<542, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + 
"evnand $RT, $RA, $RB", IIC_VecFP>; + +def EVNEG : EVXForm_2<521, (outs gprc:$RT), (ins gprc:$RA), + "evneg $RT, $RA", IIC_VecFP>; + +def EVNOR : EVXForm_1<536, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evnor $RT, $RA, $RB", IIC_VecFP>; +def EVOR : EVXForm_1<535, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evor $RT, $RA, $RB", IIC_VecFP>; +def EVORC : EVXForm_1<539, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evorc $RT, $RA, $RB", IIC_VecFP>; + +def EVRLWI : EVXForm_1<554, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), + "evrlwi $RT, $RA, $RB", IIC_VecFP>; +def EVRLW : EVXForm_1<552, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evrlw $RT, $RA, $RB", IIC_VecFP>; + +def EVRNDW : EVXForm_2<524, (outs gprc:$RT), (ins gprc:$RA), + "evrndw $RT, $RA", IIC_VecFP>; + +def EVSLWI : EVXForm_1<550, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), + "evslwi $RT, $RA, $RB", IIC_VecFP>; +def EVSLW : EVXForm_1<548, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evslw $RT, $RA, $RB", IIC_VecFP>; + +def EVSPLATFI : EVXForm_2<555, (outs gprc:$RT), (ins i32imm:$RA), + "evsplatfi $RT, $RA", IIC_VecFP>; +def EVSPLATI : EVXForm_2<553, (outs gprc:$RT), (ins i32imm:$RA), + "evsplati $RT, $RA", IIC_VecFP>; + +def EVSRWIS : EVXForm_1<547, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), + "evsrwis $RT, $RA, $RB", IIC_VecFP>; +def EVSRWIU : EVXForm_1<546, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB), + "evsrwiu $RT, $RA, $RB", IIC_VecFP>; +def EVSRWS : EVXForm_1<545, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evsrws $RT, $RA, $RB", IIC_VecFP>; +def EVSRWU : EVXForm_1<544, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evsrwu $RT, $RA, $RB", IIC_VecFP>; + +def EVSTDDX : EVXForm_1<800, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), + "evstddx $RT, $RA, $RB", IIC_VecFP>; +def EVSTDHX : EVXForm_1<804, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), + "evstdhx $RT, $RA, $RB", IIC_VecFP>; +def EVSTDWX : EVXForm_1<802, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), + "evstdwx $RT, $RA, $RB", IIC_VecFP>; +def EVSTWHEX : EVXForm_1<816, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), + "evstwhex $RT, $RA, $RB", IIC_VecFP>; +def EVSTWHOX : EVXForm_1<820, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), + "evstwhox $RT, $RA, $RB", IIC_VecFP>; +def EVSTWWEX : EVXForm_1<824, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), + "evstwwex $RT, $RA, $RB", IIC_VecFP>; +def EVSTWWOX : EVXForm_1<828, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB), + "evstwwox $RT, $RA, $RB", IIC_VecFP>; + +def EVSUBFSSIAAW : EVXForm_2<1219, (outs gprc:$RT), (ins gprc:$RA), + "evsubfssiaaw $RT, $RA", IIC_VecFP>; +def EVSUBFSMIAAW : EVXForm_2<1227, (outs gprc:$RT), (ins gprc:$RA), + "evsubfsmiaaw $RT, $RA", IIC_VecFP>; +def EVSUBFUMIAAW : EVXForm_2<1226, (outs gprc:$RT), (ins gprc:$RA), + "evsubfumiaaw $RT, $RA", IIC_VecFP>; +def EVSUBFUSIAAW : EVXForm_2<1218, (outs gprc:$RT), (ins gprc:$RA), + "evsubfusiaaw $RT, $RA", IIC_VecFP>; +def EVSUBFW : EVXForm_1<516, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evsubfw $RT, $RA, $RB", IIC_VecFP>; +def EVSUBIFW : EVXForm_1<518, (outs gprc:$RT), (ins u5imm:$RA, gprc:$RB), + "evsubifw $RT, $RA, $RB", IIC_VecFP>; +def EVXOR : EVXForm_1<534, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), + "evxor $RT, $RA, $RB", IIC_VecFP>; + +} // HasSPE diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td new file mode 100644 index 0000000..df1142c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -0,0 +1,1785 @@ +//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===// +// +// 
The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the VSX extension to the PowerPC instruction set. +// +//===----------------------------------------------------------------------===// + +// *********************************** NOTE *********************************** +// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing ** +// ** which VMX and VSX instructions are lane-sensitive and which are not. ** +// ** A lane-sensitive instruction relies, implicitly or explicitly, on ** +// ** whether lanes are numbered from left to right. An instruction like ** +// ** VADDFP is not lane-sensitive, because each lane of the result vector ** +// ** relies only on the corresponding lane of the source vectors. However, ** +// ** an instruction like VMULESB is lane-sensitive, because "even" and ** +// ** "odd" lanes are different for big-endian and little-endian numbering. ** +// ** ** +// ** When adding new VMX and VSX instructions, please consider whether they ** +// ** are lane-sensitive. If so, they must be added to a switch statement ** +// ** in PPCVSXSwapRemoval::gatherVectorInstructions(). ** +// **************************************************************************** + +def PPCRegVSRCAsmOperand : AsmOperandClass { + let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber"; +} +def vsrc : RegisterOperand<VSRC> { + let ParserMatchClass = PPCRegVSRCAsmOperand; +} + +def PPCRegVSFRCAsmOperand : AsmOperandClass { + let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber"; +} +def vsfrc : RegisterOperand<VSFRC> { + let ParserMatchClass = PPCRegVSFRCAsmOperand; +} + +def PPCRegVSSRCAsmOperand : AsmOperandClass { + let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber"; +} +def vssrc : RegisterOperand<VSSRC> { + let ParserMatchClass = PPCRegVSSRCAsmOperand; +} + +// Little-endian-specific nodes. +def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ + SDTCisVT<0, v2f64>, SDTCisPtrTy<1> +]>; +def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [ + SDTCisVT<0, v2f64>, SDTCisPtrTy<1> +]>; +def SDT_PPCxxswapd : SDTypeProfile<1, 1, [ + SDTCisSameAs<0, 1> +]>; + +def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, + [SDNPHasChain, SDNPMayLoad]>; +def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, + [SDNPHasChain, SDNPMayStore]>; +def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; +def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; +def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; +def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; + +multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, + string asmstr, InstrItinClass itin, Intrinsic Int, + ValueType OutTy, ValueType InTy> { + let BaseName = asmbase in { + def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>; + let Defs = [CR6] in + def o : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, + [(set InTy:$XT, + (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>, + isDOT; + } +} + +def HasVSX : Predicate<"PPCSubTarget->hasVSX()">; +def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">; +def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">; + +let Predicates = [HasVSX] in { +let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. +let hasSideEffects = 0 in { // VSX instructions don't have side effects. +let Uses = [RM] in { + + // Load indexed instructions + let mayLoad = 1 in { + def LXSDX : XX1Form<31, 588, + (outs vsfrc:$XT), (ins memrr:$src), + "lxsdx $XT, $src", IIC_LdStLFD, + [(set f64:$XT, (load xoaddr:$src))]>; + + def LXVD2X : XX1Form<31, 844, + (outs vsrc:$XT), (ins memrr:$src), + "lxvd2x $XT, $src", IIC_LdStLFD, + [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>; + + def LXVDSX : XX1Form<31, 332, + (outs vsrc:$XT), (ins memrr:$src), + "lxvdsx $XT, $src", IIC_LdStLFD, []>; + + def LXVW4X : XX1Form<31, 780, + (outs vsrc:$XT), (ins memrr:$src), + "lxvw4x $XT, $src", IIC_LdStLFD, + [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>; + } // mayLoad + + // Store indexed instructions + let mayStore = 1 in { + def STXSDX : XX1Form<31, 716, + (outs), (ins vsfrc:$XT, memrr:$dst), + "stxsdx $XT, $dst", IIC_LdStSTFD, + [(store f64:$XT, xoaddr:$dst)]>; + + def STXVD2X : XX1Form<31, 972, + (outs), (ins vsrc:$XT, memrr:$dst), + "stxvd2x $XT, $dst", IIC_LdStSTFD, + [(store v2f64:$XT, xoaddr:$dst)]>; + + def STXVW4X : XX1Form<31, 908, + (outs), (ins vsrc:$XT, memrr:$dst), + "stxvw4x $XT, $dst", IIC_LdStSTFD, + [(store v4i32:$XT, xoaddr:$dst)]>; + + } // mayStore + + // Add/Mul Instructions + let isCommutable = 1 in { + def XSADDDP : XX3Form<60, 32, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsadddp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>; + def XSMULDP : XX3Form<60, 48, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmuldp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>; + + def XVADDDP : XX3Form<60, 96, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvadddp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>; + + def XVADDSP : XX3Form<60, 64, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvaddsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>; + + def XVMULDP : XX3Form<60, 112, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmuldp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>; + + def XVMULSP : XX3Form<60, 80, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmulsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>; + } + + // Subtract Instructions + def XSSUBDP : XX3Form<60, 40, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xssubdp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>; + + def XVSUBDP : XX3Form<60, 104, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvsubdp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>; + def XVSUBSP : XX3Form<60, 72, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvsubsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>; + + // FMA Instructions + let BaseName = "XSMADDADP" in { + let isCommutable = 1 in + def XSMADDADP : XX3Form<60, 33, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>, + RegConstraint<"$XTi = $XT">, 
NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMADDMDP : XX3Form<60, 41, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSMSUBADP" in { + let isCommutable = 1 in + def XSMSUBADP : XX3Form<60, 49, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMSUBMDP : XX3Form<60, 57, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMADDADP" in { + let isCommutable = 1 in + def XSNMADDADP : XX3Form<60, 161, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMADDMDP : XX3Form<60, 169, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMSUBADP" in { + let isCommutable = 1 in + def XSNMSUBADP : XX3Form<60, 177, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMSUBMDP : XX3Form<60, 185, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMADDADP" in { + let isCommutable = 1 in + def XVMADDADP : XX3Form<60, 97, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMADDMDP : XX3Form<60, 105, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMADDASP" in { + let isCommutable = 1 in + def XVMADDASP : XX3Form<60, 65, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMADDMSP : XX3Form<60, 73, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMSUBADP" in { + let isCommutable = 1 in + def XVMSUBADP : XX3Form<60, 113, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMSUBMDP : XX3Form<60, 121, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + 
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMSUBASP" in { + let isCommutable = 1 in + def XVMSUBASP : XX3Form<60, 81, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMSUBMSP : XX3Form<60, 89, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMADDADP" in { + let isCommutable = 1 in + def XVNMADDADP : XX3Form<60, 225, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMADDMDP : XX3Form<60, 233, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMADDASP" in { + let isCommutable = 1 in + def XVNMADDASP : XX3Form<60, 193, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMADDMSP : XX3Form<60, 201, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMSUBADP" in { + let isCommutable = 1 in + def XVNMSUBADP : XX3Form<60, 241, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMSUBMDP : XX3Form<60, 249, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMSUBASP" in { + let isCommutable = 1 in + def XVNMSUBASP : XX3Form<60, 209, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMSUBMSP : XX3Form<60, 217, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + // Division Instructions + def XSDIVDP : XX3Form<60, 56, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsdivdp $XT, $XA, $XB", IIC_FPDivD, + [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>; + def XSSQRTDP : XX2Form<60, 75, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xssqrtdp $XT, $XB", IIC_FPSqrtD, + [(set f64:$XT, (fsqrt f64:$XB))]>; + + def XSREDP : XX2Form<60, 90, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsredp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfre f64:$XB))]>; + def XSRSQRTEDP : XX2Form<60, 74, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrsqrtedp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfrsqrte f64:$XB))]>; + + def XSTDIVDP : XX3Form_1<60, 61, + (outs 
crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), + "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>; + def XSTSQRTDP : XX2Form_1<60, 106, + (outs crrc:$crD), (ins vsfrc:$XB), + "xstsqrtdp $crD, $XB", IIC_FPCompare, []>; + + def XVDIVDP : XX3Form<60, 120, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvdivdp $XT, $XA, $XB", IIC_FPDivD, + [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>; + def XVDIVSP : XX3Form<60, 88, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvdivsp $XT, $XA, $XB", IIC_FPDivS, + [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>; + + def XVSQRTDP : XX2Form<60, 203, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvsqrtdp $XT, $XB", IIC_FPSqrtD, + [(set v2f64:$XT, (fsqrt v2f64:$XB))]>; + def XVSQRTSP : XX2Form<60, 139, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvsqrtsp $XT, $XB", IIC_FPSqrtS, + [(set v4f32:$XT, (fsqrt v4f32:$XB))]>; + + def XVTDIVDP : XX3Form_1<60, 125, + (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), + "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>; + def XVTDIVSP : XX3Form_1<60, 93, + (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), + "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>; + + def XVTSQRTDP : XX2Form_1<60, 234, + (outs crrc:$crD), (ins vsrc:$XB), + "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>; + def XVTSQRTSP : XX2Form_1<60, 170, + (outs crrc:$crD), (ins vsrc:$XB), + "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>; + + def XVREDP : XX2Form<60, 218, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvredp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (PPCfre v2f64:$XB))]>; + def XVRESP : XX2Form<60, 154, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvresp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (PPCfre v4f32:$XB))]>; + + def XVRSQRTEDP : XX2Form<60, 202, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrsqrtedp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>; + def XVRSQRTESP : XX2Form<60, 138, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrsqrtesp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>; + + // Compare Instructions + def XSCMPODP : XX3Form_1<60, 43, + (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), + "xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>; + def XSCMPUDP : XX3Form_1<60, 35, + (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), + "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>; + + defm XVCMPEQDP : XX3Form_Rcr<60, 99, + "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, + int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>; + defm XVCMPEQSP : XX3Form_Rcr<60, 67, + "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, + int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>; + defm XVCMPGEDP : XX3Form_Rcr<60, 115, + "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, + int_ppc_vsx_xvcmpgedp, v2i64, v2f64>; + defm XVCMPGESP : XX3Form_Rcr<60, 83, + "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, + int_ppc_vsx_xvcmpgesp, v4i32, v4f32>; + defm XVCMPGTDP : XX3Form_Rcr<60, 107, + "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, + int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>; + defm XVCMPGTSP : XX3Form_Rcr<60, 75, + "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, + int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>; + + // Move Instructions + def XSABSDP : XX2Form<60, 345, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsabsdp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fabs f64:$XB))]>; + def XSNABSDP : XX2Form<60, 361, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsnabsdp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fneg (fabs f64:$XB)))]>; + def XSNEGDP : XX2Form<60, 377, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsnegdp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fneg f64:$XB))]>; + def XSCPSGNDP : XX3Form<60, 176, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xscpsgndp $XT, 
$XA, $XB", IIC_VecFP, + [(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>; + + def XVABSDP : XX2Form<60, 473, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvabsdp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fabs v2f64:$XB))]>; + + def XVABSSP : XX2Form<60, 409, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvabssp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fabs v4f32:$XB))]>; + + def XVCPSGNDP : XX3Form<60, 240, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcpsgndp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fcopysign v2f64:$XB, v2f64:$XA))]>; + def XVCPSGNSP : XX3Form<60, 208, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcpsgnsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>; + + def XVNABSDP : XX2Form<60, 489, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnabsdp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>; + def XVNABSSP : XX2Form<60, 425, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnabssp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>; + + def XVNEGDP : XX2Form<60, 505, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnegdp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg v2f64:$XB))]>; + def XVNEGSP : XX2Form<60, 441, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnegsp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg v4f32:$XB))]>; + + // Conversion Instructions + def XSCVDPSP : XX2Form<60, 265, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpsp $XT, $XB", IIC_VecFP, []>; + def XSCVDPSXDS : XX2Form<60, 344, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpsxds $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctidz f64:$XB))]>; + def XSCVDPSXWS : XX2Form<60, 88, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpsxws $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctiwz f64:$XB))]>; + def XSCVDPUXDS : XX2Form<60, 328, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpuxds $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctiduz f64:$XB))]>; + def XSCVDPUXWS : XX2Form<60, 72, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpuxws $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctiwuz f64:$XB))]>; + def XSCVSPDP : XX2Form<60, 329, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvspdp $XT, $XB", IIC_VecFP, []>; + def XSCVSXDDP : XX2Form<60, 376, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvsxddp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfcfid f64:$XB))]>; + def XSCVUXDDP : XX2Form<60, 360, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvuxddp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfcfidu f64:$XB))]>; + + def XVCVDPSP : XX2Form<60, 393, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpsp $XT, $XB", IIC_VecFP, []>; + def XVCVDPSXDS : XX2Form<60, 472, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpsxds $XT, $XB", IIC_VecFP, + [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>; + def XVCVDPSXWS : XX2Form<60, 216, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpsxws $XT, $XB", IIC_VecFP, []>; + def XVCVDPUXDS : XX2Form<60, 456, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpuxds $XT, $XB", IIC_VecFP, + [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>; + def XVCVDPUXWS : XX2Form<60, 200, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpuxws $XT, $XB", IIC_VecFP, []>; + + def XVCVSPDP : XX2Form<60, 457, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspdp $XT, $XB", IIC_VecFP, []>; + def XVCVSPSXDS : XX2Form<60, 408, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspsxds $XT, $XB", IIC_VecFP, []>; + def XVCVSPSXWS : XX2Form<60, 152, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspsxws $XT, $XB", IIC_VecFP, []>; + def XVCVSPUXDS : XX2Form<60, 392, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspuxds $XT, $XB", IIC_VecFP, []>; + def XVCVSPUXWS : 
XX2Form<60, 136, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspuxws $XT, $XB", IIC_VecFP, []>; + def XVCVSXDDP : XX2Form<60, 504, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxddp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>; + def XVCVSXDSP : XX2Form<60, 440, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxdsp $XT, $XB", IIC_VecFP, []>; + def XVCVSXWDP : XX2Form<60, 248, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxwdp $XT, $XB", IIC_VecFP, []>; + def XVCVSXWSP : XX2Form<60, 184, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxwsp $XT, $XB", IIC_VecFP, []>; + def XVCVUXDDP : XX2Form<60, 488, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxddp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>; + def XVCVUXDSP : XX2Form<60, 424, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxdsp $XT, $XB", IIC_VecFP, []>; + def XVCVUXWDP : XX2Form<60, 232, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxwdp $XT, $XB", IIC_VecFP, []>; + def XVCVUXWSP : XX2Form<60, 168, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxwsp $XT, $XB", IIC_VecFP, []>; + + // Rounding Instructions + def XSRDPI : XX2Form<60, 73, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpi $XT, $XB", IIC_VecFP, + [(set f64:$XT, (frnd f64:$XB))]>; + def XSRDPIC : XX2Form<60, 107, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpic $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fnearbyint f64:$XB))]>; + def XSRDPIM : XX2Form<60, 121, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpim $XT, $XB", IIC_VecFP, + [(set f64:$XT, (ffloor f64:$XB))]>; + def XSRDPIP : XX2Form<60, 105, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpip $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fceil f64:$XB))]>; + def XSRDPIZ : XX2Form<60, 89, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpiz $XT, $XB", IIC_VecFP, + [(set f64:$XT, (ftrunc f64:$XB))]>; + + def XVRDPI : XX2Form<60, 201, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpi $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (frnd v2f64:$XB))]>; + def XVRDPIC : XX2Form<60, 235, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpic $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; + def XVRDPIM : XX2Form<60, 249, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpim $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (ffloor v2f64:$XB))]>; + def XVRDPIP : XX2Form<60, 233, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpip $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fceil v2f64:$XB))]>; + def XVRDPIZ : XX2Form<60, 217, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpiz $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (ftrunc v2f64:$XB))]>; + + def XVRSPI : XX2Form<60, 137, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspi $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (frnd v4f32:$XB))]>; + def XVRSPIC : XX2Form<60, 171, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspic $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>; + def XVRSPIM : XX2Form<60, 185, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspim $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (ffloor v4f32:$XB))]>; + def XVRSPIP : XX2Form<60, 169, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspip $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fceil v4f32:$XB))]>; + def XVRSPIZ : XX2Form<60, 153, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspiz $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (ftrunc v4f32:$XB))]>; + + // Max/Min Instructions + let isCommutable = 1 in { + def XSMAXDP : XX3Form<60, 160, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmaxdp $XT, $XA, $XB", IIC_VecFP, + [(set vsfrc:$XT, + (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>; + def XSMINDP : XX3Form<60, 168, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmindp 
$XT, $XA, $XB", IIC_VecFP, + [(set vsfrc:$XT, + (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>; + + def XVMAXDP : XX3Form<60, 224, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxdp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>; + def XVMINDP : XX3Form<60, 232, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmindp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>; + + def XVMAXSP : XX3Form<60, 192, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxsp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>; + def XVMINSP : XX3Form<60, 200, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvminsp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>; + } // isCommutable +} // Uses = [RM] + + // Logical Instructions + let isCommutable = 1 in + def XXLAND : XX3Form<60, 130, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxland $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>; + def XXLANDC : XX3Form<60, 138, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlandc $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (and v4i32:$XA, + (vnot_ppc v4i32:$XB)))]>; + let isCommutable = 1 in { + def XXLNOR : XX3Form<60, 162, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlnor $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA, + v4i32:$XB)))]>; + def XXLOR : XX3Form<60, 146, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlor $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>; + let isCodeGenOnly = 1 in + def XXLORf: XX3Form<60, 146, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xxlor $XT, $XA, $XB", IIC_VecGeneral, []>; + def XXLXOR : XX3Form<60, 154, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlxor $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>; + } // isCommutable + + // Permutation Instructions + def XXMRGHW : XX3Form<60, 18, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>; + def XXMRGLW : XX3Form<60, 50, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>; + + def XXPERMDI : XX3Form_2<60, 10, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), + "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>; + def XXSEL : XX4Form<60, 3, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC), + "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>; + + def XXSLDWI : XX3Form_2<60, 2, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW), + "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, []>; + def XXSPLTW : XX2Form_2<60, 164, + (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM), + "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>; +} // hasSideEffects + +// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after +// instruction selection into a branch sequence. +let usesCustomInserter = 1, // Expanded after instruction selection. 
+ PPC970_Single = 1 in { + + def SELECT_CC_VSRC: Pseudo<(outs vsrc:$dst), + (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC), + "#SELECT_CC_VSRC", + []>; + def SELECT_VSRC: Pseudo<(outs vsrc:$dst), + (ins crbitrc:$cond, vsrc:$T, vsrc:$F), + "#SELECT_VSRC", + [(set v2f64:$dst, + (select i1:$cond, v2f64:$T, v2f64:$F))]>; + def SELECT_CC_VSFRC: Pseudo<(outs f8rc:$dst), + (ins crrc:$cond, f8rc:$T, f8rc:$F, + i32imm:$BROPC), "#SELECT_CC_VSFRC", + []>; + def SELECT_VSFRC: Pseudo<(outs f8rc:$dst), + (ins crbitrc:$cond, f8rc:$T, f8rc:$F), + "#SELECT_VSFRC", + [(set f64:$dst, + (select i1:$cond, f64:$T, f64:$F))]>; + def SELECT_CC_VSSRC: Pseudo<(outs f4rc:$dst), + (ins crrc:$cond, f4rc:$T, f4rc:$F, + i32imm:$BROPC), "#SELECT_CC_VSSRC", + []>; + def SELECT_VSSRC: Pseudo<(outs f4rc:$dst), + (ins crbitrc:$cond, f4rc:$T, f4rc:$F), + "#SELECT_VSSRC", + [(set f32:$dst, + (select i1:$cond, f32:$T, f32:$F))]>; +} // usesCustomInserter +} // AddedComplexity + +def : InstAlias<"xvmovdp $XT, $XB", + (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>; +def : InstAlias<"xvmovsp $XT, $XB", + (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>; + +def : InstAlias<"xxspltd $XT, $XB, 0", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>; +def : InstAlias<"xxspltd $XT, $XB, 1", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>; +def : InstAlias<"xxmrghd $XT, $XA, $XB", + (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>; +def : InstAlias<"xxmrgld $XT, $XA, $XB", + (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>; +def : InstAlias<"xxswapd $XT, $XB", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>; + +let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. + +let Predicates = [IsBigEndian] in { +def : Pat<(v2f64 (scalar_to_vector f64:$A)), + (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>; + +def : Pat<(f64 (extractelt v2f64:$S, 0)), + (f64 (EXTRACT_SUBREG $S, sub_64))>; +def : Pat<(f64 (extractelt v2f64:$S, 1)), + (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; +} + +let Predicates = [IsLittleEndian] in { +def : Pat<(v2f64 (scalar_to_vector f64:$A)), + (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64), + (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>; + +def : Pat<(f64 (extractelt v2f64:$S, 0)), + (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; +def : Pat<(f64 (extractelt v2f64:$S, 1)), + (f64 (EXTRACT_SUBREG $S, sub_64))>; +} + +// Additional fnmsub patterns: -a*c + b == -(a*c - b) +def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), + (XSNMSUBADP $B, $C, $A)>; +def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), + (XSNMSUBADP $B, $C, $A)>; + +def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B), + (XVNMSUBADP $B, $C, $A)>; +def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B), + (XVNMSUBADP $B, $C, $A)>; + +def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), + (XVNMSUBASP $B, $C, $A)>; +def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), + (XVNMSUBASP $B, $C, $A)>; + +def : Pat<(v2f64 (bitconvert v4f32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2f64 (bitconvert v4i32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2f64 (bitconvert v8i16:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2f64 (bitconvert v16i8:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; + +def : Pat<(v4f32 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v4i32 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v8i16 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v16i8 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; + +def : Pat<(v2i64 (bitconvert v4f32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : 
Pat<(v2i64 (bitconvert v4i32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2i64 (bitconvert v8i16:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2i64 (bitconvert v16i8:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; + +def : Pat<(v4f32 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v4i32 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v8i16 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v16i8 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; + +def : Pat<(v2f64 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v2i64 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; + +def : Pat<(v2f64 (bitconvert v1i128:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v1i128 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; + +// sign extension patterns +// To extend "in place" from v2i32 to v2i64, we have input data like: +// | undef | i32 | undef | i32 | +// but xvcvsxwdp expects the input in big-Endian format: +// | i32 | undef | i32 | undef | +// so we need to shift everything to the left by one i32 (word) before +// the conversion. +def : Pat<(sext_inreg v2i64:$C, v2i32), + (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>; +def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))), + (XVCVSXWDP (XXSLDWI $C, $C, 1))>; + +// Loads. +def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; +def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; +def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>; +def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>; + +// Stores. +def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), + (STXVD2X $rS, xoaddr:$dst)>; +def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; +def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), + (STXVW4X $rS, xoaddr:$dst)>; +def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + +// Permutes. +def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>; +def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>; +def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>; +def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>; + +// Selects. 
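The selectcc patterns that follow lower a compare of two i1 (CR-bit) operands directly into a single CR logical operation (CRANDC, CRORC, CREQV, CRXOR) feeding the SELECT_VSRC/SELECT_VSFRC pseudos. The operand order encodes signedness: a set i1 bit reads as -1 when interpreted as a signed value, so the signed orderings (SETLT, SETLE, ...) use the mirror-image operand order of their unsigned counterparts, and the GE/GT rows follow from LT/LE by De Morgan. Below is a minimal standalone C++ sketch (an editorial illustration, not part of this .td file) that checks these identities over all four bit combinations:

    #include <cassert>

    int main() {
      for (unsigned a = 0; a <= 1; ++a) {
        for (unsigned b = 0; b <= 1; ++b) {
          int sa = -static_cast<int>(a);      // signed view of i1: 1 -> -1
          int sb = -static_cast<int>(b);
          unsigned crandc_ab = a & (b ^ 1u);  // CRANDC lhs, rhs  (lhs & ~rhs)
          unsigned crandc_ba = b & (a ^ 1u);  // CRANDC rhs, lhs
          unsigned crorc_ab  = a | (b ^ 1u);  // CRORC  lhs, rhs  (lhs | ~rhs)
          unsigned crorc_ba  = b | (a ^ 1u);  // CRORC  rhs, lhs
          assert((sa <  sb) == (crandc_ab != 0));      // SETLT
          assert((a  <  b)  == (crandc_ba != 0));      // SETULT
          assert((sa <= sb) == (crorc_ab  != 0));      // SETLE
          assert((a  <= b)  == (crorc_ba  != 0));      // SETULE
          assert((a  == b)  == (((a ^ b) ^ 1u) != 0)); // SETEQ -> CREQV
          assert((a  != b)  == ((a ^ b) != 0));        // SETNE -> CRXOR
        }
      }
      return 0;
    }
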
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)), + (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULT)), + (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)), + (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULE)), + (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)), + (SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)), + (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGE)), + (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)), + (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGT)), + (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)), + (SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)), + (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)), + (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)), + (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)), + (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)), + (SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)), + (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)), + (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)), + (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)), + (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)), + (SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +// Divides. +def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B), + (XVDIVSP $A, $B)>; +def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B), + (XVDIVDP $A, $B)>; + +// Reciprocal estimate +def : Pat<(int_ppc_vsx_xvresp v4f32:$A), + (XVRESP $A)>; +def : Pat<(int_ppc_vsx_xvredp v2f64:$A), + (XVREDP $A)>; + +// Recip. square root estimate +def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A), + (XVRSQRTESP $A)>; +def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A), + (XVRSQRTEDP $A)>; + +} // AddedComplexity +} // HasVSX + +// The following VSX instructions were introduced in Power ISA 2.07 +/* FIXME: if the operands are v2i64, these patterns will not match. + we should define new patterns or otherwise match the same patterns + when the elements are larger than i32. 
+*/ +def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">; +def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">; +let Predicates = [HasP8Vector] in { +let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. + let isCommutable = 1 in { + def XXLEQV : XX3Form<60, 186, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxleqv $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (vnot_ppc (xor v4i32:$XA, v4i32:$XB)))]>; + def XXLNAND : XX3Form<60, 178, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlnand $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA, + v4i32:$XB)))]>; + } // isCommutable + + def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B), + (XXLEQV $A, $B)>; + + def XXLORC : XX3Form<60, 170, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlorc $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>; + + // VSX scalar loads introduced in ISA 2.07 + let mayLoad = 1 in { + def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src), + "lxsspx $XT, $src", IIC_LdStLFD, + [(set f32:$XT, (load xoaddr:$src))]>; + def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src), + "lxsiwax $XT, $src", IIC_LdStLFD, + [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; + def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src), + "lxsiwzx $XT, $src", IIC_LdStLFD, + [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; + } // mayLoad + + // VSX scalar stores introduced in ISA 2.07 + let mayStore = 1 in { + def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst), + "stxsspx $XT, $dst", IIC_LdStSTFD, + [(store f32:$XT, xoaddr:$dst)]>; + def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst), + "stxsiwx $XT, $dst", IIC_LdStSTFD, + [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; + } // mayStore + + def : Pat<(f64 (extloadf32 xoaddr:$src)), + (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>; + def : Pat<(f64 (fextend f32:$src)), + (COPY_TO_REGCLASS $src, VSFRC)>; + + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)), + (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)), + (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)), + (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)), + (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)), + (SELECT_VSSRC (CREQV $lhs, $rhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)), + (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)), + (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)), + (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)), + (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>; + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)), + (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>; + + // VSX Elementary Scalar FP arithmetic (SP) + let isCommutable = 1 in { + def XSADDSP : XX3Form<60, 0, + (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), + "xsaddsp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fadd f32:$XA, f32:$XB))]>; + def XSMULSP : XX3Form<60, 16, + (outs 
vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), + "xsmulsp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fmul f32:$XA, f32:$XB))]>; + } // isCommutable + + def XSDIVSP : XX3Form<60, 24, + (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), + "xsdivsp $XT, $XA, $XB", IIC_FPDivS, + [(set f32:$XT, (fdiv f32:$XA, f32:$XB))]>; + def XSRESP : XX2Form<60, 26, + (outs vssrc:$XT), (ins vssrc:$XB), + "xsresp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfre f32:$XB))]>; + def XSSQRTSP : XX2Form<60, 11, + (outs vssrc:$XT), (ins vssrc:$XB), + "xssqrtsp $XT, $XB", IIC_FPSqrtS, + [(set f32:$XT, (fsqrt f32:$XB))]>; + def XSRSQRTESP : XX2Form<60, 10, + (outs vssrc:$XT), (ins vssrc:$XB), + "xsrsqrtesp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfrsqrte f32:$XB))]>; + def XSSUBSP : XX3Form<60, 8, + (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), + "xssubsp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>; + + // FMA Instructions + let BaseName = "XSMADDASP" in { + let isCommutable = 1 in + def XSMADDASP : XX3Form<60, 1, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMADDMSP : XX3Form<60, 9, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSMSUBASP" in { + let isCommutable = 1 in + def XSMSUBASP : XX3Form<60, 17, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fma f32:$XA, f32:$XB, + (fneg f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMSUBMSP : XX3Form<60, 25, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMADDASP" in { + let isCommutable = 1 in + def XSNMADDASP : XX3Form<60, 129, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB, + f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMADDMSP : XX3Form<60, 137, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMSUBASP" in { + let isCommutable = 1 in + def XSNMSUBASP : XX3Form<60, 145, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB, + (fneg f32:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMSUBMSP : XX3Form<60, 153, + (outs vssrc:$XT), + (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), + "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + // Single Precision Conversions (FP <-> INT) + def XSCVSXDSP : XX2Form<60, 312, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvsxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfids f64:$XB))]>; + def XSCVUXDSP : XX2Form<60, 296, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvuxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfidus f64:$XB))]>; + + // Conversions between 
vector and scalar single precision + def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB), + "xscvdpspn $XT, $XB", IIC_VecFP, []>; + def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB), + "xscvspdpn $XT, $XB", IIC_VecFP, []>; + +} // AddedComplexity = 400 +} // HasP8Vector + +let Predicates = [HasDirectMove, HasVSX] in { + // VSX direct move instructions + def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT), + "mfvsrd $rA, $XT", IIC_VecGeneral, + [(set i64:$rA, (PPCmfvsr f64:$XT))]>, + Requires<[In64BitMode]>; + def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT), + "mfvsrwz $rA, $XT", IIC_VecGeneral, + [(set i32:$rA, (PPCmfvsr f64:$XT))]>; + def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA), + "mtvsrd $XT, $rA", IIC_VecGeneral, + [(set f64:$XT, (PPCmtvsra i64:$rA))]>, + Requires<[In64BitMode]>; + def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA), + "mtvsrwa $XT, $rA", IIC_VecGeneral, + [(set f64:$XT, (PPCmtvsra i32:$rA))]>; + def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA), + "mtvsrwz $XT, $rA", IIC_VecGeneral, + [(set f64:$XT, (PPCmtvsrz i32:$rA))]>; +} // HasDirectMove, HasVSX + +/* Direct moves of various widths from GPR's into VSR's. Each move lines + the value up into element 0 (both BE and LE). Namely, entities smaller than + a doubleword are shifted left and moved for BE. For LE, they're moved, then + swapped to go into the least significant element of the VSR. +*/ +def MovesToVSR { + dag BE_BYTE_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7)); + dag BE_HALF_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15)); + dag BE_WORD_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31)); + dag BE_DWORD_0 = (MTVSRD $A); + + dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32)); + dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + LE_MTVSRW, sub_64)); + dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2); + dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + BE_DWORD_0, sub_64)); + dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2); +} + +/* Patterns for extracting elements out of vectors. Integer elements are + extracted using direct move operations. Patterns for extracting elements + whose indices are not available at compile time are also provided with + various _VARIABLE_ patterns. + The numbering for the DAG's is for LE, but when used on BE, the correct + LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13). 
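+   In general, for a vector of N elements, BE element i names the same lane
+   as LE element N-1-i; for v16i8, BE element 13 is LE_BYTE_2, which is why
+   the fixed-index big-endian extraction patterns further down simply index
+   the LE_* dags in reverse order.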
+*/ +def VectorExtractions { + // Doubleword extraction + dag LE_DWORD_0 = + (MFVSRD + (EXTRACT_SUBREG + (XXPERMDI (COPY_TO_REGCLASS $S, VSRC), + (COPY_TO_REGCLASS $S, VSRC), 2), sub_64)); + dag LE_DWORD_1 = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + + // Word extraction + dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 2), sub_64)); + dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64)); + dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64)); + + // Halfword extraction + dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32)); + dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32)); + dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32)); + dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32)); + dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32)); + dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32)); + dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32)); + dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32)); + + // Byte extraction + dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32)); + dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32)); + dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32)); + dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32)); + dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32)); + dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32)); + dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32)); + dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32)); + dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32)); + dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32)); + dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32)); + dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32)); + dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32)); + dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32)); + dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32)); + dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32)); + + /* Variable element number (BE and LE patterns must be specified separately) + This is a rather involved process. + + Conceptually, this is how the move is accomplished: + 1. Identify which doubleword contains the element + 2. Shift in the VMX register so that the correct doubleword is correctly + lined up for the MFVSRD + 3. Perform the move so that the element (along with some extra stuff) + is in the GPR + 4. Right shift within the GPR so that the element is right-justified + + Of course, the index is an element number which has a different meaning + on LE/BE so the patterns have to be specified separately. + + Note: The final result will be the element right-justified with high + order bits being arbitrarily defined (namely, whatever was in the + vector register to the left of the value originally). + */ + + /* LE variable byte + Number 1. 
above: + - For elements 0-7, we shift left by 8 bytes since they're on the right + - For elements 8-15, we need not shift (shift left by zero bytes) + This is accomplished by inverting the bits of the index and AND-ing + with 0x8 (i.e. clearing all bits of the index and inverting bit 60). + */ + dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-7 (8-15 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 8 as we need to shift right by the number of bits, not bytes + - Shift right in the GPR by the calculated value + */ + dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60), + sub_32); + dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT), + sub_32); + + /* LE variable halfword + Number 1. above: + - For elements 0-3, we shift left by 8 since they're on the right + - For elements 4-7, we need not shift (shift left by zero bytes) + Similarly to the byte pattern, we invert the bits of the index, but we + AND with 0x4 (i.e. clear all bits of the index and invert bit 61). + Of course, the shift is still by 8 bytes, so we must multiply by 2. + */ + dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-3 (4-7 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 16 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59), + sub_32); + dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT), + sub_32); + + /* LE variable word + Number 1. above: + - For elements 0-1, we shift left by 8 since they're on the right + - For elements 2-3, we need not shift + */ + dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-1 (2-3 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 32 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58), + sub_32); + dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT), + sub_32); + + /* LE variable doubleword + Number 1. 
above: + - For element 0, we shift left by 8 since it's on the right + - For element 1, we need not shift + */ + dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + // - Number 4. is not needed for the doubleword as the value is 64-bits + dag LE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* LE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61)); + dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC); + dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE); + + /* LE variable double + Same as the LE doubleword except there is no move. + */ + dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + LE_VDWORD_PERM_VEC); + dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC); + + /* BE variable byte + The algorithm here is the same as the LE variable byte except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x8 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-7 + */ + dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8)); + dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC); + dag BE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)), + sub_64)); + dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60), + sub_32); + dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT), + sub_32); + + /* BE variable halfword + The algorithm here is the same as the LE variable halfword except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x4 and multiply by 2 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-3 + */ + dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62)); + dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC); + dag BE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)), + sub_64)); + dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59), + sub_32); + dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT), + sub_32); + + /* BE variable word + The algorithm is the same as the LE variable word except: + - The shift in the VMX register happens for opposite element numbers + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-1 + */ + dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61)); + dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC); + dag BE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)), + sub_64)); + dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58), + sub_32); + dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT), + sub_32); + + /* BE variable doubleword + 
Same as the LE doubleword except we shift in the VMX register for opposite + element indices. + */ + dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60)); + dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* BE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61)); + dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC); + dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE); + + /* BE variable double + Same as the BE doubleword except there is no move. + */ + dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); +} + +// v4f32 scalar <-> vector conversions (BE) +let Predicates = [IsBigEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XSCVDPSPN $A))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.BE_VARIABLE_FLOAT)>; +} // IsBigEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsBigEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>; + +let Predicates = [IsBigEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (BE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 
13)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_1)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (BE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.BE_VARIABLE_DWORD)>; +} // IsBigEndian, HasDirectMove + +// v4f32 scalar <-> vector conversions (LE) +let Predicates = [IsLittleEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.LE_VARIABLE_FLOAT)>; +} // IsLittleEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsLittleEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>; + +let Predicates = [IsLittleEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (LE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 MovesToVSR.LE_WORD_0)>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 MovesToVSR.LE_DWORD_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_1)>; + def 
: Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (LE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.LE_VARIABLE_DWORD)>; +} // IsLittleEndian, HasDirectMove + +let Predicates = [HasDirectMove, HasVSX] in { +// bitconvert f32 -> i32 +// (convert to 32-bit fp single, shift right 1 word, move to GPR) +def : Pat<(i32 (bitconvert f32:$S)), + (i32 (MFVSRWZ (EXTRACT_SUBREG + (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3), + sub_64)))>; +// bitconvert i32 -> f32 +// (move to FPR, shift left 1 word, convert to 64-bit fp single) +def : Pat<(f32 (bitconvert i32:$A)), + (f32 (XSCVSPDPN + (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>; + +// bitconvert f64 -> i64 +// (move to GPR, nothing else needed) +def : 
Pat<(i64 (bitconvert f64:$S)), + (i64 (MFVSRD $S))>; + +// bitconvert i64 -> f64 +// (move to FPR, nothing else needed) +def : Pat<(f64 (bitconvert i64:$S)), + (f64 (MTVSRD $S))>; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp new file mode 100644 index 0000000..e3a35d5 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp @@ -0,0 +1,233 @@ +//===-------- PPCLoopDataPrefetch.cpp - Loop Data Prefetching Pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a Loop Data Prefetching Pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ppc-loop-data-prefetch" +#include "PPC.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +using namespace llvm; + +// Prefetching of write addresses is off by default; it can be enabled with +// the option below. +static cl::opt<bool> +PrefetchWrites("ppc-loop-prefetch-writes", cl::Hidden, cl::init(false), + cl::desc("Prefetch write addresses")); + +// This seems like a reasonable default for the BG/Q (this pass is enabled, by +// default, only on the BG/Q). +static cl::opt<unsigned> +PrefDist("ppc-loop-prefetch-distance", cl::Hidden, cl::init(300), + cl::desc("The loop prefetch distance")); + +static cl::opt<unsigned> +CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64), + cl::desc("The loop prefetch cache line size")); + +namespace llvm { + void initializePPCLoopDataPrefetchPass(PassRegistry&); +} + +namespace { + + class PPCLoopDataPrefetch : public FunctionPass { + public: + static char ID; // Pass ID, replacement for typeid + PPCLoopDataPrefetch() : FunctionPass(ID) { + initializePPCLoopDataPrefetchPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing).
+ // AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + bool runOnFunction(Function &F) override; + bool runOnLoop(Loop *L); + + private: + AssumptionCache *AC; + LoopInfo *LI; + ScalarEvolution *SE; + const TargetTransformInfo *TTI; + const DataLayout *DL; + }; +} + +char PPCLoopDataPrefetch::ID = 0; +INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", + "PPC Loop Data Prefetch", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", + "PPC Loop Data Prefetch", false, false) + +FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPrefetch(); } + +bool PPCLoopDataPrefetch::runOnFunction(Function &F) { + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + DL = &F.getParent()->getDataLayout(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + bool MadeChange = false; + + for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) + for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) + MadeChange |= runOnLoop(*L); + + return MadeChange; +} + +bool PPCLoopDataPrefetch::runOnLoop(Loop *L) { + bool MadeChange = false; + + // Only prefetch in the inner-most loop + if (!L->empty()) + return MadeChange; + + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + // Calculate the number of iterations ahead to prefetch + CodeMetrics Metrics; + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + + // If the loop already has prefetches, then assume that the user knows + // what he or she is doing and don't add any more. + for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); + J != JE; ++J) + if (CallInst *CI = dyn_cast<CallInst>(J)) + if (Function *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::prefetch) + return MadeChange; + + Metrics.analyzeBasicBlock(*I, *TTI, EphValues); + } + unsigned LoopSize = Metrics.NumInsts; + if (!LoopSize) + LoopSize = 1; + + unsigned ItersAhead = PrefDist/LoopSize; + if (!ItersAhead) + ItersAhead = 1; + + SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads; + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); + J != JE; ++J) { + Value *PtrValue; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) { + if (!PrefetchWrites) continue; + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (PtrAddrSpace) + continue; + + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec) + continue; + + // We don't want to double prefetch individual cache lines. If this load + // is known to be within one cache line of some other load that has + // already been prefetched, then don't prefetch this one as well. 
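+ // Worked example (illustrative numbers, not taken from this code): with
+ // the default PrefDist of 300 and a 25-instruction loop body, ItersAhead
+ // is 300/25 = 12, so a load of A[i] with a 4-byte stride is paired with a
+ // prefetch of A[i+12]'s address. And with the default 64-byte
+ // CacheLineSize, loads of A[i] and A[i+4] for i32 A (16 bytes apart) share
+ // a line, so only the first of them gets a prefetch. The call emitted
+ // below produces IR of roughly this shape (rw=0 for a load, locality=3,
+ // data cache=1):
+ //   call void @llvm.prefetch(i8* %prefaddr, i32 0, i32 3, i32 1)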
+ bool DupPref = false; + for (SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, + 16>::iterator K = PrefLoads.begin(), KE = PrefLoads.end(); + K != KE; ++K) { + const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, K->second); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast<SCEVConstant>(PtrDiff)) { + int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); + if (PD < (int64_t) CacheLineSize) { + DupPref = true; + break; + } + } + } + if (DupPref) + continue; + + const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr( + SE->getConstant(LSCEVAddRec->getType(), ItersAhead), + LSCEVAddRec->getStepRecurrence(*SE))); + if (!isSafeToExpand(NextLSCEV, *SE)) + continue; + + PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec)); + + Type *I8Ptr = Type::getInt8PtrTy((*I)->getContext(), PtrAddrSpace); + SCEVExpander SCEVE(*SE, J->getModule()->getDataLayout(), "prefaddr"); + Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI); + + IRBuilder<> Builder(MemI); + Module *M = (*I)->getParent()->getParent(); + Type *I32 = Type::getInt32Ty((*I)->getContext()); + Value *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch); + Builder.CreateCall( + PrefetchFunc, + {PrefPtrValue, + ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1), + ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + + MadeChange = true; + } + } + + return MadeChange; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp new file mode 100644 index 0000000..5e18826 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -0,0 +1,437 @@ +//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to prepare loops for pre-increment addressing +// modes. Additional PHIs are created for loop induction variables used by +// load/store instructions so that the pre-increment forms can be used. 
+// Generically, this means transforming loops like this: +// for (int i = 0; i < n; ++i) +// array[i] = c; +// to look like this: +// T *p = &array[-1]; +// for (int i = 0; i < n; ++i) +// *++p = c; +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ppc-loop-preinc-prep" +#include "PPC.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +using namespace llvm; + +// By default, we limit this to creating 16 PHIs (which is a little over half +// of the allocatable register set). +static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars", + cl::Hidden, cl::init(16), + cl::desc("Potential PHI threshold for PPC preinc loop prep")); + +namespace llvm { + void initializePPCLoopPreIncPrepPass(PassRegistry&); +} + +namespace { + + class PPCLoopPreIncPrep : public FunctionPass { + public: + static char ID; // Pass ID, replacement for typeid + PPCLoopPreIncPrep() : FunctionPass(ID), TM(nullptr) { + initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); + } + PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + } + + bool runOnFunction(Function &F) override; + + bool runOnLoop(Loop *L); + void simplifyLoopLatch(Loop *L); + bool rotateLoop(Loop *L); + + private: + PPCTargetMachine *TM; + DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; + bool PreserveLCSSA; + }; +} + +char PPCLoopPreIncPrep::ID = 0; +static const char *name = "Prepare loop for pre-inc.
addressing modes"; +INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) + +FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { + return new PPCLoopPreIncPrep(TM); +} + +namespace { + struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} + + const SCEVConstant *Offset; + Instruction *Instr; + }; + + struct Bucket { + Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), + Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector<BucketElement, 16> Elements; + }; +} + +static bool IsPtrInBounds(Value *BasePtr) { + Value *StrippedBasePtr = BasePtr; + while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr)) + StrippedBasePtr = BC->getOperand(0); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr)) + return GEP->isInBounds(); + + return false; +} + +static Value *GetPointerOperand(Value *MemI) { + if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) { + return LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) { + return SMemI->getPointerOperand(); + } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch) + return IMemI->getArgOperand(0); + } + + return 0; +} + +bool PPCLoopPreIncPrep::runOnFunction(Function &F) { + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + + bool MadeChange = false; + + for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) + for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) + MadeChange |= runOnLoop(*L); + + return MadeChange; +} + +bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { + bool MadeChange = false; + + // Only prep. the inner-most loop + if (!L->empty()) + return MadeChange; + + DEBUG(dbgs() << "PIP: Examining: " << *L << "\n"); + + BasicBlock *Header = L->getHeader(); + + const PPCSubtarget *ST = + TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr; + + unsigned HeaderLoopPredCount = + std::distance(pred_begin(Header), pred_end(Header)); + + // Collect buckets of comparable addresses used by loads and stores. + SmallVector<Bucket, 16> Buckets; + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); + J != JE; ++J) { + Value *PtrValue; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) { + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(J)) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { + MemI = IMemI; + PtrValue = IMemI->getArgOperand(0); + } else continue; + } else continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (PtrAddrSpace) + continue; + + // There are no update forms for Altivec vector load/stores. 
+ if (ST && ST->hasAltivec() && + PtrValue->getType()->getPointerElementType()->isVectorTy()) + continue; + + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); + if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) { + if (LARSCEV->getLoop() != L) + continue; + } else { + continue; + } + + bool FoundBucket = false; + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; + } + } + + if (!FoundBucket) { + if (Buckets.size() == MaxVars) + return MadeChange; + Buckets.push_back(Bucket(LSCEV, MemI)); + } + } + } + + if (Buckets.empty()) + return MadeChange; + + BasicBlock *LoopPredecessor = L->getLoopPredecessor(); + // If there is no loop predecessor, or the loop predecessor's terminator + // returns a value (which might contribute to determining the loop's + // iteration space), insert a new preheader for the loop. + if (!LoopPredecessor || + !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { + LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); + if (LoopPredecessor) + MadeChange = true; + } + if (!LoopPredecessor) + return MadeChange; + + DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n"); + + SmallSet<BasicBlock *, 16> BBChanged; + for (unsigned i = 0, e = Buckets.size(); i != e; ++i) { + // The base address of each bucket is transformed into a phi and the others + // are rewritten as offsets of that variable. + + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. + for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do. + if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. + if (!Buckets[i].Elements[j].Offset || + Buckets[i].Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = Buckets[i].Elements[j].Offset; + Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); + for (auto &E : Buckets[i].Elements) { + if (E.Offset) + E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); + } + + std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); + break; + } + + const SCEVAddRecExpr *BasePtrSCEV = + cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV); + if (!BasePtrSCEV->isAffine()) + continue; + + DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); + assert(BasePtrSCEV->getLoop() == L && + "AddRec for the wrong loop?"); + + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. 
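+ // (For instance, if a bucket held elements at base+8 (a prefetch),
+ // base+16, and base+0, the selection loop above picked the base+16
+ // element, made base+16 the bucket's BaseSCEV, rewrote the element
+ // offsets to -8, 0, and -16, and swapped the chosen element to the
+ // front.)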
+ Instruction *MemI = Buckets[i].Elements.begin()->Instr; + Value *BasePtr = GetPointerOperand(MemI); + assert(BasePtr && "No pointer operand"); + + Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); + Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), + BasePtr->getType()->getPointerAddressSpace()); + + const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); + if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) + continue; + + const SCEVConstant *BasePtrIncSCEV = + dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)); + if (!BasePtrIncSCEV) + continue; + BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); + if (!isSafeToExpand(BasePtrStartSCEV, *SE)) + continue; + + DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); + + PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, + MemI->hasName() ? MemI->getName() + ".phi" : "", + Header->getFirstNonPHI()); + + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); + Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, + LoopPredecessor->getTerminator()); + + // Note that LoopPredecessor might occur in the predecessor list multiple + // times, and we need to add it the right number of times. + for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); + PI != PE; ++PI) { + if (*PI != LoopPredecessor) + continue; + + NewPHI->addIncoming(BasePtrStart, LoopPredecessor); + } + + Instruction *InsPoint = &*Header->getFirstInsertionPt(); + GetElementPtrInst *PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, BasePtrIncSCEV->getValue(), + MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); + PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); + for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); + PI != PE; ++PI) { + if (*PI == LoopPredecessor) + continue; + + NewPHI->addIncoming(PtrInc, *PI); + } + + Instruction *NewBasePtr; + if (PtrInc->getType() != BasePtr->getType()) + NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), + PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint); + else + NewBasePtr = PtrInc; + + if (Instruction *IDel = dyn_cast<Instruction>(BasePtr)) + BBChanged.insert(IDel->getParent()); + BasePtr->replaceAllUsesWith(NewBasePtr); + RecursivelyDeleteTriviallyDeadInstructions(BasePtr); + + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. + SmallPtrSet<Value *, 16> NewPtrs; + NewPtrs.insert( NewBasePtr); + + for (auto I = std::next(Buckets[i].Elements.begin()), + IE = Buckets[i].Elements.end(); I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); + assert(Ptr && "No pointer operand"); + if (NewPtrs.count(Ptr)) + continue; + + Instruction *RealNewPtr; + if (!I->Offset || I->Offset->getValue()->isZero()) { + RealNewPtr = NewBasePtr; + } else { + Instruction *PtrIP = dyn_cast<Instruction>(Ptr); + if (PtrIP && isa<Instruction>(NewBasePtr) && + cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent()) + PtrIP = 0; + else if (isa<PHINode>(PtrIP)) + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); + else if (!PtrIP) + PtrIP = I->Instr; + + GetElementPtrInst *NewPtr = GetElementPtrInst::Create( + I8Ty, PtrInc, I->Offset->getValue(), + I->Instr->hasName() ? 
I->Instr->getName() + ".off" : "", PtrIP); + if (!PtrIP) + NewPtr->insertAfter(cast<Instruction>(PtrInc)); + NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); + RealNewPtr = NewPtr; + } + + if (Instruction *IDel = dyn_cast<Instruction>(Ptr)) + BBChanged.insert(IDel->getParent()); + + Instruction *ReplNewPtr; + if (Ptr->getType() != RealNewPtr->getType()) { + ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), + Ptr->hasName() ? Ptr->getName() + ".cast" : ""); + ReplNewPtr->insertAfter(RealNewPtr); + } else + ReplNewPtr = RealNewPtr; + + Ptr->replaceAllUsesWith(ReplNewPtr); + RecursivelyDeleteTriviallyDeadInstructions(Ptr); + + NewPtrs.insert(RealNewPtr); + } + + MadeChange = true; + } + + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + if (BBChanged.count(*I)) + DeleteDeadPHIs(*I); + } + + return MadeChange; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp new file mode 100644 index 0000000..44a692d --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -0,0 +1,219 @@ +//===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower PPC MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "MCTargetDesc/PPCMCExpr.h" +#include "PPCSubtarget.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +using namespace llvm; + +static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) { + return AP.MMI->getObjFileInfo<MachineModuleInfoMachO>(); +} + + +static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ + const TargetMachine &TM = AP.TM; + Mangler *Mang = AP.Mang; + const DataLayout &DL = AP.getDataLayout(); + MCContext &Ctx = AP.OutContext; + bool isDarwin = TM.getTargetTriple().isOSDarwin(); + + SmallString<128> Name; + StringRef Suffix; + if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB) { + if (isDarwin) + Suffix = "$stub"; + } else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) + Suffix = "$non_lazy_ptr"; + + if (!Suffix.empty()) + Name += DL.getPrivateGlobalPrefix(); + + unsigned PrefixLen = Name.size(); + + if (!MO.isGlobal()) { + assert(MO.isSymbol() && "Isn't a symbol reference"); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); + } else { + const GlobalValue *GV = MO.getGlobal(); + TM.getNameWithPrefix(Name, GV, *Mang); + } + + unsigned OrigLen = Name.size() - PrefixLen; + + Name += Suffix; + MCSymbol *Sym = Ctx.getOrCreateSymbol(Name); + StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen); + + // If the target flags on the operand changes the name of the symbol, do that + // before we return the symbol. 
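+ // For example, on Darwin a reference to "foo" with MO_PLT_OR_STUB becomes
+ // the symbol "L_foo$stub", and an MO_NLP_FLAG reference becomes
+ // "L_foo$non_lazy_ptr"; the leading "L" is the private-global prefix taken
+ // from the DataLayout above.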
+ if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && isDarwin) { + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI(AP).getFnStubEntry(Sym); + if (StubSym.getPointer()) + return Sym; + + if (MO.isGlobal()) { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(AP.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } else { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Ctx.getOrCreateSymbol(OrigName), false); + } + return Sym; + } + + // If the symbol reference is actually to a non_lazy_ptr, not to the symbol, + // then add the suffix. + if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) { + MachineModuleInfoMachO &MachO = getMachOMMI(AP); + + MachineModuleInfoImpl::StubValueTy &StubSym = + (MO.getTargetFlags() & PPCII::MO_NLP_HIDDEN_FLAG) ? + MachO.getHiddenGVStubEntry(Sym) : MachO.getGVStubEntry(Sym); + + if (!StubSym.getPointer()) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = MachineModuleInfoImpl:: + StubValueTy(AP.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + return Sym; + } + + return Sym; +} + +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + AsmPrinter &Printer, bool isDarwin) { + MCContext &Ctx = Printer.OutContext; + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + + unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK; + + switch (access) { + case PPCII::MO_TPREL_LO: + RefKind = MCSymbolRefExpr::VK_PPC_TPREL_LO; + break; + case PPCII::MO_TPREL_HA: + RefKind = MCSymbolRefExpr::VK_PPC_TPREL_HA; + break; + case PPCII::MO_DTPREL_LO: + RefKind = MCSymbolRefExpr::VK_PPC_DTPREL_LO; + break; + case PPCII::MO_TLSLD_LO: + RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO; + break; + case PPCII::MO_TOC_LO: + RefKind = MCSymbolRefExpr::VK_PPC_TOC_LO; + break; + case PPCII::MO_TLS: + RefKind = MCSymbolRefExpr::VK_PPC_TLS; + break; + } + + if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin) + RefKind = MCSymbolRefExpr::VK_PLT; + + const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx); + + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(MO.getOffset(), Ctx), + Ctx); + + // Subtract off the PIC base if required. + if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG) { + const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + + const MCExpr *PB = MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); + Expr = MCBinaryExpr::createSub(Expr, PB, Ctx); + } + + // Add ha16() / lo16() markers if required. 
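+ // For example, a low/high pair for materializing the address of a global
+ // "g" prints as "ha16(_g)" / "lo16(_g)" on Darwin and as "g@ha" / "g@l" on
+ // ELF, as in:
+ //   lis r3, g@ha
+ //   addi r3, r3, g@l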
+ switch (access) { + case PPCII::MO_LO: + Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx); + break; + case PPCII::MO_HA: + Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx); + break; + } + + return MCOperand::createExpr(Expr); +} + +void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + AsmPrinter &AP, bool isDarwin) { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + assert(MO.getReg() > PPC::NoRegister && + MO.getReg() < PPC::NUM_TARGET_REGS && + "Invalid register for this target!"); + MCOp = MCOperand::createReg(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( + MO.getMBB()->getSymbol(), AP.OutContext)); + break; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); + break; + case MachineOperand::MO_BlockAddress: + MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP, + isDarwin); + break; + case MachineOperand::MO_RegisterMask: + continue; + } + + OutMI.addOperand(MCOp); + } +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp new file mode 100644 index 0000000..fe339d7 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -0,0 +1,230 @@ +//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass performs peephole optimizations to clean up ugly code +// sequences at the MachineInstruction layer. It runs at the end of +// the SSA phases, following VSX swap removal. A pass of dead code +// elimination follows this one for quick clean-up of any dead +// instructions introduced here. Although we could do this as callbacks +// from the generic peephole pass, this would have a couple of bad +// effects: it might remove optimization opportunities for VSX swap +// removal, and it would miss cleanups made possible following VSX +// swap removal. 
+// +//===---------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-mi-peepholes" + +namespace llvm { + void initializePPCMIPeepholePass(PassRegistry&); +} + +namespace { + +struct PPCMIPeephole : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + PPCMIPeephole() : MachineFunctionPass(ID) { + initializePPCMIPeepholePass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + // Perform peepholes. + bool simplifyCode(void); + + // Find the "true" register represented by SrcReg (following chains + // of copies and subreg_to_reg operations). + unsigned lookThruCopyLike(unsigned SrcReg); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + initialize(MF); + return simplifyCode(); + } +}; + +// Initialize class variables. +void PPCMIPeephole::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); + DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); + DEBUG(MF->dump()); +} + +// Perform peephole optimizations. +bool PPCMIPeephole::simplifyCode(void) { + bool Simplified = false; + MachineInstr* ToErase = nullptr; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + + // If the previous instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // Ignore debug instructions. + if (MI.isDebugValue()) + continue; + + // Per-opcode peepholes. + switch (MI.getOpcode()) { + + default: + break; + + case PPC::XXPERMDI: { + // Perform simplifications of 2x64 vector swaps and splats. + // A swap is identified by an immediate value of 2, and a splat + // is identified by an immediate value of 0 or 3. + int Immed = MI.getOperand(3).getImm(); + + if (Immed != 1) { + + // For each of these simplifications, we need the two source + // regs to match. Unfortunately, MachineCSE ignores COPY and + // SUBREG_TO_REG, so for example we can see + // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. + // We have to look through chains of COPY and SUBREG_TO_REG + // to find the real source values for comparison. + unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg()); + unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg()); + + if (TrueReg1 == TrueReg2 + && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { + MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); + + // If this is a splat or a swap fed by another splat, we + // can replace it with a copy. 
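+ // (Recall that xxpermdi t, a, b, imm selects t's doublewords with the two
+ // immediate bits: imm 0 yields <a0,b0>, imm 3 yields <a1,b1> -- splats
+ // when a == b -- and imm 2 yields <a1,b0>, a swap when a == b.)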
+ if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { + unsigned FeedImmed = DefMI->getOperand(3).getImm(); + unsigned FeedReg1 + = lookThruCopyLike(DefMI->getOperand(1).getReg()); + unsigned FeedReg2 + = lookThruCopyLike(DefMI->getOperand(2).getReg()); + + if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { + DEBUG(dbgs() + << "Optimizing splat/swap or splat/splat " + "to splat/copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)); + ToErase = &MI; + Simplified = true; + } + + // If this is a splat fed by a swap, we can modify the splat to + // splat the other value from the swap's input parameter. + else if ((Immed == 0 || Immed == 3) + && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/splat => splat: "); + DEBUG(MI.dump()); + MI.getOperand(1).setReg(DefMI->getOperand(1).getReg()); + MI.getOperand(2).setReg(DefMI->getOperand(2).getReg()); + MI.getOperand(3).setImm(3 - Immed); + Simplified = true; + } + + // If this is a swap fed by a swap, we can replace it + // with a copy from the first swap's input. + else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/swap => copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(DefMI->getOperand(1)); + ToErase = &MI; + Simplified = true; + } + } + } + } + break; + } + } + } + + // If the last instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + } + + return Simplified; +} + +// This is used to find the "true" source register for an +// XXPERMDI instruction, since MachineCSE does not handle the +// "copy-like" operations (Copy and SubregToReg). Returns +// the original SrcReg unless it is the target of a copy-like +// operation, in which case we chain backwards through all +// such operations to the ultimate source register. If a +// physical register is encountered, we stop the search. +unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) { + + while (true) { + + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + return CopySrcReg; + + SrcReg = CopySrcReg; + } +} + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) +INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) + +char PPCMIPeephole::ID = 0; +FunctionPass* +llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); } + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp new file mode 100644 index 0000000..95f1631 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -0,0 +1,25 @@ +//===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "PPCMachineFunctionInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +void PPCFunctionInfo::anchor() { } + +MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + Twine(MF.getFunctionNumber()) + + "$poff"); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h new file mode 100644 index 0000000..607cdf6 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -0,0 +1,205 @@ +//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC-specific subclass of MachineFunctionInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// PPCFunctionInfo - This class is derived from MachineFunctionInfo and holds +/// private PowerPC target-specific information for each MachineFunction. +class PPCFunctionInfo : public MachineFunctionInfo { + virtual void anchor(); + + /// FramePointerSaveIndex - Frame index of where the old frame pointer is + /// stored. Also used as an anchor for instructions that need to be altered + /// when using frame pointers (dyna_add, dyna_sub.) + int FramePointerSaveIndex; + + /// ReturnAddrSaveIndex - Frame index of where the return address is stored. + /// + int ReturnAddrSaveIndex; + + /// Frame index where the old base pointer is stored. + int BasePointerSaveIndex; + + /// Frame index where the old PIC base pointer is stored. + int PICBasePointerSaveIndex; + + /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current + /// function. This is only valid after the initial scan of the function by + /// PEI. + bool MustSaveLR; + + /// Does this function have any stack spills. + bool HasSpills; + + /// Does this function spill using instructions with only r+r (not r+i) + /// forms. + bool HasNonRISpills; + + /// SpillsCR - Indicates whether CR is spilled in the current function. + bool SpillsCR; + + /// Indicates whether VRSAVE is spilled in the current function. + bool SpillsVRSAVE; + + /// LRStoreRequired - The bool indicates whether there is some explicit use of + /// the LR/LR8 stack slot that is not obvious from scanning the code. This + /// requires that the code generator produce a store of LR to the stack on + /// entry, even though LR may otherwise apparently not be used. + bool LRStoreRequired; + + /// This function makes use of the PPC64 ELF TOC base pointer (register r2). + bool UsesTOCBasePtr; + + /// MinReservedArea - This is the frame size that is at least reserved in a + /// potential caller (parameter+linkage area). + unsigned MinReservedArea; + + /// TailCallSPDelta - Stack pointer delta used when tail calling.
Maximum + /// amount the stack pointer is adjusted to make the frame bigger for tail + /// calls. Used for creating an area before the register spill area. + int TailCallSPDelta; + + /// HasFastCall - Does this function contain a fast call. Used to determine + /// how the caller's stack pointer should be calculated (epilog/dynamicalloc). + bool HasFastCall; + + /// VarArgsFrameIndex - FrameIndex for start of varargs area. + int VarArgsFrameIndex; + /// VarArgsStackOffset - StackOffset for start of stack + /// arguments. + int VarArgsStackOffset; + /// VarArgsNumGPR - Index of the first unused integer + /// register for parameter passing. + unsigned VarArgsNumGPR; + /// VarArgsNumFPR - Index of the first unused double + /// register for parameter passing. + unsigned VarArgsNumFPR; + + /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4. + int CRSpillFrameIndex; + + /// If any of CR[2-4] need to be saved in the prologue and restored in the + /// epilogue then they are added to this array. This is used for the + /// 64-bit SVR4 ABI. + SmallVector<unsigned, 3> MustSaveCRs; + + /// Hold onto our MachineFunction context. + MachineFunction &MF; + + /// Whether this uses the PIC Base register or not. + bool UsesPICBase; + +public: + explicit PPCFunctionInfo(MachineFunction &MF) + : FramePointerSaveIndex(0), + ReturnAddrSaveIndex(0), + BasePointerSaveIndex(0), + PICBasePointerSaveIndex(0), + HasSpills(false), + HasNonRISpills(false), + SpillsCR(false), + SpillsVRSAVE(false), + LRStoreRequired(false), + UsesTOCBasePtr(false), + MinReservedArea(0), + TailCallSPDelta(0), + HasFastCall(false), + VarArgsFrameIndex(0), + VarArgsStackOffset(0), + VarArgsNumGPR(0), + VarArgsNumFPR(0), + CRSpillFrameIndex(0), + MF(MF), + UsesPICBase(0) {} + + int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } + void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } + + int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; } + void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; } + + int getBasePointerSaveIndex() const { return BasePointerSaveIndex; } + void setBasePointerSaveIndex(int Idx) { BasePointerSaveIndex = Idx; } + + int getPICBasePointerSaveIndex() const { return PICBasePointerSaveIndex; } + void setPICBasePointerSaveIndex(int Idx) { PICBasePointerSaveIndex = Idx; } + + unsigned getMinReservedArea() const { return MinReservedArea; } + void setMinReservedArea(unsigned size) { MinReservedArea = size; } + + int getTailCallSPDelta() const { return TailCallSPDelta; } + void setTailCallSPDelta(int size) { TailCallSPDelta = size; } + + /// MustSaveLR - This is set when the prolog/epilog inserter does its initial + /// scan of the function. It is true if the LR/LR8 register is ever explicitly + /// defined/clobbered in the machine function (e.g. by calls and movpctolr, + /// which is used in PIC generation), or if the LR stack slot is explicitly + /// referenced by builtin_return_address. 
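+ /// For example, a function containing
+ /// "void *ra = __builtin_return_address(0);" requires the LR store even if
+ /// it never otherwise clobbers LR.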
+ void setMustSaveLR(bool U) { MustSaveLR = U; } + bool mustSaveLR() const { return MustSaveLR; } + + void setHasSpills() { HasSpills = true; } + bool hasSpills() const { return HasSpills; } + + void setHasNonRISpills() { HasNonRISpills = true; } + bool hasNonRISpills() const { return HasNonRISpills; } + + void setSpillsCR() { SpillsCR = true; } + bool isCRSpilled() const { return SpillsCR; } + + void setSpillsVRSAVE() { SpillsVRSAVE = true; } + bool isVRSAVESpilled() const { return SpillsVRSAVE; } + + void setLRStoreRequired() { LRStoreRequired = true; } + bool isLRStoreRequired() const { return LRStoreRequired; } + + void setUsesTOCBasePtr() { UsesTOCBasePtr = true; } + bool usesTOCBasePtr() const { return UsesTOCBasePtr; } + + void setHasFastCall() { HasFastCall = true; } + bool hasFastCall() const { return HasFastCall;} + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } + + int getVarArgsStackOffset() const { return VarArgsStackOffset; } + void setVarArgsStackOffset(int Offset) { VarArgsStackOffset = Offset; } + + unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; } + void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; } + + unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; } + void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; } + + int getCRSpillFrameIndex() const { return CRSpillFrameIndex; } + void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; } + + const SmallVectorImpl<unsigned> & + getMustSaveCRs() const { return MustSaveCRs; } + void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); } + + void setUsesPICBase(bool uses) { UsesPICBase = uses; } + bool usesPICBase() const { return UsesPICBase; } + + MCSymbol *getPICOffsetSymbol() const; +}; + +} // end of namespace llvm + + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h new file mode 100644 index 0000000..8a1d680 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h @@ -0,0 +1,6591 @@ +//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle without using vperm. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H +#define LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H + +// 31 entries have cost 0 +// 292 entries have cost 1 +// 1384 entries have cost 2 +// 3061 entries have cost 3 +// 1733 entries have cost 4 +// 60 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. 
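+// A shuffle mask indexes this table as a four-digit base-9 number: each
+// result element is one of nine values (0-7 pick an element of the two
+// inputs, 8 means undef), giving the 9^4 = 6561 entries. The low bits of an
+// entry encode the operation and its two operands; the top two bits hold the
+// cost indicator that the lowering checks. A sketch of the lookup (names
+// follow the use in PPCISelLowering.cpp):
+//   unsigned PFIndexes[4]; // one base-9 digit per result element
+//   unsigned PFTableIndex = PFIndexes[0]*9*9*9 + PFIndexes[1]*9*9 +
+//                           PFIndexes[2]*9 + PFIndexes[3];
+//   unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+//   unsigned Cost = (PFEntry >> 30);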
+static const unsigned PerfectShuffleTable[6561+1] = { + 202162278U, // <0,0,0,0>: Cost 1 vspltisw0 LHS + 1140850790U, // <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS + 2617247181U, // <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0> + 2635163787U, // <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0> + 1543507254U, // <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS + 2281701705U, // <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5> + 2617250133U, // <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0> + 2659054575U, // <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0> + 202162278U, // <0,0,0,u>: Cost 1 vspltisw0 LHS + 1141686282U, // <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1> + 67944550U, // <0,0,1,1>: Cost 1 vmrghw LHS, LHS + 1685241958U, // <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS + 2215870716U, // <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0> + 1141727570U, // <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5> + 2215428562U, // <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7> + 2215428589U, // <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7> + 2659062768U, // <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1> + 67945117U, // <0,0,1,u>: Cost 1 vmrghw LHS, LHS + 2684356045U, // <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0> + 2216009830U, // <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS + 2216009901U, // <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2> + 2698290853U, // <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0> + 3289751890U, // <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5> + 3758098275U, // <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1> + 2684356538U, // <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7> + 3758098410U, // <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1> + 2216010397U, // <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS + 2702272651U, // <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0> + 2216656998U, // <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS + 3844669704U, // <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3> + 2216657148U, // <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0> + 2684357122U, // <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6> + 3732820066U, // <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0> + 3778005624U, // <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7> + 3374713464U, // <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7> + 2216657565U, // <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS + 2217361408U, // <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0> + 1143619686U, // <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS + 3291103405U, // <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2> + 3827269988U, // <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5> + 1143619922U, // <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1610616118U, // <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS + 3758099833U, // <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2> + 3854107016U, // <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5> + 1143620253U, // <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS + 2284396544U, // <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0> + 2218025062U, // <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS + 3758100203U, // <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3> + 3395966100U, // <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3> + 3804549052U, // <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5> + 2302314964U, // <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5> + 2785821138U, // <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7> + 3395966428U, // <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7> + 2787148260U, // <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7> + 2684358997U, // <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0> + 2218631270U, // <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS + 2684359162U, // <0,0,6,2>: Cost 3 vsldoi8 
<0,0,0,0>, <6,2,7,3> + 3758101042U, // <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5> + 3732843830U, // <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS + 3758101227U, // <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1> + 2684359480U, // <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6> + 2724836173U, // <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0> + 2725499806U, // <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0> + 2726163439U, // <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0> + 2219311206U, // <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS + 3868557900U, // <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3> + 3377400112U, // <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3> + 2684360038U, // <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6> + 3732852834U, // <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0> + 3871507060U, // <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7> + 2303658616U, // <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7> + 2726163439U, // <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0> + 202162278U, // <0,0,u,0>: Cost 1 vspltisw0 LHS + 72589414U, // <0,0,u,1>: Cost 1 vmrghw LHS, LHS + 1685242525U, // <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS + 2220073212U, // <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0> + 1146331474U, // <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5> + 1610619034U, // <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS + 2785821138U, // <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7> + 2659120119U, // <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u> + 72589981U, // <0,0,u,u>: Cost 1 vmrghw LHS, LHS + 2698297344U, // <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0> + 1624555622U, // <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS + 2758984428U, // <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1> + 2635237524U, // <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0> + 2693652818U, // <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5> + 2281701714U, // <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5> + 2698297846U, // <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7> + 2659128312U, // <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0> + 1624556189U, // <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS + 1543585802U, // <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1> + 1141728052U, // <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1> + 1141728150U, // <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 2295644334U, // <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3> + 1543589174U, // <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS + 2290999634U, // <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5> + 2617332135U, // <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1> + 2617332720U, // <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1> + 1142171004U, // <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0> + 1561509990U, // <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS + 2623308516U, // <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2> + 2698298984U, // <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1561513270U, // <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS + 2647199304U, // <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2> + 2698299322U, // <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7> + 1585402874U, // <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2698299540U, // <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0> + 3290399540U, // <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1> + 2698299720U, // <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0> + 2698299804U, // <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3> + 2698299906U, // <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6> + 3832726521U, // <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0> + 
2724842160U, // <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0> + 2706926275U, // <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1> + 2698300190U, // <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2> + 2635268198U, // <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS + 2217362228U, // <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1> + 2217362326U, // <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0> + 2635270296U, // <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4> + 2635271478U, // <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS + 1624558902U, // <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 2659160910U, // <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1> + 2659161084U, // <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4> + 1624559145U, // <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 3832726639U, // <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1> + 2714889871U, // <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1> + 2302314646U, // <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2> + 3834717321U, // <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0> + 3832726679U, // <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5> + 2717544403U, // <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1> + 2718208036U, // <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1> + 3792613493U, // <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1> + 2719535302U, // <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1> + 2659172454U, // <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS + 3832726735U, // <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7> + 2724844026U, // <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3> + 3775361608U, // <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0> + 2659175734U, // <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS + 3832726771U, // <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7> + 2724844344U, // <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6> + 1651102542U, // <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1> + 1651766175U, // <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1> + 2724844536U, // <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0> + 3377397770U, // <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1> + 2698302636U, // <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0> + 2728162531U, // <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1> + 2724844902U, // <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6> + 3377398098U, // <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5> + 2724845076U, // <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0> + 2724845164U, // <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7> + 2724845186U, // <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2> + 1561559142U, // <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS + 1146331956U, // <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1> + 1146332054U, // <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1561562422U, // <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS + 1624561818U, // <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 2220074191U, // <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7> + 1585452032U, // <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2214593997U, // <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0> + 2214675999U, // <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1> + 2214594152U, // <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2> + 1207959654U, // <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS + 3709054262U, // <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS + 3375350836U, // <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5> + 2214594490U, // <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7> + 3288336362U, // <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1> + 1207959659U, // <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS 
+ 2215871994U, // <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0> + 2215470623U, // <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1> + 1141728872U, // <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1141728934U, // <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1> + 2215872323U, // <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5> + 2215872405U, // <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6> + 1141729210U, // <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2215430122U, // <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1141729368U, // <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3> + 3289736698U, // <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0> + 3289744927U, // <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1> + 2216011368U, // <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2> + 2216019622U, // <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1> + 3289769795U, // <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5> + 3289778069U, // <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6> + 2216044474U, // <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7> + 3732960259U, // <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2> + 2216061016U, // <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3> + 2758985382U, // <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1> + 2758985392U, // <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2> + 3290400360U, // <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2> + 2758985408U, // <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0> + 2758985422U, // <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5> + 2785822424U, // <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6> + 3290400698U, // <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7> + 2765915876U, // <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0> + 2758985453U, // <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0> + 3291104762U, // <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0> + 2217362979U, // <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5> + 2217363048U, // <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2> + 2217363110U, // <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1> + 3291105087U, // <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1> + 3291105173U, // <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6> + 2217363386U, // <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7> + 3788639688U, // <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0> + 2217363515U, // <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1> + 3376054371U, // <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0> + 3788639888U, // <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2> + 3376055912U, // <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2> + 2302312550U, // <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS + 3376054375U, // <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4> + 3374728244U, // <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5> + 3805229154U, // <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0> + 3376055512U, // <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7> + 2302312555U, // <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS + 3709100134U, // <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS + 3709100950U, // <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0> + 3709102010U, // <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7> + 2758985658U, // <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7> + 3709103414U, // <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS + 3732992098U, // <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0> + 3292374970U, // <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7> + 3798594383U, // <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2> + 2758985703U, // <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7> + 3788641274U, // <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2> + 3377398508U, // <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1> + 3377398590U, // 
<0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2> + 2303656038U, // <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS + 3709111606U, // <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS + 3377398836U, // <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5> + 3803903447U, // <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2> + 3293054954U, // <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1> + 2303656043U, // <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS + 2220074490U, // <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0> + 2220074527U, // <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1> + 1146332776U, // <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1146332838U, // <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1> + 2220074819U, // <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5> + 2220074901U, // <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6> + 1146333114U, // <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2220074986U, // <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1146333243U, // <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1> + 2629410816U, // <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0> + 2753530006U, // <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2> + 2629412301U, // <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0> + 2214594972U, // <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3> + 2758985908U, // <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5> + 3733016674U, // <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0> + 3777364488U, // <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7> + 2281703354U, // <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7> + 2758985941U, // <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2> + 1141729430U, // <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 2215471334U, // <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1> + 2215471425U, // <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2> + 1141729692U, // <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 1141729794U, // <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2215430738U, // <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5> + 2215430776U, // <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7> + 2295646138U, // <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7> + 1141730078U, // <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2> + 2758986032U, // <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3> + 3709141910U, // <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0> + 3289753921U, // <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2> + 2770929992U, // <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0> + 3289754114U, // <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6> + 3362095460U, // <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5> + 3832727910U, // <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3> + 3365414842U, // <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7> + 2771298677U, // <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0> + 2216659094U, // <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2> + 3290409190U, // <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1> + 2703624496U, // <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3> + 2216683932U, // <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3> + 2216692226U, // <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6> + 3733041250U, // <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0> + 3832727988U, // <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0> + 3374712762U, // <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7> + 2216725278U, // <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2> + 2217363606U, // <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2> + 3291105510U, // <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1> + 3291105601U, // <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2> + 2217363868U, // <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3> + 2217363970U, // <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6> + 2758986242U, // 
<0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6> + 3727077685U, // <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4> + 3364767674U, // <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7> + 2217364254U, // <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2> + 3832728102U, // <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6> + 3405916003U, // <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1> + 3376055840U, // <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2> + 3376055679U, // <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3> + 3376055194U, // <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4> + 3859565138U, // <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5> + 2727514210U, // <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0> + 3376056250U, // <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7> + 2727514210U, // <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0> + 2758986360U, // <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7> + 3709174678U, // <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0> + 3795284411U, // <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3> + 3709175980U, // <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6> + 3833096860U, // <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7> + 3376728235U, // <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5> + 3859565229U, // <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6> + 2773879472U, // <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0> + 2758986360U, // <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7> + 2303656854U, // <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0> + 3807229018U, // <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u> + 2727515284U, // <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3> + 3377399410U, // <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3> + 3377398682U, // <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4> + 3801257409U, // <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7> + 3377399980U, // <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6> + 3375409082U, // <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7> + 2731497082U, // <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3> + 1146333334U, // <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 2220075238U, // <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1> + 2220075329U, // <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2> + 1146333596U, // <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 1146333698U, // <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2758986566U, // <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6> + 2803739472U, // <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7> + 2295703482U, // <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7> + 1146333982U, // <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2> + 2214595473U, // <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0> + 2693677158U, // <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS + 3839437689U, // <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3> + 3709200559U, // <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0> + 2693677394U, // <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5> + 1140854070U, // <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS + 3767419409U, // <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7> + 3854109604U, // <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1> + 1140854313U, // <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS + 1141689234U, // <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2215431114U, // <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3> + 2215431221U, // <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2> + 2635466928U, // <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1> + 1141689552U, // <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 67947830U, // <0,4,1,5>: Cost 1 vmrghw LHS, RHS + 2215431545U, // <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2> + 2659357716U, // <0,4,1,7>: Cost 3 vsldoi4 
<7,0,4,1>, <7,0,4,1> + 67948073U, // <0,4,1,u>: Cost 1 vmrghw LHS, RHS + 3767420369U, // <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4> + 3767420451U, // <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5> + 3767420520U, // <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2> + 2698323625U, // <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4> + 3709218102U, // <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS + 2216013110U, // <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS + 3767420858U, // <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7> + 3774719981U, // <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4> + 2216013353U, // <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS + 3767421078U, // <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2> + 3776710880U, // <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4> + 3833097325U, // <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4> + 3767421340U, // <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3> + 3767421442U, // <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6> + 2216660278U, // <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS + 3833097361U, // <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4> + 3780692678U, // <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4> + 2216660521U, // <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS + 2617573416U, // <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4> + 2217364450U, // <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0> + 3691316771U, // <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5> + 3709233331U, // <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4> + 2785823952U, // <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4> + 1143622966U, // <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS + 3691319723U, // <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5> + 3854109932U, // <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5> + 1143623209U, // <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS + 2635497574U, // <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS + 2635498390U, // <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0> + 3709240936U, // <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2> + 2635499700U, // <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5> + 2635500854U, // <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS + 2785824044U, // <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6> + 1685245238U, // <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2659390488U, // <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5> + 1685245256U, // <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 3839438161U, // <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7> + 3798610347U, // <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5> + 3798610426U, // <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3> + 3795956237U, // <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4> + 3733138742U, // <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS + 2218634550U, // <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS + 3798610744U, // <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6> + 2724868945U, // <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4> + 2725532578U, // <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4> + 3383371465U, // <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0> + 3800601668U, // <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4> + 3775386826U, // <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3> + 3801928934U, // <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4> + 3721202998U, // <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS + 2780368328U, // <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0> + 3383372686U, // <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6> + 3854110170U, // <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0> + 2780368328U, // <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0> + 1146334098U, // <0,4,u,0>: Cost 2 vmrghw LHS, 
<4,0,5,1> + 2220076002U, // <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0> + 2220076085U, // <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2> + 2635524279U, // <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u> + 1146334416U, // <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 72592694U, // <0,4,u,5>: Cost 1 vmrghw LHS, RHS + 1685245481U, // <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2659415067U, // <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u> + 72592937U, // <0,4,u,u>: Cost 1 vmrghw LHS, RHS + 2281704337U, // <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0> + 2704965734U, // <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 3778707666U, // <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3> + 3778707708U, // <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0> + 2687050057U, // <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5> + 2214596612U, // <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5> + 2785824372U, // <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1> + 3854110332U, // <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0> + 2704966301U, // <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 1567768678U, // <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS + 2312236570U, // <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1> + 2215431915U, // <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3> + 2641512598U, // <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2> + 1567771538U, // <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1> + 1141690372U, // <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1141690466U, // <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 2641515514U, // <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2> + 1141690615U, // <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5> + 3772736973U, // <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0> + 3778709024U, // <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2> + 3778709096U, // <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2> + 3778709158U, // <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1> + 3772737275U, // <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5> + 3859566351U, // <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3> + 3778709434U, // <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7> + 3805251562U, // <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1> + 3775391807U, // <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5> + 2704967830U, // <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2> + 3776719073U, // <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5> + 3777382706U, // <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5> + 3778709887U, // <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1> + 2704968148U, // <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5> + 3857428317U, // <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0> + 3364096514U, // <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6> + 3780700871U, // <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5> + 2707622680U, // <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5> + 2728856466U, // <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1> + 3697361674U, // <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4> + 3697362601U, // <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4> + 3364766635U, // <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3> + 2217365428U, // <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6> + 2704969014U, // <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS + 2785824700U, // <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5> + 3364766963U, // <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7> + 2704969257U, // <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS + 3846148050U, // <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0> + 2326203282U, // <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1> + 3291746027U, // <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3> + 3376054482U, // 
<0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3> + 3790655366U, // <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5> + 2785824772U, // <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5> + 2724876386U, // <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0> + 3858903057U, // <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0> + 2736820484U, // <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0> + 2659467366U, // <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS + 3859566643U, // <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7> + 3798618618U, // <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3> + 3852857410U, // <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4> + 2659470646U, // <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS + 2659471458U, // <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0> + 3832729696U, // <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7> + 1712083042U, // <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0> + 1712156779U, // <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0> + 2731512826U, // <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2> + 3859566717U, // <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0> + 3798619284U, // <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3> + 3778712803U, // <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1> + 2728858936U, // <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5> + 3859566753U, // <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0> + 3377398135U, // <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6> + 3798619686U, // <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0> + 2731513468U, // <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5> + 1567826022U, // <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS + 2704971566U, // <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 2220076779U, // <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3> + 2641569942U, // <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2> + 1567828889U, // <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u> + 1146335236U, // <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1146335330U, // <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 1713410308U, // <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0> + 1713484045U, // <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0> + 2214596949U, // <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0> + 2214678951U, // <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1> + 2214597114U, // <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3> + 3852857653U, // <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4> + 3832729919U, // <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5> + 3721293427U, // <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0> + 2214597432U, // <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6> + 1207962934U, // <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS + 1207962935U, // <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS + 2215432481U, // <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2> + 2215432615U, // <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1> + 1141690874U, // <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2215432754U, // <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5> + 2215432817U, // <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5> + 2215432939U, // <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1> + 1141691192U, // <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1221905718U, // <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS + 1221905719U, // <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS + 3852857787U, // <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3> + 3289764265U, // <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3> + 3289690618U, // <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3> + 3862589907U, // <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0> + 3733253430U, // <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS + 3733254242U, // <0,6,2,5>: Cost 4 vsldoi4 
<7,0,6,2>, <5,6,7,0> + 3777390522U, // <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7> + 2785825274U, // <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3> + 2785825283U, // <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3> + 3777390742U, // <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2> + 3863106066U, // <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0> + 3777390899U, // <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6> + 3290436146U, // <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5> + 3779381762U, // <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6> + 3779381798U, // <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6> + 3733262920U, // <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0> + 2300972342U, // <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS + 2300972343U, // <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS + 3802606482U, // <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1> + 2217365931U, // <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5> + 2217366010U, // <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3> + 3291107890U, // <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5> + 3291099805U, // <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4> + 3777391926U, // <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS + 2217366328U, // <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6> + 2291027254U, // <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS + 2291027255U, // <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS + 3852858033U, // <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6> + 3395964532U, // <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1> + 3864507069U, // <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0> + 3376056678U, // <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3> + 3721334070U, // <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS + 3395964860U, // <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5> + 3864802017U, // <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0> + 2302315830U, // <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS + 2302315831U, // <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS + 3852858108U, // <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0> + 3398624745U, // <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1> + 2218668538U, // <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3> + 3292418610U, // <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5> + 3733286198U, // <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS + 3797299889U, // <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6> + 2785825592U, // <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6> + 2785825602U, // <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7> + 2785825611U, // <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7> + 2785825614U, // <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1> + 2758988632U, // <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2> + 3377400084U, // <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2> + 2792166248U, // <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0> + 2785825654U, // <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5> + 2785825664U, // <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6> + 3859567493U, // <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2> + 2303659318U, // <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS + 2303659319U, // <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS + 2785825695U, // <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1> + 2220077479U, // <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1> + 1146335738U, // <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2792829881U, // <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0> + 2785825735U, // <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5> + 2785825664U, // <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6> + 1146336056U, // <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1221963062U, // <0,6,u,7>: Cost 2 vmrglw 
<2,3,0,u>, RHS + 1221963063U, // <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS + 2653593600U, // <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0> + 2706309222U, // <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 3709421498U, // <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7> + 2281705978U, // <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3> + 2785825816U, // <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5> + 2785825826U, // <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6> + 2653598037U, // <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0> + 2214598252U, // <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7> + 2706309789U, // <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 1141691386U, // <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 2215433290U, // <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1> + 2706310038U, // <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0> + 2322190842U, // <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3> + 1141691750U, // <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2215433654U, // <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5> + 2653606230U, // <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1> + 1141692012U, // <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1141692034U, // <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2> + 2785825940U, // <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3> + 3768108576U, // <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2> + 3780052584U, // <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2> + 2794820780U, // <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0> + 3859641528U, // <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3> + 3733327970U, // <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0> + 3778062266U, // <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7> + 3733328944U, // <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2> + 2795189465U, // <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0> + 2324861026U, // <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0> + 3780053233U, // <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3> + 3780053296U, // <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3> + 3778062725U, // <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7> + 3780053506U, // <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6> + 3803941469U, // <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7> + 2706311800U, // <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7> + 3398603586U, // <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7> + 2707639066U, // <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7> + 2217366522U, // <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2> + 3727369110U, // <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0> + 3291108500U, // <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3> + 3727370872U, // <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7> + 2217366886U, // <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6> + 2706312502U, // <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 3786026321U, // <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7> + 2217367148U, // <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7> + 2706312745U, // <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 2322223202U, // <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0> + 3399946987U, // <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1> + 3291780244U, // <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3> + 3727378582U, // <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2> + 3727379766U, // <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS + 3859568054U, // <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5> + 2785826241U, // <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7> + 3395965762U, // <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7> + 2787153363U, // <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7> + 2785826268U, // <0,7,6,0>: Cost 3 vsldoi12 
<5,6,7,0>, <7,6,0,7> + 3780055420U, // <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3> + 3859568110U, // <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7> + 3874534903U, // <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7> + 3859641856U, // <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7> + 3733360738U, // <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0> + 3859568145U, // <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6> + 2797770260U, // <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0> + 2797843997U, // <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0> + 2785826342U, // <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0> + 3727393686U, // <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0> + 3868563003U, // <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3> + 3377397988U, // <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3> + 2219349350U, // <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6> + 3859568217U, // <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6> + 2730202588U, // <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7> + 2785826412U, // <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7> + 2731529854U, // <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7> + 1146336250U, // <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 2706315054U, // <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 2653660845U, // <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u> + 2322248186U, // <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3> + 1146336614U, // <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2706315418U, // <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 2653663581U, // <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u> + 1146336876U, // <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1146336898U, // <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2> + 202162278U, // <0,u,0,0>: Cost 1 vspltisw0 LHS + 1624612966U, // <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS + 2629780986U, // <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0> + 1207959708U, // <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS + 1544097078U, // <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS + 1140856986U, // <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS + 2698355253U, // <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7> + 1207962952U, // <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS + 202162278U, // <0,u,0,u>: Cost 1 vspltisw0 LHS + 1142134483U, // <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2> + 67950382U, // <0,u,1,1>: Cost 1 vmrghw LHS, LHS + 1142175624U, // <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3> + 1142175676U, // <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1> + 1142134847U, // <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6> + 67950746U, // <0,u,1,5>: Cost 1 vmrghw LHS, RHS + 1142175952U, // <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7> + 1221905736U, // <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS + 67950949U, // <0,u,1,u>: Cost 1 vmrghw LHS, LHS + 1562026086U, // <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS + 2216015662U, // <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS + 2698356328U, // <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2> + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1562029366U, // <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS + 2216016026U, // <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS + 2698356666U, // <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7> + 1585919033U, // <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2758989756U, // <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1> + 2216662830U, // <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS + 2703665461U, // <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u> + 2758989782U, // <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0> + 2758989796U, // <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5> + 2216663194U, // 
<0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS + 2706319993U, // <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u> + 2300972360U, // <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS + 2216663397U, // <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS + 2217367251U, // <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2> + 1143625518U, // <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS + 2217367432U, // <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3> + 2217367484U, // <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1> + 1143619922U, // <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1143625882U, // <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS + 2217367760U, // <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7> + 2291027272U, // <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS + 1143626085U, // <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS + 2635792486U, // <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS + 2635793302U, // <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0> + 2302314646U, // <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2> + 2635794648U, // <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5> + 2635795766U, // <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS + 2717601754U, // <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u> + 1685248154U, // <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2302315848U, // <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS + 1685248172U, // <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2759358645U, // <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7> + 2218637102U, // <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS + 2724901370U, // <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3> + 2758990032U, // <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7> + 2659691830U, // <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS + 2659471458U, // <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0> + 2724901688U, // <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6> + 1651159893U, // <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u> + 1651823526U, // <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u> + 2785827072U, // <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1> + 2803964168U, // <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0> + 2727556249U, // <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u> + 2303656092U, // <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS + 2785827112U, // <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5> + 2785827122U, // <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6> + 2730210781U, // <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u> + 2303659336U, // <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS + 2303656097U, // <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS + 202162278U, // <0,u,u,0>: Cost 1 vspltisw0 LHS + 72595246U, // <0,u,u,1>: Cost 1 vmrghw LHS, LHS + 1146337160U, // <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3> + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1146337343U, // <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6> + 72595610U, // <0,u,u,5>: Cost 1 vmrghw LHS, RHS + 1146337488U, // <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7> + 1221963080U, // <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2756853760U, // <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0> + 1677803530U, // <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1> + 3759497387U, // <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0> + 2686419196U, // <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0> + 2751766565U, // <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1> + 2687746462U, // <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0> + 3776086518U, // <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7> + 2689073728U, // <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0> + 1678319689U, // <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1> + 2287091712U, // <1,0,1,0>: 
Cost 3 vmrglw <0,u,1,1>, <0,0,0,0> + 1147568230U, // <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS + 1683112038U, // <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 3294970108U, // <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0> + 2623892790U, // <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS + 2647781007U, // <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1> + 2791948430U, // <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7> + 3721524218U, // <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2> + 1683112092U, // <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 2222112768U, // <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0> + 1148371046U, // <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS + 3356862524U, // <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2> + 2702345894U, // <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1> + 2222113106U, // <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5> + 2299709908U, // <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5> + 3760162746U, // <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7> + 3369470584U, // <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7> + 1148371613U, // <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS + 2686421142U, // <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2> + 2283128486U, // <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1> + 3296305326U, // <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3> + 3760163199U, // <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1> + 3760163330U, // <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6> + 3779406377U, // <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0> + 3865690416U, // <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7> + 3366824568U, // <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7> + 2707655452U, // <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0> + 2734861202U, // <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1> + 2756854098U, // <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5> + 3830595931U, // <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5> + 3296968960U, // <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4> + 3830595949U, // <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5> + 2686422326U, // <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 3297378806U, // <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7> + 3810594248U, // <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0> + 2686422569U, // <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 2284470272U, // <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0> + 2284471974U, // <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1> + 3809267435U, // <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3> + 3297968384U, // <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4> + 2284471977U, // <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4> + 3721555603U, // <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5> + 3792679010U, // <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0> + 3792679037U, // <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0> + 2284471981U, // <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u> + 3356893184U, // <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0> + 2224676966U, // <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS + 3298295985U, // <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6> + 3298345212U, // <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0> + 2224972114U, // <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5> + 3808604907U, // <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1> + 3799978808U, // <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6> + 2726237006U, // <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1> + 2224677522U, // <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1> + 2726237176U, // <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0> + 2285815462U, // <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1> + 3805951193U, // 
<1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0> + 3807941859U, // <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1> + 3799979366U, // <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6> + 3803297165U, // <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0> + 3799979540U, // <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0> + 3799979628U, // <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7> + 2731546240U, // <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0> + 2284494848U, // <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0> + 1683112594U, // <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1> + 1683112605U, // <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 2734200772U, // <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0> + 2757075629U, // <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1> + 2686425242U, // <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 2791948430U, // <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7> + 2736855304U, // <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0> + 1683112659U, // <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 1610694666U, // <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1> + 1616003174U, // <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS + 2283767958U, // <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2> + 3357507596U, // <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3> + 2689745234U, // <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5> + 3357507922U, // <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5> + 3294397647U, // <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7> + 3373433334U, // <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7> + 1616003730U, // <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1> + 1550221414U, // <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,1,1,1>: Cost 1 vspltisw1 LHS + 2287093910U, // <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2> + 2287092615U, // <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3> + 1550224694U, // <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 2287092050U, // <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5> + 2689746127U, // <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7> + 2659800138U, // <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1> + 269271142U, // <1,1,1,u>: Cost 1 vspltisw1 LHS + 2222113516U, // <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1> + 2756854663U, // <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3> + 1148371862U, // <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689746598U, // <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1> + 2618002742U, // <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS + 2299707730U, // <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5> + 2689746874U, // <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7> + 3361506511U, // <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7> + 1148371862U, // <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689747094U, // <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2> + 2691074278U, // <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1> + 3356870806U, // <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2> + 2283126958U, // <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3> + 2689747458U, // <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6> + 3356868946U, // <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5> + 3811265144U, // <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7> + 3362841807U, // <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7> + 2689747742U, // <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2> + 2623987814U, // <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS + 2758181931U, // <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5> + 2223408022U, // <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0> + 3697731734U, // <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2> + 2283798784U, // 
<1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4> + 1616006454U, // <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 3297379535U, // <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7> + 3373466102U, // <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7> + 1616006697U, // <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 2760762479U, // <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1> + 2284470282U, // <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1> + 2284472470U, // <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2> + 3358212270U, // <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3> + 2284470285U, // <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4> + 1210728786U, // <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 2737524834U, // <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0> + 3360867535U, // <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7> + 1210728786U, // <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 3697746022U, // <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS + 2756854991U, // <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7> + 2737525242U, // <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3> + 3839149281U, // <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7> + 3697749302U, // <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS + 3356893522U, // <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5> + 2283151537U, // <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6> + 2791949566U, // <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0> + 2792613127U, // <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0> + 2737525754U, // <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2> + 2291786386U, // <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1> + 3365528292U, // <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2> + 3365528455U, // <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3> + 2737526118U, // <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6> + 3365527890U, // <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5> + 3365528377U, // <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6> + 2291786959U, // <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7> + 2737526402U, // <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2> + 1550221414U, // <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,1,u,1>: Cost 1 vspltisw1 LHS + 1148371862U, // <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689750972U, // <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1> + 1550224694U, // <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 1616009370U, // <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 2689751248U, // <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7> + 2736863497U, // <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1> + 269271142U, // <1,1,u,u>: Cost 1 vspltisw1 LHS + 2702360576U, // <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0> + 1628618854U, // <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS + 2685771949U, // <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2> + 2283765862U, // <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS + 2702360914U, // <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5> + 3788046813U, // <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0> + 2688426481U, // <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2> + 2726249024U, // <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0> + 1628619421U, // <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS + 2690417380U, // <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2> + 2702361396U, // <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1> + 2287093352U, // <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2> + 1213349990U, // <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS + 3764159522U, // <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5> + 3295053672U, // <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6> + 2221311930U, // <1,2,1,6>: Cost 3 vmrghw 
<1,1,1,1>, <2,6,3,7> + 3799991593U, // <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7> + 1213349995U, // <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS + 2624045158U, // <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS + 2702362144U, // <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2> + 2283120232U, // <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2> + 1225965670U, // <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS + 2624048438U, // <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS + 3356860763U, // <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5> + 2222114746U, // <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7> + 2299708632U, // <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7> + 1225965675U, // <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS + 470597734U, // <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS + 1544340276U, // <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544341096U, // <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544341916U, // <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3> + 470601014U, // <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS + 1592119300U, // <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1592119802U, // <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592120314U, // <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 470603566U, // <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS + 2708335471U, // <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2> + 3838043908U, // <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5> + 3357541992U, // <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2> + 2283798630U, // <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS + 2726251728U, // <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4> + 1628622134U, // <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 3297077178U, // <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7> + 2726251976U, // <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0> + 1628622377U, // <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 2714308168U, // <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2> + 3297633827U, // <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5> + 2284471912U, // <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2> + 1210728550U, // <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS + 3776106420U, // <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6> + 2726252548U, // <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5> + 2726252642U, // <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0> + 3799994538U, // <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0> + 1210728555U, // <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS + 2720280865U, // <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2> + 2702365096U, // <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2> + 2726253050U, // <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3> + 2283151462U, // <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS + 3697823030U, // <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS + 3298715497U, // <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7> + 2726253368U, // <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6> + 2724926296U, // <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2> + 2283151467U, // <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS + 1652511738U, // <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2> + 3371500916U, // <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1> + 3365529192U, // <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2> + 2291785830U, // <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS + 2726253926U, // <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6> + 3788051845U, // <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1> + 3794023894U, // <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1> + 2726254119U, // <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1> + 1657820802U, // <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2> + 470638699U, // <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS + 1544381236U, // 
<1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544382056U, // <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544382614U, // <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 470641974U, // <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS + 1628625050U, // <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 1592160762U, // <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592161274U, // <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 470644526U, // <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS + 2769389708U, // <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1> + 2685780070U, // <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2685780142U, // <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3> + 2686443775U, // <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3> + 2769684656U, // <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1> + 3357507940U, // <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5> + 3759522294U, // <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7> + 3357509562U, // <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7> + 2685780637U, // <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2287092630U, // <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0> + 2221312230U, // <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1> + 2691752839U, // <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3> + 2287093362U, // <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3> + 2287092634U, // <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4> + 3360835107U, // <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5> + 3759523041U, // <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7> + 2287093690U, // <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7> + 2287092638U, // <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u> + 2222114966U, // <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2> + 2222115057U, // <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3> + 2630092320U, // <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2> + 2685781670U, // <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1> + 2222115330U, // <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6> + 3373449572U, // <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5> + 2222115448U, // <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7> + 2299709370U, // <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7> + 2222115614U, // <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2> + 2771380607U, // <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1> + 3356874468U, // <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1> + 3759524168U, // <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0> + 2283792796U, // <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3> + 3356869530U, // <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4> + 3721760428U, // <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3> + 3296496248U, // <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7> + 3356870586U, // <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7> + 2771970503U, // <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1> + 2772044240U, // <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1> + 3362186135U, // <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1> + 3297151280U, // <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3> + 3357542002U, // <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3> + 3357540626U, // <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4> + 2685783350U, // <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 3357546622U, // <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6> + 3357542330U, // <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7> + 2685783593U, // <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 2284471190U, // <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0> + 3358213015U, // <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1> + 2630116899U, // <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5> + 2284471922U, // <1,3,5,3>: Cost 3 vmrglw 
<0,4,1,5>, <2,2,3,3> + 2284471194U, // <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4> + 2284471843U, // <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5> + 3358218366U, // <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6> + 2284472250U, // <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7> + 2284471198U, // <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u> + 2224752790U, // <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2> + 3832736385U, // <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7> + 3703866916U, // <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6> + 3356894834U, // <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3> + 3356894106U, // <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4> + 3356894755U, // <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5> + 3356899130U, // <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6> + 2283153338U, // <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7> + 2283153338U, // <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7> + 2774035139U, // <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1> + 3703874767U, // <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7> + 3703875109U, // <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7> + 3365529202U, // <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3> + 3365528474U, // <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4> + 3789387159U, // <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1> + 3865692927U, // <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7> + 3363538874U, // <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7> + 2774625035U, // <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1> + 2284495766U, // <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0> + 2685785902U, // <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2630141478U, // <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u> + 2283169880U, // <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3> + 2284495770U, // <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4> + 2685786266U, // <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 2222115448U, // <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7> + 2284496826U, // <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7> + 2685786469U, // <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2684461069U, // <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4> + 2686451814U, // <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS + 3759530159U, // <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4> + 2686451968U, // <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4> + 2684461394U, // <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5> + 1701989266U, // <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1> + 3776119286U, // <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7> + 2689106500U, // <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4> + 1702210477U, // <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1> + 2221312914U, // <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1> + 2691097399U, // <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4> + 3760194454U, // <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0> + 3766166489U, // <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4> + 2334870736U, // <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4> + 1147571510U, // <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS + 3760194794U, // <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7> + 3867315188U, // <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0> + 1147571753U, // <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS + 2222115730U, // <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1> + 2222115812U, // <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2> + 3760195176U, // <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2> + 2702378662U, // <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1> + 2323598544U, // <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4> + 1148374326U, 
// <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS + 3760195514U, // <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7> + 3373451932U, // <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7> + 1148374569U, // <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS + 2702379160U, // <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4> + 3760195840U, // <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0> + 3776121160U, // <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0> + 3760195996U, // <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3> + 2686454274U, // <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6> + 3356870350U, // <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5> + 3800009392U, // <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0> + 3366824604U, // <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7> + 2707688224U, // <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4> + 2775731368U, // <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0> + 3830820018U, // <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1> + 3691980454U, // <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1> + 3357541282U, // <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3> + 2781039824U, // <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4> + 2686455094U, // <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS + 3357541528U, // <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6> + 3810627020U, // <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4> + 2686455337U, // <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS + 2624217190U, // <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS + 2284470309U, // <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1> + 2618246822U, // <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1> + 3358212297U, // <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3> + 2284470312U, // <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4> + 2284470637U, // <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5> + 1683115318U, // <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 3721851898U, // <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2> + 1683115336U, // <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 3794039075U, // <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4> + 3830820186U, // <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7> + 3800011258U, // <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3> + 3807973938U, // <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5> + 3298716880U, // <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4> + 2224680246U, // <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS + 3800011576U, // <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6> + 2726269774U, // <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1> + 2224680489U, // <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS + 2726269948U, // <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4> + 3383444141U, // <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1> + 3805983961U, // <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0> + 3807974667U, // <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5> + 2736887142U, // <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6> + 3365528403U, // <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5> + 3800012308U, // <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0> + 3800012396U, // <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7> + 2731579012U, // <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4> + 2624241766U, // <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS + 2686457646U, // <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS + 2618271398U, // <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1> + 2734233544U, // <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4> + 2689775679U, // <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6> + 1152355638U, // <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS + 1683115561U, // <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 2736888076U, // 
<1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4> + 1683115579U, // <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 2687123456U, // <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0> + 1613381734U, // <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 3759538352U, // <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5> + 3760865532U, // <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0> + 1613381970U, // <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5> + 2687787427U, // <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5> + 2781777524U, // <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1> + 3733828717U, // <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0> + 1613382301U, // <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 2781040271U, // <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1> + 2687124276U, // <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1> + 2687124374U, // <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0> + 3760866297U, // <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0> + 2693096491U, // <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5> + 2687124591U, // <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1> + 2687124723U, // <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7> + 3360834803U, // <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7> + 2687124860U, // <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0> + 2323598792U, // <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0> + 2687125027U, // <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5> + 2687125096U, // <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2> + 2687125158U, // <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1> + 2642185188U, // <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2> + 2323598554U, // <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5> + 2687125434U, // <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7> + 3373450483U, // <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7> + 2687125563U, // <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1> + 2687125654U, // <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2> + 2312990234U, // <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1> + 3760867649U, // <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2> + 2687125916U, // <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3> + 2687126018U, // <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6> + 3386731738U, // <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5> + 3356871170U, // <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6> + 3808643779U, // <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1> + 2687126302U, // <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2> + 2642198630U, // <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS + 2687126498U, // <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0> + 3715941923U, // <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5> + 3709970701U, // <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4> + 2687126736U, // <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4> + 1613385014U, // <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2283801090U, // <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6> + 3733861489U, // <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4> + 1613385257U, // <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2624290918U, // <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS + 2624291676U, // <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5> + 3698034211U, // <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5> + 2284471211U, // <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3> + 2624294198U, // <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS + 2284471132U, // <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5> + 2284472834U, // <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6> + 2284471539U, // <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7> + 2284471216U, // <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, 
<1,2,5,u> + 2785316900U, // <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1> + 2781040691U, // <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7> + 2734903802U, // <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3> + 3848736834U, // <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4> + 3298717620U, // <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6> + 3298717700U, // <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5> + 2734904120U, // <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6> + 2781040738U, // <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0> + 2781040747U, // <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0> + 2734904314U, // <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2> + 2315677210U, // <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1> + 3808646292U, // <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3> + 3808646371U, // <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1> + 2734904678U, // <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6> + 3389418714U, // <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5> + 3365528656U, // <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6> + 2734904940U, // <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7> + 2734904962U, // <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2> + 2687129299U, // <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2> + 1613387566U, // <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 2687129480U, // <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3> + 2687129532U, // <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1> + 1661163546U, // <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5> + 1613387930U, // <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2687129808U, // <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7> + 2781040900U, // <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0> + 1613388133U, // <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 3759546368U, // <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0> + 2685804646U, // <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 2685804721U, // <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6> + 3861270834U, // <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1> + 3759546706U, // <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5> + 2687795620U, // <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6> + 2688459253U, // <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6> + 2283769142U, // <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS + 2685805213U, // <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 3698073702U, // <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS + 3759547188U, // <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1> + 2221314554U, // <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3> + 3759547401U, // <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7> + 3698076982U, // <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS + 3767510141U, // <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6> + 2334872376U, // <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6> + 1213353270U, // <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS + 1213353271U, // <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS + 3704053862U, // <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS + 3759547961U, // <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0> + 2222117370U, // <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3> + 3759548070U, // <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1> + 3704057142U, // <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS + 3373451057U, // <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5> + 2685806522U, // <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7> + 1225968950U, // <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS + 1225968951U, // <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS + 3759548566U, // <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2> + 3842912793U, // <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, 
<6,3,1,7> + 3759548774U, // <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3> + 3759548828U, // <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3> + 3759548930U, // <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6> + 3809315421U, // <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7> + 3386733368U, // <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6> + 2283130166U, // <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS + 2283130167U, // <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS + 3704070246U, // <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS + 3862229608U, // <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5> + 3704071741U, // <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4> + 3721988610U, // <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6> + 3704073526U, // <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS + 2685807926U, // <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 3865621141U, // <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5> + 2283801910U, // <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS + 2685808169U, // <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 3710050406U, // <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS + 3710051571U, // <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7> + 3405989597U, // <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2> + 3358214502U, // <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3> + 3710053686U, // <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS + 3721998025U, // <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5> + 2332250936U, // <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6> + 1210731830U, // <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS + 1210731831U, // <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS + 2791289597U, // <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1> + 3698115430U, // <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6> + 3698116538U, // <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7> + 3356894132U, // <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3> + 3698117942U, // <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS + 3722006218U, // <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6> + 2781041464U, // <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6> + 2283154742U, // <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS + 2283154743U, // <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS + 1718211406U, // <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1> + 2792026967U, // <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1> + 2765411170U, // <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3> + 3854783336U, // <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0> + 2781041526U, // <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5> + 3365528664U, // <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5> + 2791953290U, // <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7> + 2291789110U, // <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS + 1718801302U, // <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1> + 1718875039U, // <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1> + 2685810478U, // <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 2792764337U, // <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1> + 3759552444U, // <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1> + 2781041607U, // <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5> + 2685810842U, // <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 2689792208U, // <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7> + 1210756406U, // <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS + 1210756407U, // <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS + 2793280496U, // <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1> + 2694439014U, // <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 3393343912U, // <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2> + 3397325306U, // <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3> + 2793575444U, // 
<1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1> + 3722030797U, // <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0> + 2688467446U, // <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7> + 2689131079U, // <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7> + 2694439570U, // <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1> + 2654265354U, // <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1> + 2794017866U, // <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1> + 3768181639U, // <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3> + 2334872058U, // <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3> + 2654268726U, // <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS + 3792069797U, // <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1> + 2694440143U, // <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7> + 2334872386U, // <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7> + 2695767409U, // <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7> + 2654273638U, // <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS + 2222117973U, // <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3> + 2299711912U, // <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2> + 2654275734U, // <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2> + 2654276918U, // <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS + 3385397675U, // <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5> + 2654278056U, // <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2> + 2323599627U, // <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7> + 2654279470U, // <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS + 2795271395U, // <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1> + 3768183059U, // <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1> + 3728025254U, // <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1> + 3768183196U, // <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3> + 3768183298U, // <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6> + 3792071255U, // <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1> + 3780127361U, // <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7> + 3847779617U, // <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0> + 2795861291U, // <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1> + 2795935028U, // <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1> + 3728032975U, // <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7> + 3839153480U, // <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3> + 3397358074U, // <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3> + 3854783835U, // <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4> + 2694442294U, // <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 3786100058U, // <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7> + 3722065254U, // <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6> + 2694442537U, // <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 2654298214U, // <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS + 3854783893U, // <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u> + 3710126010U, // <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7> + 2332250618U, // <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3> + 2654301494U, // <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS + 2284474795U, // <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5> + 2718330931U, // <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7> + 2332250946U, // <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7> + 2719658197U, // <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7> + 2332921954U, // <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0> + 3768185254U, // <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0> + 3710134202U, // <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7> + 3710134561U, // <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6> + 3710135606U, // <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS + 3864884745U, // <1,7,6,5>: Cost 4 vsldoi12 
<6,5,7,1>, <7,6,5,7> + 3854784017U, // <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6> + 2791953940U, // <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0> + 2792617501U, // <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0> + 2797925927U, // <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1> + 3365528426U, // <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1> + 3728058022U, // <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1> + 3365528509U, // <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3> + 3854784079U, // <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5> + 3722088148U, // <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7> + 3728060845U, // <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7> + 2781042284U, // <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7> + 2798515823U, // <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1> + 2654322705U, // <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u> + 2694444846U, // <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 2299711912U, // <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2> + 2323649018U, // <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3> + 2654326070U, // <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS + 2694445210U, // <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 2654327214U, // <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u> + 2323649346U, // <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7> + 2694445413U, // <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 1610752017U, // <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u> + 1613406310U, // <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS + 2685821107U, // <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u> + 2283765916U, // <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS + 1613406549U, // <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u> + 1725880054U, // <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1> + 2688475639U, // <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u> + 2283769160U, // <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS + 1613406877U, // <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS + 1550221414U, // <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,u,1,1>: Cost 1 vspltisw1 LHS + 1683117870U, // <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 1213350044U, // <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS + 1550224694U, // <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 1147574426U, // <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS + 2687149326U, // <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7> + 1213353288U, // <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS + 269271142U, // <1,u,1,u>: Cost 1 vspltisw1 LHS + 2222118611U, // <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2> + 1148376878U, // <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS + 1148371862U, // <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 1225965724U, // <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS + 2222118975U, // <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6> + 1148377242U, // <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS + 2687150010U, // <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7> + 1225968968U, // <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS + 1148377445U, // <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS + 471040156U, // <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS + 1544782644U, // <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544783464U, // <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544784022U, // <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 471043382U, // <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS + 1592561668U, // <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1592562170U, // <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592562682U, // <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 471045934U, // <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS + 2708384629U, // <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, 
<4,0,1,u> + 2687151101U, // <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0> + 2223408022U, // <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0> + 2283798684U, // <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS + 2642422785U, // <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4> + 1613409590U, // <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 2283801090U, // <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6> + 2283801928U, // <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS + 1613409833U, // <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 2284471235U, // <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0> + 2284472046U, // <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1> + 2284472533U, // <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2> + 1210728604U, // <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS + 2284471239U, // <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4> + 1210728786U, // <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 1683118234U, // <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 1210731848U, // <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS + 1210728609U, // <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS + 2720330023U, // <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u> + 2757376190U, // <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7> + 2726302202U, // <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3> + 2283151516U, // <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS + 2224972114U, // <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5> + 2224683162U, // <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS + 2726302520U, // <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6> + 2283154760U, // <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS + 2283151521U, // <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS + 1652560896U, // <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u> + 2333590225U, // <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1> + 2765412628U, // <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3> + 2291785884U, // <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS + 2781042984U, // <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5> + 3365527953U, // <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5> + 2791954748U, // <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7> + 2291789128U, // <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS + 1657869960U, // <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u> + 471081121U, // <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS + 269271142U, // <1,u,u,1>: Cost 1 vspltisw1 LHS + 1544824424U, // <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544824982U, // <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 471084342U, // <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS + 1613412506U, // <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 1683118477U, // <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 1210756424U, // <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS + 471086894U, // <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS + 2226757632U, // <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0> + 2226757734U, // <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS + 3826622483U, // <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1> + 3843211292U, // <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1> + 3300499794U, // <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5> + 3356256724U, // <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5> + 3825664056U, // <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2> + 3762889289U, // <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0> + 2226758301U, // <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS + 2227429386U, // <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1> + 2227429478U, // <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS + 1691156582U, // <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2666358997U, // <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2> + 2227462482U, // <2,0,1,4>: Cost 3 vmrghw 
<2,1,3,5>, <0,4,1,5> + 3722186464U, // <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1> + 3867099278U, // <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7> + 3366881912U, // <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7> + 1691156636U, // <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2228027392U, // <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0> + 1154285670U, // <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS + 2228027565U, // <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2> + 3301769468U, // <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0> + 2228027730U, // <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5> + 3301769635U, // <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5> + 3780806586U, // <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7> + 3368880760U, // <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7> + 1154286237U, // <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS + 1213440000U, // <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1213441702U, // <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 2228535470U, // <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3> + 2636515632U, // <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3> + 2287182962U, // <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4> + 2660405346U, // <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0> + 2228535798U, // <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7> + 2660406420U, // <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3> + 1213441709U, // <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 3368894464U, // <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0> + 2764898642U, // <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5> + 3826622811U, // <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5> + 3843211620U, // <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5> + 3838640493U, // <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5> + 2732944694U, // <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS + 3797396857U, // <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2> + 3867099528U, // <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5> + 2764898705U, // <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5> + 3364257792U, // <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0> + 2230124646U, // <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS + 3304235184U, // <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5> + 3364260144U, // <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3> + 3303817554U, // <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5> + 3364260146U, // <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5> + 3867099602U, // <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7> + 3364260472U, // <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7> + 2230125213U, // <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS + 2230796288U, // <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0> + 1157054566U, // <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS + 2230796465U, // <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6> + 3304538364U, // <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0> + 2230796626U, // <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5> + 3797398205U, // <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0> + 3304538614U, // <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7> + 3798725471U, // <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0> + 1157055133U, // <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS + 3371573248U, // <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0> + 2231189606U, // <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS + 3801380003U, // <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0> + 3802043636U, // <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0> + 3806688614U, // <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6> + 3356317308U, // <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5> + 3804034535U, // <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, 
<7,6,2,0> + 3806688876U, // <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7> + 2231190173U, // <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS + 1208836096U, // <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1208837798U, // <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 1691157149U, // <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2636556597U, // <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u> + 2282579625U, // <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4> + 2660446306U, // <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0> + 2228535798U, // <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7> + 2660447385U, // <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u> + 1208837805U, // <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 3692388523U, // <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0> + 2757526244U, // <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2> + 2330290974U, // <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2> + 3843212020U, // <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0> + 3692391734U, // <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS + 3300533362U, // <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4> + 3794084337U, // <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2> + 3374170614U, // <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7> + 2758042403U, // <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2> + 2690482924U, // <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1> + 2764899124U, // <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1> + 2695791510U, // <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0> + 3362235271U, // <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3> + 3692399926U, // <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS + 3832226649U, // <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2> + 3301205235U, // <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7> + 3768870179U, // <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1> + 2695791988U, // <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1> + 2618663085U, // <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2> + 2228028212U, // <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1> + 2618664552U, // <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2> + 2759000984U, // <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2> + 2618666294U, // <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS + 2295136594U, // <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5> + 3769534376U, // <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7> + 2793358266U, // <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0> + 2618668846U, // <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS + 2282536969U, // <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1208795146U, // <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1213442198U, // <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2287181998U, // <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2618674486U, // <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS + 1208795474U, // <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2287182001U, // <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287183055U, // <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1208795153U, // <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 3692421295U, // <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4> + 3838641195U, // <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5> + 2330323742U, // <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2> + 3692423318U, // <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2> + 3692424502U, // <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS + 2695793974U, // <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS + 3799395705U, // <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2> + 3368895695U, // <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7> + 2695794217U, // <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS + 3692429488U, // <2,1,5,0>: Cost 4 vsldoi4 
<0,2,1,5>, <0,2,1,5> + 3364257802U, // <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1> + 3692431253U, // <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6> + 3692431874U, // <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6> + 3692432694U, // <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS + 3364258130U, // <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5> + 3303875827U, // <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7> + 3867100333U, // <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0> + 3692435246U, // <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS + 2618695857U, // <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6> + 2230797108U, // <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1> + 2618697658U, // <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7> + 3692439702U, // <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2> + 2618699062U, // <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS + 3364929874U, // <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5> + 3692442424U, // <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6> + 3798733664U, // <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1> + 2618701614U, // <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS + 3799397370U, // <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2> + 3371573258U, // <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1> + 2330351234U, // <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2> + 3799397658U, // <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2> + 3799397734U, // <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6> + 3371573586U, // <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5> + 3799397870U, // <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7> + 3799397956U, // <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3> + 2330351234U, // <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2> + 2282577929U, // <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1208836106U, // <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1208838294U, // <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2282578094U, // <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2282577933U, // <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4> + 1208836434U, // <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2282578097U, // <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287224015U, // <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1208836113U, // <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 2226759117U, // <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0> + 1624047718U, // <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 2697789613U, // <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2> + 2226767526U, // <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1> + 2697789778U, // <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5> + 3300657000U, // <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6> + 2226988986U, // <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7> + 3734271139U, // <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0> + 1624048285U, // <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 3831268868U, // <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1> + 2293138804U, // <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1> + 2697790358U, // <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0> + 2293137510U, // <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS + 3771532331U, // <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5> + 3767551106U, // <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2> + 3301173178U, // <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7> + 3372853169U, // <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7> + 2293137515U, // <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS + 1556938854U, // <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 2295137733U, // <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1> + 336380006U, // <2,2,2,2>: Cost 1 vspltisw2 LHS + 1221394534U, // <2,2,2,3>: Cost 2 vmrglw 
<2,2,2,2>, LHS + 1556942134U, // <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 2295138061U, // <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5> + 2228029370U, // <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7> + 2660545701U, // <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2> + 336380006U, // <2,2,2,u>: Cost 1 vspltisw2 LHS + 2697791638U, // <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2> + 2765489840U, // <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2> + 1213441640U, // <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2> + 135053414U, // <2,2,3,3>: Cost 1 vmrglw LHS, LHS + 2697792002U, // <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6> + 2330313780U, // <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5> + 2287183549U, // <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6> + 2660553894U, // <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3> + 135053419U, // <2,2,3,u>: Cost 1 vmrglw LHS, LHS + 2630697062U, // <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS + 3771534282U, // <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3> + 2764900109U, // <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5> + 2295152742U, // <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS + 2295154282U, // <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4> + 1624050998U, // <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 2229675962U, // <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7> + 3368896433U, // <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7> + 1624051241U, // <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 3771534920U, // <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2> + 3364258540U, // <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1> + 2296489576U, // <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2> + 2290516070U, // <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS + 3771535284U, // <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6> + 2290517044U, // <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5> + 2697793634U, // <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0> + 3370231729U, // <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7> + 2290516075U, // <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS + 2230797801U, // <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1> + 3304539679U, // <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1> + 2764900273U, // <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7> + 2764900282U, // <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7> + 2230798129U, // <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5> + 3304540008U, // <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6> + 1157056442U, // <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2725000033U, // <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2> + 1157056442U, // <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2793359338U, // <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1> + 3371574725U, // <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1> + 2297833064U, // <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2> + 2297831526U, // <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS + 2697794918U, // <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6> + 3371575053U, // <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5> + 3304933297U, // <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7> + 2297833393U, // <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7> + 2297831531U, // <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS + 1556938854U, // <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 1624053550U, // <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 336380006U, // <2,2,u,2>: Cost 1 vspltisw2 LHS + 135094374U, // <2,2,u,3>: Cost 1 vmrglw LHS, LHS + 1556942134U, // <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 1624053914U, // <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 1157056442U, // <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2660594859U, // <2,2,u,7>: 
Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u> + 135094379U, // <2,2,u,u>: Cost 1 vmrglw LHS, LHS + 1611448320U, // <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 537706598U, // <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS + 2689835181U, // <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2689835260U, // <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611448658U, // <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2732966354U, // <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7> + 2732966390U, // <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7> + 2660603052U, // <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0> + 537707165U, // <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS + 2689835748U, // <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2> + 1611449140U, // <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611449238U, // <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 3763577805U, // <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1> + 2689836112U, // <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6> + 2689836143U, // <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2689836239U, // <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 3366881210U, // <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7> + 1616094588U, // <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 2689836493U, // <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0> + 2685191711U, // <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1> + 1611449960U, // <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2> + 1611450022U, // <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 2689836822U, // <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5> + 2689836904U, // <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6> + 1611450298U, // <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 2295138234U, // <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7> + 1611450456U, // <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3> + 1213440918U, // <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 2282538527U, // <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1> + 1557022322U, // <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3> + 1208796786U, // <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3> + 1213440922U, // <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 2282538531U, // <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5> + 2287188094U, // <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6> + 1213441978U, // <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 1208796791U, // <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u> + 1551056998U, // <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS + 1551057818U, // <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4> + 2624800360U, // <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2> + 2624800918U, // <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2> + 1551060278U, // <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS + 537709878U, // <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS + 2732969337U, // <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2> + 2660635824U, // <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4> + 537710121U, // <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS + 2689838664U, // <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2> + 2732969615U, // <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1> + 2732969707U, // <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3> + 3763580721U, // <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1> + 2689839028U, // <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6> + 1659228164U, // <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1659228258U, // <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0> + 3364259770U, // <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7> + 1659228420U, // <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0> + 2230798486U, // <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2> + 2732970407U, // <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1> + 1659228666U, // <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3> + 2230798748U, // <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3> + 2230798850U, // 
<2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6> + 2732970731U, // <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1> + 1659228984U, // <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659229006U, // <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1659229087U, // <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1> + 1659229178U, // <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2> + 2726999125U, // <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3> + 2727662758U, // <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3> + 2732971235U, // <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1> + 1659229542U, // <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6> + 2732971446U, // <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5> + 2732971484U, // <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7> + 1659229804U, // <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7> + 1659229826U, // <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2> + 1208837014U, // <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 537712430U, // <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS + 1616099205U, // <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0> + 1208837746U, // <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3> + 1208837018U, // <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 537712794U, // <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS + 1616099536U, // <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7> + 1208838074U, // <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 537712997U, // <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS + 3771547648U, // <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0> + 2697805926U, // <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS + 3770884269U, // <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2> + 3806716164U, // <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u> + 3771547986U, // <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5> + 2226761014U, // <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS + 3853462427U, // <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1> + 3867102116U, // <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1> + 2226761257U, // <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS + 3849186231U, // <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2> + 3301207010U, // <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0> + 3766240150U, // <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0> + 3766240226U, // <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4> + 3301207248U, // <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4> + 2227432758U, // <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS + 3758941400U, // <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7> + 3768894758U, // <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4> + 2227433001U, // <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS + 2228030354U, // <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1> + 3770885657U, // <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4> + 2697807466U, // <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4> + 3368880468U, // <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3> + 2228030672U, // <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4> + 1154288950U, // <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS + 3771549617U, // <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7> + 3368880796U, // <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7> + 1154289193U, // <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS + 2636808294U, // <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS + 2287181861U, // <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1> + 2228866102U, // <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3> + 2636810580U, // <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3> + 1256574160U, // <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1213441742U, // <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 2228866430U, // <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7> + 2660701368U, // <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3> + 1213441745U, // <2,4,3,u>: 
Cost 2 vmrglw LHS, <2,3,4,u> + 3704586342U, // <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS + 3782831051U, // <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4> + 3704587900U, // <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4> + 3368896123U, // <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3> + 2793360592U, // <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4> + 2697809206U, // <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS + 3303198078U, // <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7> + 3867102444U, // <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5> + 2697809449U, // <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS + 2630852710U, // <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS + 2624881572U, // <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5> + 2630854269U, // <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5> + 2666686677U, // <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2> + 2630855990U, // <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS + 2230127926U, // <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS + 1691159862U, // <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 3867102520U, // <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0> + 1691159880U, // <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2230799250U, // <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1> + 3304541130U, // <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3> + 2230799417U, // <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6> + 3304541323U, // <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7> + 2230799568U, // <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4> + 1157057846U, // <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS + 3304541566U, // <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7> + 3798758243U, // <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4> + 1157058089U, // <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS + 3806721018U, // <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2> + 3853831590U, // <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2> + 3801412775U, // <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4> + 3802076408U, // <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4> + 3401436368U, // <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4> + 2793360840U, // <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0> + 3804067307U, // <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4> + 3867102682U, // <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0> + 2793360867U, // <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0> + 2630877286U, // <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS + 2282580144U, // <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1> + 2630878848U, // <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u> + 2636851545U, // <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u> + 1256615120U, // <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1208837838U, // <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 1691160105U, // <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2660742333U, // <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u> + 1208837841U, // <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 3766910976U, // <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0> + 2693169254U, // <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS + 3760939181U, // <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2> + 3843214936U, // <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0> + 3760939355U, // <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5> + 3867102827U, // <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1> + 3867102836U, // <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1> + 3867102844U, // <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0> + 2693169821U, // <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS + 3766911724U, // <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1> + 3766911796U, // <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, 
<1,1,1,1> + 2693170070U, // <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0> + 3384798262U, // <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3> + 2693170228U, // <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5> + 3301208068U, // <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5> + 3366879607U, // <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6> + 3867102925U, // <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0> + 2695824760U, // <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5> + 2642845798U, // <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS + 2295139218U, // <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1> + 2699142760U, // <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2> + 3766912678U, // <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1> + 2699142925U, // <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5> + 2228031492U, // <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5> + 2295138818U, // <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6> + 3368879347U, // <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7> + 2295138820U, // <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u> + 2287184866U, // <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1256573842U, // <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2642855630U, // <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5> + 2287182763U, // <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287184870U, // <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1256574170U, // <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1213442562U, // <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287183091U, // <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1213442564U, // <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 3716604006U, // <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS + 3716604822U, // <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0> + 3766914099U, // <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0> + 3368895403U, // <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3> + 3716607031U, // <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4> + 2693172534U, // <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS + 3363588610U, // <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6> + 3368895731U, // <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7> + 2693172777U, // <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS + 3704668262U, // <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS + 3704669078U, // <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0> + 3704669830U, // <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5> + 3364259460U, // <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3> + 3704671542U, // <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS + 2793361412U, // <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5> + 3364258167U, // <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6> + 3867103249U, // <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0> + 2793361412U, // <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5> + 2642878566U, // <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS + 3386166810U, // <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1> + 2723033594U, // <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3> + 3848523842U, // <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4> + 2723033713U, // <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5> + 2230800388U, // <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5> + 2230800482U, // <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0> + 2785841252U, // <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2> + 2785914989U, // <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2> + 3796775930U, // <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2> + 3800757335U, // <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5> + 3853463689U, // <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3> + 3796776218U, // <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2> + 
3796776294U, // <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6> + 3803411867U, // <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5> + 3371575081U, // <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6> + 3796776516U, // <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3> + 3371575083U, // <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u> + 2287225826U, // <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1256614802U, // <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2642896590U, // <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5> + 2287223723U, // <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287225830U, // <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1256615130U, // <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1208838658U, // <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287224051U, // <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1208838660U, // <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 3772227584U, // <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0> + 2698485862U, // <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 3759620282U, // <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6> + 3710675299U, // <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0> + 3767583058U, // <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5> + 3378153265U, // <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5> + 3865186637U, // <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1> + 2330291510U, // <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS + 2698486429U, // <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 3734569062U, // <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS + 3764929346U, // <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6> + 3772228502U, // <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0> + 3734571158U, // <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2> + 3734572342U, // <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS + 3767583878U, // <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6> + 3768247511U, // <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6> + 2293140790U, // <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS + 2293140791U, // <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS + 3704717414U, // <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS + 3395424589U, // <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1> + 2228031993U, // <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2> + 2698487485U, // <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6> + 3704720694U, // <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS + 3773556575U, // <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6> + 2698487738U, // <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7> + 1221397814U, // <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS + 1221397815U, // <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS + 2636955750U, // <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS + 2330314217U, // <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1> + 2636957626U, // <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7> + 2287184230U, // <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 2636959030U, // <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS + 2648903448U, // <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3> + 1256575800U, // <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135056694U, // <2,6,3,7>: Cost 1 vmrglw LHS, RHS + 135056695U, // <2,6,3,u>: Cost 1 vmrglw LHS, RHS + 3710705766U, // <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS + 3698762677U, // <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4> + 3710707389U, // <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6> + 3710708071U, // <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4> + 3710709046U, // <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS + 2698489142U, // <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS + 3796782457U, // <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2> + 2295156022U, // <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS + 
2295156023U, // <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS + 3303870753U, // <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2> + 3788820134U, // <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6> + 3779530520U, // <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3> + 3303871026U, // <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5> + 3303871117U, // <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6> + 3791474666U, // <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6> + 3792138299U, // <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6> + 2290519350U, // <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS + 2290519351U, // <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS + 2631008358U, // <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS + 3372893673U, // <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1> + 2791445264U, // <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2> + 2230800968U, // <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0> + 2631011638U, // <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS + 3372894001U, // <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5> + 2793362232U, // <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6> + 2295835958U, // <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS + 2295835959U, // <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS + 2793362254U, // <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1> + 2792035160U, // <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2> + 2792108897U, // <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2> + 2769474408U, // <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0> + 2793362294U, // <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5> + 3371575089U, // <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5> + 2792403845U, // <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2> + 2297834806U, // <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS + 2297834807U, // <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS + 2636996710U, // <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS + 2698491694U, // <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 2636998631U, // <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7> + 2282580326U, // <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 2636999990U, // <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS + 2698492058U, // <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS + 1256616760U, // <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135097654U, // <2,6,u,7>: Cost 1 vmrglw LHS, RHS + 135097655U, // <2,6,u,u>: Cost 1 vmrglw LHS, RHS + 2666864742U, // <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS + 1719620602U, // <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2> + 3768254637U, // <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2> + 3393417722U, // <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3> + 2666868022U, // <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS + 3867104290U, // <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6> + 3728667127U, // <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0> + 2666869817U, // <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2> + 1720136761U, // <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2> + 3728670822U, // <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS + 3774227252U, // <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1> + 3774227350U, // <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0> + 2323001850U, // <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3> + 3728674102U, // <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS + 3774227567U, // <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1> + 2694513880U, // <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7> + 3396744002U, // <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7> + 2323001850U, // <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3> + 2654937190U, // <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS + 3728679732U, // <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1> + 
2700486248U, // <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2> + 2321682938U, // <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3> + 2654940470U, // <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS + 3859584196U, // <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6> + 2700486577U, // <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7> + 2228033132U, // <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7> + 2701813843U, // <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7> + 1581203558U, // <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS + 2654946100U, // <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1> + 2637031354U, // <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7> + 1256575482U, // <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3> + 1581206838U, // <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS + 2654949380U, // <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5> + 1581208058U, // <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3> + 1256575810U, // <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1581209390U, // <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS + 3728695398U, // <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS + 3869758782U, // <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2> + 3728696936U, // <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2> + 3393450490U, // <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3> + 3728698678U, // <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS + 2700487990U, // <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 3728699899U, // <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4> + 3867104626U, // <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0> + 2700488233U, // <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 3855160709U, // <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1> + 3728704406U, // <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0> + 3370233956U, // <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2> + 2320380410U, // <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3> + 3728706870U, // <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS + 3867104694U, // <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5> + 3792146492U, // <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7> + 3394122562U, // <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7> + 2320380410U, // <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3> + 2230801402U, // <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2> + 3768258984U, // <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2> + 2730349050U, // <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3> + 3372894575U, // <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3> + 2230801766U, // <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6> + 3304543670U, // <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5> + 3728716285U, // <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6> + 2230802028U, // <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7> + 2730349050U, // <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3> + 2793362983U, // <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1> + 3728721112U, // <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7> + 3371574933U, // <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2> + 2327695866U, // <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3> + 3728723254U, // <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS + 3371574855U, // <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5> + 2730350062U, // <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7> + 2793363052U, // <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7> + 2798671471U, // <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1> + 1581244518U, // <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS + 1724929666U, // <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2> + 2637072314U, // <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7> + 1256616442U, // <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3> + 
1581247798U, // <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS + 2700490906U, // <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 1581249023U, // <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u> + 1256616770U, // <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1581250350U, // <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS + 1611489280U, // <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 537747563U, // <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685231277U, // <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2685231356U, // <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611489618U, // <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2226763930U, // <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS + 2733007350U, // <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7> + 2660971737U, // <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0> + 537748125U, // <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS + 2689876708U, // <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2> + 1611490100U, // <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611490198U, // <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 2293137564U, // <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS + 2689877072U, // <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6> + 2689877103U, // <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2689877199U, // <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 2293140808U, // <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS + 1616135548U, // <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 1556938854U, // <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 1154291502U, // <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS + 336380006U, // <2,u,2,2>: Cost 1 vspltisw2 LHS + 1611490982U, // <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 1556942134U, // <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 1154291866U, // <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS + 1611491258U, // <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 1221397832U, // <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS + 336380006U, // <2,u,2,u>: Cost 1 vspltisw2 LHS + 1611491478U, // <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2> + 1213440073U, // <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1> + 1213442261U, // <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2> + 135053468U, // <2,u,3,3>: Cost 1 vmrglw LHS, LHS + 1611491842U, // <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6> + 1213440401U, // <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5> + 1213442589U, // <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135056712U, // <2,u,3,7>: Cost 1 vmrglw LHS, RHS + 135053473U, // <2,u,3,u>: Cost 1 vmrglw LHS, LHS + 1551425638U, // <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS + 1551426503U, // <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4> + 2625169000U, // <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2> + 2625169558U, // <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2> + 1551428918U, // <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS + 537750838U, // <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS + 2733010297U, // <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2> + 2295156040U, // <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS + 537751081U, // <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS + 2689879624U, // <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2> + 2230130478U, // <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS + 2631149217U, // <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5> + 2290516124U, // <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS + 2689879988U, // <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6> + 1659269124U, // <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1691162778U, // <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2290519368U, // <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS + 1691162796U, // <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2230802131U, // <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2> + 1157060398U, // <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS + 
1659269626U, // <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3> + 2764904656U, // <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7> + 2230802495U, // <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6> + 1157060762U, // <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS + 1659269944U, // <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659269966U, // <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1157060965U, // <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS + 1659270138U, // <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2> + 2727040090U, // <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u> + 2727703723U, // <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u> + 2297831580U, // <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS + 1659270502U, // <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6> + 2733012406U, // <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5> + 2730358255U, // <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u> + 1659270764U, // <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7> + 1659270786U, // <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2> + 1213481923U, // <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0> + 537753390U, // <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS + 336380006U, // <2,u,u,2>: Cost 1 vspltisw2 LHS + 135094428U, // <2,u,u,3>: Cost 1 vmrglw LHS, LHS + 1213481927U, // <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4> + 537753754U, // <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS + 1208838685U, // <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135097672U, // <2,u,u,7>: Cost 1 vmrglw LHS, RHS + 135094433U, // <2,u,u,u>: Cost 1 vmrglw LHS, LHS + 1678557184U, // <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0> + 1678557194U, // <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1> + 2631181989U, // <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0> + 2289223984U, // <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3> + 2756943909U, // <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1> + 3362965729U, // <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5> + 3362966054U, // <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6> + 2289224312U, // <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7> + 1683202121U, // <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1> + 1557446758U, // <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS + 2752741467U, // <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1> + 604815462U, // <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS + 2631190676U, // <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0> + 1557450038U, // <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS + 2667024388U, // <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5> + 2800074894U, // <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7> + 2661053667U, // <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1> + 604815516U, // <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696521165U, // <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0> + 2752741549U, // <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2> + 2691876456U, // <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2> + 2691876518U, // <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1> + 3830685895U, // <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1> + 3765618536U, // <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6> + 2691876794U, // <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7> + 2701166596U, // <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0> + 2756944108U, // <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2> + 2691877014U, // <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2> + 1161003110U, // <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS + 2691877168U, // <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3> + 2691877246U, // <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0> + 2691877378U, // <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6> + 3765619238U, // <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6> + 2691877496U, // <3,0,3,6>: Cost 3 vsldoi8 
<1,2,3,0>, <3,6,0,7> + 3368962680U, // <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7> + 1161003677U, // <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS + 2289254400U, // <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0> + 1678557522U, // <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5> + 2631214761U, // <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4> + 2235580672U, // <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4> + 2756944237U, // <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5> + 1618136374U, // <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS + 3309322742U, // <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7> + 3362998904U, // <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7> + 1683202449U, // <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5> + 3765620296U, // <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2> + 2752299427U, // <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5> + 3789508346U, // <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0> + 3403486842U, // <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3> + 3765620660U, // <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6> + 2733682692U, // <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5> + 2800075218U, // <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7> + 3873817044U, // <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0> + 2800075234U, // <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5> + 2752299501U, // <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7> + 2236547174U, // <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS + 2733683194U, // <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3> + 3844473352U, // <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7> + 3310289234U, // <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5> + 3873817114U, // <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7> + 2733683512U, // <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6> + 2725057384U, // <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0> + 2236547741U, // <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS + 2297905152U, // <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0> + 2297906854U, // <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1> + 2727711916U, // <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0> + 3371649328U, // <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3> + 2733684070U, // <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6> + 3734843490U, // <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0> + 3798799895U, // <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3> + 2733684332U, // <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7> + 2297906861U, // <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u> + 1557504102U, // <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS + 1678557842U, // <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1> + 604816029U, // <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS + 2691880892U, // <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1> + 1557507382U, // <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS + 1618139290U, // <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS + 2691881168U, // <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7> + 2661111018U, // <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u> + 604816083U, // <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS + 2619310332U, // <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0> + 2756944612U, // <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2> + 2289221724U, // <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2> + 2619312278U, // <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2> + 2619313462U, // <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS + 2289221970U, // <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5> + 2232599768U, // <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7> + 3362964687U, // <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7> + 2619316014U, // <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS + 2756944683U, // <3,1,1,0>: Cost 3 
vsldoi12 LHS, <1,1,0,1> + 1678558004U, // <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2691883927U, // <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1> + 3826631496U, // <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3> + 2756944723U, // <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5> + 2756944732U, // <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5> + 3830686561U, // <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1> + 3734869228U, // <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1> + 1678558004U, // <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2696529358U, // <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1> + 2756944775U, // <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3> + 2294548630U, // <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2> + 1678558102U, // <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0> + 2631273782U, // <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS + 2756944811U, // <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3> + 3830686644U, // <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3> + 2800075706U, // <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0> + 1679000515U, // <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0> + 2619334911U, // <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3> + 2295218186U, // <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1> + 2293229718U, // <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2> + 2619337116U, // <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3> + 2619338038U, // <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS + 2295218514U, // <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5> + 3830686729U, // <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7> + 3368961231U, // <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7> + 2619340590U, // <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS + 2619343104U, // <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4> + 2289254410U, // <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1> + 2289256598U, // <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2> + 2619345410U, // <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6> + 2619346230U, // <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS + 2756944976U, // <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6> + 3362996401U, // <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6> + 3362997455U, // <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7> + 2619348782U, // <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS + 2756945007U, // <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1> + 3830686840U, // <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1> + 3358361750U, // <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2> + 3830686857U, // <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0> + 2756945047U, // <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5> + 2294571346U, // <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5> + 3806105698U, // <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0> + 3873817774U, // <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1> + 2756945079U, // <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1> + 3830686912U, // <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1> + 2756945103U, // <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7> + 2236547990U, // <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0> + 3826631905U, // <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7> + 3830686952U, // <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5> + 2756945139U, // <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7> + 3830686972U, // <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7> + 2800076030U, // <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0> + 2756945166U, // <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7> + 3699081318U, // <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS + 2297905162U, // <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1> + 2297907350U, // <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2> + 3365675182U, // <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3> + 
3699084598U, // <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS + 2297905490U, // <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5> + 2297905329U, // <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6> + 3368330447U, // <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7> + 2297905169U, // <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u> + 2619375876U, // <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u> + 1678558004U, // <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2289289366U, // <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2> + 1679000956U, // <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0> + 2619378998U, // <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS + 2756945297U, // <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3> + 2297905329U, // <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6> + 2800076192U, // <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0> + 1683203497U, // <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0> + 3362964203U, // <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0> + 2289222380U, // <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1> + 2289222462U, // <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2> + 1215479910U, // <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS + 3362964207U, // <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4> + 2289222708U, // <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5> + 2232600506U, // <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7> + 3396142296U, // <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7> + 1215479915U, // <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS + 3699105894U, // <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS + 3765633844U, // <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1> + 2691892120U, // <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2> + 2752300575U, // <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1> + 3699109174U, // <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS + 3830687280U, // <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0> + 3830687289U, // <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0> + 3874260548U, // <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2> + 2752742988U, // <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1> + 2631344230U, // <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS + 2697201184U, // <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2> + 1678558824U, // <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2> + 1678558834U, // <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3> + 2631347510U, // <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS + 3368953613U, // <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5> + 2234304442U, // <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7> + 3368953777U, // <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7> + 1679001247U, // <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3> + 1678558886U, // <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1> + 2752300719U, // <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1> + 2752300729U, // <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2> + 1221476454U, // <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS + 1678558926U, // <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5> + 2800076503U, // <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5> + 2234746810U, // <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7> + 2800076516U, // <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0> + 1678558958U, // <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1> + 3699130470U, // <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS + 3362996972U, // <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1> + 2289256040U, // <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2> + 1215512678U, // <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS + 3362998676U, // <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4> + 2691894582U, // <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS + 2235582394U, // <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7> + 3734967544U, // <3,2,4,7>: Cost 4 vsldoi4 
<7,3,2,4>, <7,3,2,4> + 1215512683U, // <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS + 3705110630U, // <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS + 3368313985U, // <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1> + 3368314472U, // <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2> + 2756945768U, // <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6> + 3705113910U, // <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS + 3310061416U, // <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6> + 3310135226U, // <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7> + 3370305457U, // <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7> + 2752743317U, // <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6> + 2631376998U, // <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS + 3705119540U, // <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1> + 2631378621U, // <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6> + 1678559162U, // <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7> + 2631380278U, // <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS + 3370976956U, // <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5> + 2237065146U, // <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7> + 3798815594U, // <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2> + 1679001575U, // <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7> + 2800076778U, // <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1> + 3371647724U, // <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1> + 2297906792U, // <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2> + 1224163430U, // <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS + 3705130294U, // <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS + 3371648052U, // <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5> + 2297906877U, // <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6> + 3371648702U, // <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7> + 1224163435U, // <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS + 1679001659U, // <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1> + 2752743492U, // <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1> + 1678558824U, // <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2> + 1678559320U, // <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3> + 1679001699U, // <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5> + 2691897498U, // <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS + 2237908922U, // <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7> + 2800519289U, // <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0> + 1679001731U, // <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1> + 1215480726U, // <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0> + 1678559382U, // <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2> + 2631403200U, // <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0> + 2289223282U, // <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3> + 2752301232U, // <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1> + 3362965027U, // <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5> + 3362965352U, // <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6> + 2289223610U, // <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7> + 1678559445U, // <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2> + 3830687964U, // <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0> + 2752301286U, // <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1> + 2752301297U, // <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3> + 2305157532U, // <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3> + 3830688000U, // <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0> + 3830688009U, // <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0> + 3830688019U, // <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1> + 3362973626U, // <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7> + 2752743719U, // <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3> + 2631417958U, // <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS + 3826043193U, // <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3> + 1624131186U, // 
<3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3> + 2752301384U, // <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0> + 2631421238U, // <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS + 3826485602U, // <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u> + 2752301414U, // <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3> + 2771249519U, // <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3> + 1628112984U, // <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3> + 1563656294U, // <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS + 2301855911U, // <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1> + 2697873730U, // <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3> + 403488870U, // <3,3,3,3>: Cost 1 vspltisw3 LHS + 1563659574U, // <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS + 2301856239U, // <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5> + 2697874067U, // <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7> + 2295220154U, // <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7> + 403488870U, // <3,3,3,u>: Cost 1 vspltisw3 LHS + 2289255318U, // <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0> + 2631435162U, // <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4> + 2631435972U, // <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4> + 2289256050U, // <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3> + 1215513498U, // <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4> + 1679002114U, // <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6> + 3362998120U, // <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6> + 2289256378U, // <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7> + 1679002141U, // <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6> + 3831130657U, // <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1> + 3376277671U, // <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1> + 3771617012U, // <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3> + 2302536092U, // <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3> + 3831130697U, // <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5> + 2294572579U, // <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5> + 2800519773U, // <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7> + 3368314810U, // <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7> + 2800519791U, // <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7> + 2800077432U, // <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7> + 3310291185U, // <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3> + 2789165706U, // <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7> + 2764982931U, // <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7> + 2800077468U, // <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7> + 3873819301U, // <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7> + 2297235304U, // <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6> + 2725081963U, // <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3> + 2725745596U, // <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3> + 2631458918U, // <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS + 3705201460U, // <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1> + 2631460551U, // <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7> + 2297906802U, // <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3> + 2631462198U, // <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS + 3371648547U, // <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5> + 3371648548U, // <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6> + 1224165306U, // <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 1224165306U, // <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 1215480726U, // <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0> + 1679002398U, // <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2> + 1659967368U, // <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3> + 403488870U, // <3,3,u,3>: Cost 1 vspltisw3 LHS + 1563659574U, // <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS + 1679002438U, // <3,3,u,5>: Cost 
2 vsldoi12 LHS, <3,u,5,6> + 2756946764U, // <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3> + 1224165306U, // <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 403488870U, // <3,3,u,u>: Cost 1 vspltisw3 LHS + 2691907584U, // <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0> + 1618165862U, // <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 2631476937U, // <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0> + 2232601732U, // <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0> + 2691907922U, // <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5> + 1158860086U, // <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS + 3306343806U, // <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7> + 3366947484U, // <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7> + 1618166429U, // <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 2631483494U, // <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS + 2691908404U, // <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1> + 1618166682U, // <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4> + 3765650393U, // <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4> + 2631486774U, // <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS + 2756946914U, // <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0> + 3765650639U, // <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7> + 3735090439U, // <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1> + 1622148480U, // <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4> + 3765650893U, // <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0> + 3831131154U, // <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3> + 2691909224U, // <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2> + 2691909286U, // <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1> + 2699208469U, // <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4> + 2233863478U, // <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS + 2691909562U, // <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7> + 2701199368U, // <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4> + 2691909691U, // <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1> + 2691909782U, // <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2> + 3765651686U, // <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1> + 2691909972U, // <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3> + 2691910044U, // <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1> + 1161006390U, // <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS + 2691910300U, // <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7> + 3368962716U, // <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7> + 1161006633U, // <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS + 2631508070U, // <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS + 2631508890U, // <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4> + 2631509709U, // <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4> + 2289256788U, // <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3> + 1726336208U, // <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4> + 1618169142U, // <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 3362998858U, // <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6> + 2289257116U, // <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7> + 1618169385U, // <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 1557774438U, // <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS + 2631516980U, // <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1> + 1557776078U, // <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5> + 2631518358U, // <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2> + 1557777718U, // <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS + 2296563406U, // <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5> + 604818742U, // <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS + 2661381387U, // <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5> + 604818760U, // <3,4,5,u>: 
Cost 1 vsldoi12 LHS, RHS + 3705266278U, // <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS + 3831131482U, // <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7> + 2733715962U, // <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3> + 3844771180U, // <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7> + 2800078197U, // <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7> + 2236550454U, // <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS + 2733716280U, // <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6> + 2725090156U, // <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4> + 2236550697U, // <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS + 2733716474U, // <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2> + 3371647013U, // <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1> + 2727744688U, // <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4> + 3371649364U, // <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3> + 2733716838U, // <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6> + 2297906894U, // <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5> + 3371647180U, // <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6> + 2733717100U, // <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7> + 2297906897U, // <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u> + 1557799014U, // <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS + 1618171694U, // <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 1557800657U, // <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u> + 2691913660U, // <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1> + 1557802294U, // <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS + 1618172058U, // <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 604818985U, // <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS + 2661405966U, // <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u> + 604819003U, // <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS + 2643492966U, // <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS + 2756947528U, // <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2> + 2331029019U, // <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2> + 2643495062U, // <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2> + 2756947554U, // <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1> + 2800078443U, // <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1> + 2289224194U, // <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6> + 3362964723U, // <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7> + 2756947590U, // <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1> + 2800078479U, // <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1> + 2333027218U, // <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1> + 2691916699U, // <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5> + 3832901294U, // <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5> + 2800078519U, // <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5> + 3830689467U, // <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0> + 3830689481U, // <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5> + 3873820365U, // <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0> + 2800078551U, // <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1> + 3770967487U, // <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4> + 2697225763U, // <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5> + 3830689523U, // <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2> + 2699216590U, // <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5> + 2699216662U, // <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5> + 2783047439U, // <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3> + 2783121176U, // <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3> + 3856936737U, // <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3> + 2701871194U, // <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5> + 2643517542U, // <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS + 2331052946U, // <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1> + 3699345010U, // <3,5,3,2>: Cost 4 
vsldoi4 <1,3,5,3>, <2,2,3,3> + 2705189276U, // <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3> + 2705189359U, // <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5> + 2331053274U, // <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5> + 2295220738U, // <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6> + 3368961267U, // <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7> + 2295220740U, // <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u> + 2643525734U, // <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS + 2331061138U, // <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1> + 2235584280U, // <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3> + 2643528194U, // <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6> + 2735713498U, // <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5> + 2756947892U, // <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6> + 2289256962U, // <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6> + 3362997491U, // <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7> + 2756947919U, // <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6> + 2800078803U, // <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1> + 2800078812U, // <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1> + 2631591639U, // <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5> + 3832901616U, // <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3> + 2800078843U, // <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5> + 1726337028U, // <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2800078862U, // <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6> + 3368314099U, // <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7> + 1726337028U, // <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2800078884U, // <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1> + 2800078899U, // <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7> + 2631599832U, // <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6> + 2800078914U, // <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4> + 2800078924U, // <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5> + 2800078935U, // <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7> + 2297235970U, // <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6> + 1726337122U, // <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0> + 1726337131U, // <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0> + 3699376230U, // <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS + 2333739922U, // <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1> + 3699378106U, // <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7> + 3371647915U, // <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3> + 3699379510U, // <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS + 2333740250U, // <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5> + 2297907714U, // <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6> + 3370984691U, // <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7> + 2297907716U, // <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u> + 2800079046U, // <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1> + 2756948176U, // <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2> + 2331029019U, // <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2> + 2800079076U, // <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4> + 2800079085U, // <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4> + 1726337028U, // <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2289289730U, // <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6> + 1726337284U, // <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0> + 1726337293U, // <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0> + 3773628416U, // <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0> + 2699886694U, // <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS + 2789167401U, // <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1> + 3362965862U, // <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3> + 3773628754U, // <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5> + 3723284326U, // <3,6,0,5>: Cost 4 
vsldoi4 <5,3,6,0>, <5,3,6,0> + 2800079181U, // <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1> + 1215483190U, // <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS + 1215483191U, // <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS + 3873821032U, // <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1> + 3773629236U, // <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1> + 2691924892U, // <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6> + 3830690184U, // <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6> + 3873821072U, // <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5> + 3873821082U, // <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6> + 3403453240U, // <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6> + 2289233206U, // <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS + 2289233207U, // <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS + 2661498982U, // <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS + 3770975780U, // <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6> + 2631640797U, // <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2> + 3771639485U, // <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6> + 2661502262U, // <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS + 2699888488U, // <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6> + 2661503482U, // <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3> + 1715425786U, // <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3> + 1715499523U, // <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3> + 3773630614U, // <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2> + 3372942825U, // <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1> + 2234749434U, // <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3> + 3368962406U, // <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3> + 2699889154U, // <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6> + 3773631068U, // <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6> + 2331054904U, // <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6> + 1221479734U, // <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS + 1221479735U, // <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS + 2235584801U, // <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2> + 3717342106U, // <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4> + 2789167729U, // <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5> + 2235585074U, // <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5> + 2235585165U, // <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6> + 2699889974U, // <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS + 2800079509U, // <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5> + 1215515958U, // <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS + 1215515959U, // <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS + 3873821356U, // <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1> + 3372959209U, // <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1> + 3862909629U, // <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0> + 3773632358U, // <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0> + 3873821396U, // <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5> + 3873821405U, // <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5> + 3862909672U, // <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7> + 2294574390U, // <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS + 2294574391U, // <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS + 2800079613U, // <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1> + 3873821446U, // <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1> + 2789167888U, // <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2> + 3844920090U, // <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3> + 2800079653U, // <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5> + 3723333484U, // <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6> + 1726337848U, // <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1726337858U, // <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7> + 1726337867U, // <3,6,6,u>: Cost 2 vsldoi12 
LHS, <6,6,u,7> + 1726337870U, // <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1> + 2297906665U, // <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1> + 2792117090U, // <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3> + 2297907558U, // <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3> + 1726337910U, // <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5> + 2297906993U, // <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5> + 2297906832U, // <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6> + 1224166710U, // <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS + 1224166711U, // <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS + 1726337951U, // <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1> + 2699892526U, // <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS + 2789168049U, // <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1> + 2792854460U, // <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3> + 1726337991U, // <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5> + 2699892890U, // <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS + 1726337848U, // <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1215548726U, // <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS + 1215548727U, // <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS + 2700558336U, // <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0> + 1626816614U, // <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 2700558513U, // <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6> + 2331030010U, // <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3> + 2700558674U, // <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5> + 2800079906U, // <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6> + 2655588936U, // <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0> + 2800079919U, // <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1> + 1626817181U, // <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 3774300899U, // <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1> + 2700559156U, // <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1> + 2700559254U, // <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0> + 3774301148U, // <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7> + 3774301227U, // <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5> + 3774301295U, // <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1> + 3768329441U, // <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7> + 3403453250U, // <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7> + 2700559740U, // <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0> + 2700559849U, // <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1> + 3770983973U, // <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7> + 2700559976U, // <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2> + 2698569415U, // <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7> + 2700560177U, // <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5> + 3773638505U, // <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7> + 1626818490U, // <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7> + 2795140307U, // <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3> + 1628145756U, // <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7> + 2700560534U, // <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2> + 3774302438U, // <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1> + 2700560742U, // <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3> + 2700560796U, // <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3> + 2700560898U, // <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6> + 3774302821U, // <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6> + 2700561079U, // <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7> + 2700561091U, // <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1> + 2700561182U, // <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2> + 2655617126U, // <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS + 3774303178U, // <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, 
<4,1,2,3> + 2655619002U, // <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7> + 2331062778U, // <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3> + 2655620406U, // <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS + 1626819894U, // <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 2655621708U, // <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4> + 2800080247U, // <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5> + 1626820137U, // <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 3774303816U, // <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2> + 3873822093U, // <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0> + 3774303998U, // <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4> + 3862910368U, // <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1> + 3774304180U, // <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6> + 2800080310U, // <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5> + 2800080321U, // <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7> + 3873822147U, // <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0> + 2800080339U, // <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7> + 2800080348U, // <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7> + 3873822181U, // <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7> + 2789168622U, // <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7> + 2700563016U, // <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0> + 2800080384U, // <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7> + 3862910472U, // <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6> + 2700563256U, // <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6> + 2800080404U, // <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0> + 2793149988U, // <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7> + 2637725798U, // <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS + 3371649227U, // <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1> + 2637727674U, // <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7> + 2297907567U, // <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3> + 2637729078U, // <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS + 3371649312U, // <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5> + 2655646287U, // <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7> + 1726338668U, // <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 1726338668U, // <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 2700564179U, // <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2> + 1626822446U, // <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 2700564357U, // <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0> + 2700564412U, // <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1> + 2700564543U, // <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6> + 1626822810U, // <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 1662654672U, // <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7> + 1726338668U, // <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 1626823013U, // <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 1678557184U, // <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0> + 1679005395U, // <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2> + 2289221787U, // <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2> + 1215479964U, // <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS + 2752747245U, // <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1> + 1158863002U, // <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS + 2289224221U, // <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6> + 1215483208U, // <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS + 1679005458U, // <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2> + 1558036582U, // <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS + 1678558004U, // <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 604821294U, // <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS + 2752747317U, // <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1> + 1558039862U, // <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS + 2756949830U, // <3,u,1,5>: 
Cost 3 vsldoi12 LHS, <u,1,5,0> + 2800080726U, // <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7> + 2289233224U, // <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS + 604821348U, // <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696586709U, // <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u> + 2757392246U, // <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3> + 1624172151U, // <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u> + 1679005576U, // <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3> + 2631789878U, // <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS + 2699904874U, // <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u> + 1626826683U, // <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u> + 1726338988U, // <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3> + 1683208117U, // <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3> + 1679005628U, // <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1> + 1161008942U, // <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS + 2752747471U, // <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2> + 403488870U, // <3,u,3,3>: Cost 1 vspltisw3 LHS + 1679005668U, // <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5> + 1161009306U, // <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS + 2691943104U, // <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7> + 1221479752U, // <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS + 403488870U, // <3,u,3,u>: Cost 1 vspltisw3 LHS + 2289255363U, // <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0> + 1161844526U, // <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS + 2289256661U, // <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2> + 1215512732U, // <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS + 1215513498U, // <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4> + 1679005759U, // <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6> + 2289256989U, // <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6> + 1215515976U, // <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS + 1679005786U, // <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6> + 1558069350U, // <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS + 2631811892U, // <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1> + 1558071026U, // <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5> + 2752747646U, // <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6> + 1558072630U, // <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS + 1726337028U, // <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 604821658U, // <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS + 2294574408U, // <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS + 604821676U, // <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS + 2631819366U, // <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS + 2757392574U, // <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7> + 2631821043U, // <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6> + 1679005904U, // <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7> + 2631822646U, // <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS + 2236553370U, // <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS + 1726337848U, // <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1726339309U, // <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0> + 1683208445U, // <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7> + 1726339328U, // <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1> + 2297905225U, // <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1> + 2631829236U, // <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7> + 1224163484U, // <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS + 1726339368U, // <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5> + 2297905553U, // <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5> + 2297905392U, // <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6> + 1224166728U, // <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS + 1224163489U, // <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS + 1683208529U, // <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1> + 1679006043U, // <3,u,u,1>: Cost 2 
vsldoi12 LHS, <u,u,1,2> + 604821861U, // <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS + 403488870U, // <3,u,u,3>: Cost 1 vspltisw3 LHS + 1683208569U, // <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5> + 1679006083U, // <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6> + 604821901U, // <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS + 1215548744U, // <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS + 604821915U, // <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS + 2759016448U, // <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0> + 1165115494U, // <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS + 3717531337U, // <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0> + 3369675785U, // <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3> + 2751791144U, // <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4> + 2238857630U, // <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0> + 3312591341U, // <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7> + 3369676113U, // <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7> + 1165116061U, // <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS + 2637824102U, // <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS + 2637824922U, // <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4> + 1685274726U, // <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2637826512U, // <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1> + 2637827382U, // <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS + 2661716070U, // <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4> + 3729486427U, // <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1> + 2661717300U, // <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1> + 1685274780U, // <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 3711574118U, // <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS + 2240200806U, // <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS + 3771663992U, // <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0> + 2698585801U, // <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0> + 3373672105U, // <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4> + 3810813795U, // <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1> + 3772327866U, // <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7> + 3386280568U, // <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7> + 2701903966U, // <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0> + 3699638374U, // <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS + 2753560832U, // <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4> + 3772328276U, // <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3> + 3827302674U, // <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4> + 3699641654U, // <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS + 3779627588U, // <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0> + 3772328604U, // <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7> + 3780954854U, // <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0> + 2753560832U, // <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4> + 2725129106U, // <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1> + 1167720550U, // <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS + 3839172953U, // <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3> + 3772329051U, // <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4> + 2241462610U, // <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5> + 2698587446U, // <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS + 3772329297U, // <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7> + 3735483703U, // <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4> + 1167721117U, // <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS + 1168556032U, // <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 94814310U, // <4,0,5,1>: Cost 1 vmrghw RHS, LHS + 2242298029U, // <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2> + 2637859284U, // <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5> + 1168556370U, // <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 
2242306530U, // <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5> + 2242298358U, // <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7> + 2661750072U, // <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5> + 94814877U, // <4,0,5,u>: Cost 1 vmrghw RHS, LHS + 3316580362U, // <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1> + 2242846822U, // <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS + 3798872570U, // <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3> + 3796218413U, // <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0> + 3834528273U, // <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7> + 3798872811U, // <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1> + 3316621876U, // <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6> + 2725131121U, // <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0> + 2242847389U, // <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS + 3377692672U, // <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0> + 2243493990U, // <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS + 3775648970U, // <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3> + 3802191110U, // <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0> + 3317236050U, // <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5> + 3803518376U, // <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0> + 3317236214U, // <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7> + 3798873708U, // <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7> + 2243494557U, // <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS + 1170546688U, // <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 96804966U, // <4,0,u,1>: Cost 1 vmrghw RHS, LHS + 1685275293U, // <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2637883863U, // <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u> + 1170547026U, // <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2698590362U, // <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS + 2244289014U, // <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7> + 2661774651U, // <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u> + 96805533U, // <4,0,u,u>: Cost 1 vmrghw RHS, LHS + 2667749478U, // <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS + 2689966182U, // <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS + 2238571418U, // <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4> + 3711633880U, // <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0> + 2689966418U, // <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5> + 3361046866U, // <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5> + 3741495802U, // <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3> + 3741496314U, // <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2> + 2689966765U, // <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1> + 3764372222U, // <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1> + 2758206263U, // <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4> + 2698593178U, // <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4> + 3361057810U, // <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3> + 3827303250U, // <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4> + 2287313234U, // <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5> + 3763709171U, // <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7> + 3361058138U, // <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7> + 2239759744U, // <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4> + 2637906022U, // <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS + 2637906842U, // <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4> + 3763709544U, // <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2> + 1685275546U, // <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4> + 2637909302U, // <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS + 3361063250U, // <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5> + 3763709882U, // <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7> + 3735541054U, // <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2> + 
1685644231U, // <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4> + 2702575792U, // <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1> + 3832759257U, // <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4> + 3833349090U, // <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4> + 3763710364U, // <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3> + 2707884546U, // <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6> + 3361071442U, // <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5> + 3772336796U, // <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7> + 3775654595U, // <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1> + 2707884856U, // <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1> + 2667782246U, // <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS + 2241463092U, // <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1> + 2241553306U, // <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4> + 3827303484U, // <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4> + 2667785424U, // <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4> + 2689969462U, // <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS + 3763711322U, // <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7> + 3867116636U, // <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0> + 2689969705U, // <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS + 1546273106U, // <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5> + 1168556852U, // <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1168556950U, // <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 2620016790U, // <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2> + 1546276150U, // <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS + 2620018692U, // <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5> + 2242299087U, // <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7> + 2667795450U, // <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2> + 1546278702U, // <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS + 3781628193U, // <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2> + 3832759503U, // <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7> + 3316261786U, // <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4> + 3781628466U, // <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5> + 3827303658U, // <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7> + 3361096018U, // <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5> + 3788264248U, // <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6> + 3788264270U, // <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1> + 3832759566U, // <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7> + 2726466580U, // <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1> + 3377692682U, // <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1> + 3377694870U, // <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2> + 3802199303U, // <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1> + 2731775334U, // <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6> + 3377693010U, // <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5> + 3365749804U, // <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6> + 3788265068U, // <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7> + 2731775644U, // <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1> + 1546297685U, // <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u> + 1170547508U, // <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1170547606U, // <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 1689257344U, // <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4> + 1546300726U, // <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS + 2284716370U, // <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5> + 2244289743U, // <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7> + 2667820026U, // <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2> + 1546303278U, // <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS + 3729621094U, // <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS + 
3763716198U, // <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS + 2238858856U, // <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2> + 2295930982U, // <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS + 3763716434U, // <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5> + 2238859107U, // <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1> + 2238859194U, // <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7> + 3312601066U, // <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1> + 2295930987U, // <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS + 3699769446U, // <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS + 3313255971U, // <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5> + 3361056360U, // <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2> + 2287312998U, // <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS + 3788932148U, // <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5> + 3313256290U, // <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0> + 3838289469U, // <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4> + 3369682865U, // <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7> + 2287313003U, // <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS + 3838658133U, // <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1> + 3711722394U, // <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4> + 2759018088U, // <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2> + 2759018098U, // <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3> + 3838658168U, // <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0> + 3369027341U, // <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5> + 2240227258U, // <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7> + 3735614791U, // <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2> + 2759018143U, // <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3> + 2759018150U, // <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1> + 3831948975U, // <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1> + 3832759993U, // <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2> + 2759018180U, // <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4> + 2759018185U, // <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0> + 3839542998U, // <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4> + 3314640826U, // <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7> + 2765948648U, // <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4> + 2759018222U, // <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1> + 3838658295U, // <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1> + 3315205667U, // <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5> + 2241463912U, // <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2> + 1234829414U, // <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS + 2241464085U, // <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4> + 2241546087U, // <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5> + 2241464250U, // <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7> + 3741602873U, // <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2> + 1234829419U, // <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS + 2626060390U, // <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS + 2626061364U, // <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5> + 1168557672U, // <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1222230118U, // <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS + 2626063670U, // <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS + 2242299752U, // <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6> + 1168558010U, // <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2242299882U, // <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1> + 1222230123U, // <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS + 3711754342U, // <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS + 3711755162U, // <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4> + 3838658481U, // <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7> + 2759018426U, // <4,2,6,3>: 
Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7> + 3838658499U, // <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7> + 3735646310U, // <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4> + 3316590522U, // <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7> + 3798889331U, // <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2> + 2759018471U, // <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7> + 3874564074U, // <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1> + 3800880230U, // <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2> + 3371722344U, // <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2> + 2303950950U, // <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS + 3371722346U, // <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4> + 3371722509U, // <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5> + 3317237690U, // <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7> + 3317237738U, // <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1> + 2303950955U, // <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS + 2759018555U, // <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1> + 2626085943U, // <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u> + 1170548328U, // <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1222254694U, // <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS + 2759018595U, // <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5> + 2244290408U, // <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6> + 1170548666U, // <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2769266813U, // <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4> + 1222254699U, // <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS + 2238859414U, // <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2> + 2759018646U, // <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2> + 3312314708U, // <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3> + 2238859676U, // <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3> + 2295931802U, // <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4> + 3735670886U, // <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4> + 3312315036U, // <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7> + 3369674682U, // <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7> + 2759018709U, // <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2> + 3361055638U, // <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0> + 3831949542U, // <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1> + 2703917978U, // <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4> + 3361056370U, // <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3> + 2295939994U, // <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4> + 3361056291U, // <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5> + 3378972520U, // <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6> + 3361056698U, // <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7> + 2703917978U, // <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4> + 3832760624U, // <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3> + 3711796122U, // <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4> + 3832760641U, // <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2> + 2770962764U, // <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4> + 2759018836U, // <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3> + 3827304802U, // <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u> + 3832760678U, // <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3> + 3859597679U, // <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3> + 2771331449U, // <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4> + 2240841878U, // <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2> + 3776997635U, // <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3> + 2703919444U, // <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3> + 2759018908U, // <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3> + 2759018918U, // <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, 
<3,3,4,4> + 3386951446U, // <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5> + 3777661596U, // <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7> + 3375007674U, // <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7> + 2707901242U, // <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3> + 2759018960U, // <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1> + 2759018970U, // <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2> + 2632099605U, // <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4> + 2241464732U, // <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3> + 2759019000U, // <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5> + 2753563138U, // <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6> + 3777662316U, // <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7> + 2308573114U, // <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7> + 2759019032U, // <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1> + 1168558230U, // <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2242300134U, // <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1> + 2632107798U, // <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5> + 1168558492U, // <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1168558594U, // <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 2295973654U, // <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5> + 2242300536U, // <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7> + 2295973818U, // <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7> + 1168558878U, // <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2> + 3832760952U, // <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7> + 3711828890U, // <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4> + 3316484436U, // <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3> + 3711830512U, // <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6> + 2759019164U, // <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7> + 3361097251U, // <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5> + 3316624045U, // <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6> + 2773912244U, // <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4> + 2759019164U, // <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7> + 3377693590U, // <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0> + 3365751680U, // <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1> + 2727810232U, // <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3> + 3377694322U, // <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3> + 2303951770U, // <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4> + 3741700198U, // <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4> + 3377695216U, // <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6> + 3375703994U, // <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7> + 2731792030U, // <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3> + 1170548886U, // <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2759019294U, // <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2> + 2632132377U, // <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u> + 1170549148U, // <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1170549250U, // <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 2759019334U, // <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6> + 2244291192U, // <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7> + 2295998394U, // <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7> + 1170549534U, // <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2> + 1165118354U, // <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1637482598U, // <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS + 3711854285U, // <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4> + 3827305344U, // <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1> + 2711224658U, // <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5> + 1165118774U, // <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS + 3312602489U, // <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2> 
+ 3369675420U, // <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7> + 1165119017U, // <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS + 3369682633U, // <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0> + 2287313581U, // <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1> + 2759019466U, // <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3> + 3369683284U, // <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3> + 2311204048U, // <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4> + 2239319350U, // <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS + 3784967411U, // <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7> + 3369683612U, // <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7> + 2763000832U, // <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3> + 3711869030U, // <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS + 3711869850U, // <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4> + 2240203830U, // <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3> + 2698618573U, // <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4> + 2711226133U, // <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4> + 2240204086U, // <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS + 2711226298U, // <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7> + 3832761416U, // <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3> + 2701936738U, // <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4> + 2711226518U, // <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2> + 3777005828U, // <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4> + 3832761453U, // <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4> + 2301266260U, // <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3> + 2705254903U, // <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4> + 2240843062U, // <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS + 3832761489U, // <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4> + 3375008412U, // <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7> + 2301266260U, // <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3> + 1570373734U, // <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS + 2308574089U, // <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1> + 2644117096U, // <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2> + 2638146039U, // <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4> + 229035318U, // <4,4,4,4>: Cost 1 vspltisw0 RHS + 1167723830U, // <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS + 2644120058U, // <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3> + 2662036827U, // <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4> + 229035318U, // <4,4,4,u>: Cost 1 vspltisw0 RHS + 1168558994U, // <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1> + 2638152602U, // <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4> + 2242300981U, // <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2> + 2638154232U, // <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5> + 1168559322U, // <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5> + 94817590U, // <4,4,5,5>: Cost 1 vmrghw RHS, RHS + 1685278006U, // <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 2242309576U, // <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0> + 94817833U, // <4,4,5,u>: Cost 1 vmrghw RHS, RHS + 3316591506U, // <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1> + 3758428587U, // <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5> + 2711228922U, // <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3> + 3796251185U, // <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4> + 2711229085U, // <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4> + 2242850102U, // <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS + 2242850169U, // <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2> + 2725163893U, // <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4> + 2242850345U, // <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS + 2711229434U, // <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2> + 
3377694410U, // <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1> + 3868593584U, // <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3> + 3377695060U, // <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3> + 2729145691U, // <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4> + 2243497270U, // <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS + 3871542744U, // <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7> + 2303953564U, // <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7> + 2243497513U, // <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS + 1170549650U, // <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1> + 1637488430U, // <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS + 2244291637U, // <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2> + 2638178811U, // <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u> + 229035318U, // <4,4,u,4>: Cost 1 vspltisw0 RHS + 96808246U, // <4,4,u,5>: Cost 1 vmrghw RHS, RHS + 1685278249U, // <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 2244292040U, // <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0> + 96808489U, // <4,4,u,u>: Cost 1 vmrghw RHS, RHS + 2698625024U, // <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0> + 1624883302U, // <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 2638186190U, // <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5> + 2638187004U, // <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0> + 2687345005U, // <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5> + 2238861316U, // <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5> + 2662077302U, // <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5> + 2662077792U, // <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0> + 1624883869U, // <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 3361057762U, // <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0> + 2691326803U, // <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5> + 2698625942U, // <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0> + 3361055659U, // <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3> + 3761087567U, // <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5> + 2693981335U, // <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5> + 2305231362U, // <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6> + 3361055987U, // <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7> + 2695972234U, // <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5> + 2638200934U, // <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS + 3761088035U, // <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5> + 2697963133U, // <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5> + 1624884942U, // <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5> + 2698626838U, // <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5> + 3772368744U, // <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6> + 2698627002U, // <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7> + 3775023122U, // <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5> + 1628203107U, // <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5> + 2698627222U, // <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2> + 3765070057U, // <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4> + 2698627404U, // <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4> + 2698627484U, // <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3> + 2698627580U, // <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0> + 3779668553U, // <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5> + 2725169844U, // <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4> + 2707253995U, // <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5> + 2698627870U, // <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2> + 2638217318U, // <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS + 2308574098U, // <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1> + 2698628150U, // <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3> + 2638219776U, // <4,5,4,3>: Cost 3 
vsldoi4 <3,4,5,4>, <3,4,5,4> + 2698628314U, // <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5> + 1624886582U, // <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 2698628478U, // <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7> + 2662110564U, // <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4> + 1624886825U, // <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 1570455654U, // <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS + 2312564250U, // <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1> + 2644199118U, // <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5> + 2295974966U, // <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3> + 1570458842U, // <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5> + 1168568324U, // <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5> + 1168568418U, // <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 2295975294U, // <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7> + 1168716036U, // <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0> + 1564491878U, // <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS + 2626290768U, // <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6> + 2632263465U, // <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6> + 1564494338U, // <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6> + 1564495158U, // <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS + 2638237464U, // <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3> + 2656154253U, // <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2725172218U, // <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2> + 3859599489U, // <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4> + 2698630320U, // <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4> + 2728490251U, // <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5> + 2725172576U, // <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0> + 3317239812U, // <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5> + 2725172760U, // <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4> + 2725172844U, // <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7> + 2725172866U, // <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2> + 1564508262U, // <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS + 1624889134U, // <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 2698631045U, // <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0> + 1564510724U, // <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u> + 1564511542U, // <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS + 1624889498U, // <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 1170550882U, // <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 3312595285U, // <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0> + 3763748966U, // <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS + 2238861818U, // <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3> + 3767730432U, // <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4> + 3763749202U, // <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5> + 2238862059U, // <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1> + 2238862136U, // <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6> + 2295934262U, // <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS + 2295934263U, // <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS + 3378973999U, // <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0> + 3378974648U, // <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1> + 3779675034U, // <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4> + 3378974002U, // <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3> + 3378974003U, // <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4> + 3767731352U, // <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6> + 3378974734U, // <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6> + 2287316278U, // 
<4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS + 2287316279U, // <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS + 3735904358U, // <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS + 3763750435U, // <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5> + 3313938937U, // <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2> + 3772376782U, // <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5> + 3852890591U, // <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3> + 3735908454U, // <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4> + 3801573306U, // <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7> + 2785858042U, // <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3> + 2785858051U, // <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3> + 3863065101U, // <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4> + 3314586024U, // <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2> + 3863212575U, // <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4> + 3863286312U, // <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4> + 3767732738U, // <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6> + 3779676746U, // <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6> + 3398898488U, // <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6> + 2301267254U, // <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS + 2301267255U, // <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS + 3852890715U, // <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1> + 3315208615U, // <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1> + 2241466874U, // <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3> + 3852890745U, // <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4> + 2241467037U, // <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4> + 2241549039U, // <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5> + 2241467192U, // <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6> + 1234832694U, // <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS + 1234832695U, // <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS + 2242302241U, // <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2> + 2242310567U, // <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1168568826U, // <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2242302514U, // <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2242302605U, // <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6> + 2242310891U, // <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1> + 1168569144U, // <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1222233398U, // <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS + 1222233399U, // <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS + 3316576545U, // <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2> + 3316584871U, // <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1> + 2242851322U, // <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3> + 3316601394U, // <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5> + 3852890916U, // <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4> + 3316617963U, // <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1> + 2242884408U, // <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6> + 2785858370U, // <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7> + 2785858379U, // <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7> + 2785858382U, // <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1> + 3859600215U, // <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1> + 3317240314U, // <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3> + 2792199020U, // <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4> + 2785858422U, // <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5> + 3856651132U, // <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2> + 3317240632U, // <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6> + 2303954230U, // <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS + 2303954231U, // <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS + 2244292897U, // <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2> 
+ 2244293031U, // <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1170551290U, // <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2244293170U, // <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2244293261U, // <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6> + 2244293355U, // <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1> + 1170551608U, // <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1222257974U, // <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS + 1222257975U, // <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS + 2238862330U, // <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2> + 2706604134U, // <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 3312604308U, // <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3> + 3768402176U, // <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4> + 2238862648U, // <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5> + 3859600418U, // <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6> + 3729994393U, // <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0> + 2238862956U, // <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7> + 2706604701U, // <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 3385610338U, // <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0> + 3780346676U, // <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1> + 2706604954U, // <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4> + 3385610746U, // <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3> + 3385610342U, // <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4> + 3385610667U, // <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5> + 3768403178U, // <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7> + 3385611074U, // <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7> + 2706604954U, // <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4> + 3859600532U, // <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3> + 3712091034U, // <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4> + 3774375528U, // <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2> + 2794853552U, // <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4> + 2785858744U, // <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3> + 3735982182U, // <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4> + 3774375875U, // <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7> + 3735983476U, // <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2> + 2795222237U, // <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4> + 3780348054U, // <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2> + 3730015130U, // <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4> + 3780348244U, // <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3> + 3778357673U, // <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7> + 2325155942U, // <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4> + 3779684939U, // <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7> + 2706606748U, // <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7> + 3398898498U, // <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7> + 2707934014U, // <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7> + 2785858868U, // <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1> + 3780348874U, // <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3> + 3780349000U, // <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3> + 2308575738U, // <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3> + 2656283856U, // <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4> + 2706607414U, // <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 2656285341U, // <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4> + 2241468012U, // <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7> + 2706607657U, // <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 1168569338U, // <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2242311242U, // <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1> + 2242303178U, // <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3> + 2242311395U, // 
<4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1> + 1168569702U, // <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 2242311606U, // <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5> + 2242311662U, // <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7> + 1168569964U, // <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1168569986U, // <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2> + 3316593658U, // <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2> + 3316593738U, // <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1> + 3316634800U, // <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4> + 3386978810U, // <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3> + 2785859072U, // <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7> + 3736014950U, // <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4> + 3316594158U, // <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7> + 2797803032U, // <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4> + 2797876769U, // <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4> + 2243499002U, // <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2> + 3718103962U, // <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4> + 3317257418U, // <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3> + 3377695816U, // <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3> + 2243532134U, // <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6> + 3317282230U, // <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5> + 2730497536U, // <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7> + 2243556972U, // <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7> + 2243565186U, // <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2> + 1170551802U, // <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2706609966U, // <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 2244293797U, // <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2> + 2244293859U, // <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1> + 1170552166U, // <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 2706610330U, // <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 2244294126U, // <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7> + 1170552428U, // <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1170552450U, // <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2> + 1165118354U, // <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1624907878U, // <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS + 2638407377U, // <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u> + 2295931036U, // <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS + 2687369584U, // <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u> + 1165121690U, // <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS + 2662298489U, // <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u> + 2295934280U, // <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS + 1624908445U, // <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS + 2638413926U, // <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS + 2691351382U, // <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u> + 1685280558U, // <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2287313052U, // <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS + 2299257799U, // <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4> + 2694005914U, // <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u> + 2305231362U, // <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6> + 2287316296U, // <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS + 1685280612U, // <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2638422118U, // <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS + 2240206638U, // <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS + 2697987712U, // <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u> + 1624909521U, // <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u> + 2759391121U, // <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3> + 2240207002U, // <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS + 2698651578U, // <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7> 
+ 2785859500U, // <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3> + 1628227686U, // <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u> + 2759022524U, // <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1> + 2801342408U, // <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4> + 2703960409U, // <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u> + 2759022554U, // <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4> + 2759022564U, // <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5> + 2240845978U, // <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS + 2706614941U, // <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u> + 2301267272U, // <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS + 2759022596U, // <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1> + 1570668646U, // <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS + 1167726382U, // <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS + 2698652753U, // <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3> + 1234829468U, // <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS + 229035318U, // <4,u,4,4>: Cost 1 vspltisw0 RHS + 1624911158U, // <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS + 2698653081U, // <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7> + 1234832712U, // <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS + 229035318U, // <4,u,4,u>: Cost 1 vspltisw0 RHS + 1168561875U, // <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2> + 94820142U, // <4,u,5,1>: Cost 1 vmrghw RHS, LHS + 1168562053U, // <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0> + 1222230172U, // <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS + 1168562239U, // <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6> + 94820506U, // <4,u,5,5>: Cost 1 vmrghw RHS, RHS + 1685280922U, // <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 1222233416U, // <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS + 94820709U, // <4,u,5,u>: Cost 1 vmrghw RHS, LHS + 1564713062U, // <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS + 2626511979U, // <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6> + 2632484676U, // <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6> + 1564715549U, // <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6> + 1564716342U, // <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS + 2242853018U, // <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS + 2656375464U, // <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6> + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2785859840U, // <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1> + 2243499822U, // <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS + 2727851197U, // <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u> + 2303951004U, // <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS + 2785859880U, // <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5> + 2243500186U, // <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS + 2730505729U, // <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u> + 2303954248U, // <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS + 2303951009U, // <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS + 1564729446U, // <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS + 96810798U, // <4,u,u,1>: Cost 1 vmrghw RHS, LHS + 1685281125U, // <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 1222254748U, // <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS + 229035318U, // <4,u,u,4>: Cost 1 vspltisw0 RHS + 96811162U, // <4,u,u,5>: Cost 1 vmrghw RHS, RHS + 1685281165U, // <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2754232320U, // <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0> + 2754232330U, // <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1> + 3718194894U, // <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5> + 3376385762U, // <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3> + 2754232357U, // 
<5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1> + 3845816370U, // <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5> + 3782353389U, // <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7> + 3376386090U, // <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7> + 2757402697U, // <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1> + 2626543718U, // <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS + 2626544751U, // <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1> + 1680490598U, // <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 3766428665U, // <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0> + 2626546998U, // <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS + 2650435539U, // <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1> + 3783017715U, // <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7> + 3385019000U, // <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7> + 1680490652U, // <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 3376398336U, // <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0> + 2245877862U, // <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS + 3773064808U, // <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2> + 2705295054U, // <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5> + 3827974343U, // <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1> + 3845816530U, // <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3> + 3779037114U, // <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7> + 3810887658U, // <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1> + 2245878429U, // <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS + 2710603926U, // <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2> + 3827974396U, // <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0> + 3779037516U, // <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4> + 3779037596U, // <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3> + 2705295868U, // <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0> + 3379726804U, // <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5> + 3802925748U, // <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4> + 3363138168U, // <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7> + 2707950400U, // <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0> + 2626568294U, // <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS + 1680490834U, // <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5> + 3828048219U, // <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5> + 2710604932U, // <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0> + 2754232685U, // <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5> + 2705296694U, // <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS + 3779038590U, // <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7> + 2713259464U, // <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0> + 1680490834U, // <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5> + 2311307264U, // <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0> + 1174437990U, // <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS + 3779038946U, // <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3> + 3845816752U, // <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0> + 2248180050U, // <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5> + 2248180194U, // <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5> + 3779039274U, // <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7> + 3385051768U, // <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7> + 1174438557U, // <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS + 2302689280U, // <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0> + 1175208038U, // <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS + 3787002362U, // <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3> + 3376432160U, // <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3> + 2248950098U, // <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5> + 2248950180U, // <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6> + 
3376433702U, // <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6> + 2729186166U, // <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5> + 1175208605U, // <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS + 2713261050U, // <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2> + 3365823599U, // <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1> + 3808900317U, // <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4> + 3784348899U, // <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1> + 2729186656U, // <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0> + 3787003268U, // <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0> + 3802928664U, // <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4> + 3787003431U, // <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1> + 2731841188U, // <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0> + 2626601062U, // <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS + 1683145366U, // <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5> + 1680491165U, // <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2705295054U, // <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5> + 2754233005U, // <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1> + 2705299610U, // <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS + 3779041488U, // <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7> + 2737150252U, // <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0> + 1680491219U, // <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2713927680U, // <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0> + 1640185958U, // <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2310607866U, // <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2> + 3787669756U, // <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0> + 2713928018U, // <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5> + 2306621778U, // <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5> + 3787670006U, // <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7> + 3736188301U, // <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0> + 1640186525U, // <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2650505318U, // <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS + 2754233140U, // <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1> + 2311276694U, // <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2> + 2311278315U, // <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3> + 2758435667U, // <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5> + 2754233180U, // <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5> + 3385016497U, // <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6> + 2311278643U, // <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7> + 2758730615U, // <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5> + 3700367462U, // <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS + 3830629255U, // <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3> + 2713929320U, // <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2> + 2754233238U, // <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0> + 2759099300U, // <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5> + 2754233259U, // <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3> + 2713929658U, // <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7> + 3872359354U, // <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0> + 2754233283U, // <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0> + 2713929878U, // <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2> + 3363135498U, // <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1> + 3363137686U, // <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2> + 2713930140U, // <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3> + 2713930242U, // <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6> + 2289394002U, // <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5> + 3787672184U, // <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7> + 3787672259U, // <5,1,3,7>: Cost 
4 vsldoi8 <4,u,5,1>, <3,7,0,1> + 2713930526U, // <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2> + 1634880402U, // <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1> + 2760205355U, // <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5> + 2760279092U, // <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5> + 3787672708U, // <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0> + 2713930960U, // <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4> + 1640189238U, // <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS + 3786345848U, // <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1> + 3787009481U, // <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1> + 1640189466U, // <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1> + 2754233455U, // <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1> + 2713931407U, // <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1> + 2713931499U, // <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3> + 3827975305U, // <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0> + 2754233495U, // <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5> + 2288746834U, // <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5> + 2713931827U, // <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7> + 3787673725U, // <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0> + 2754233527U, // <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1> + 2668462182U, // <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS + 2290746002U, // <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1> + 2302691478U, // <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2> + 3364488071U, // <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3> + 2302689536U, // <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4> + 2754233587U, // <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7> + 2713932600U, // <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6> + 2713932622U, // <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1> + 2302689297U, // <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u> + 2713932794U, // <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2> + 3365822474U, // <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1> + 3365824662U, // <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2> + 3787674851U, // <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1> + 2713933158U, // <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6> + 2292080978U, // <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5> + 3365823613U, // <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6> + 2713933420U, // <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7> + 2713933442U, // <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2> + 1658771190U, // <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1> + 1640191790U, // <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2762933624U, // <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5> + 2754233724U, // <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0> + 2763081098U, // <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5> + 1640192154U, // <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS + 2713934032U, // <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7> + 2713934080U, // <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1> + 1640192357U, // <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 3779051520U, // <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0> + 2705309798U, // <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS + 3838813637U, // <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1> + 2302640230U, // <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS + 3765117266U, // <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5> + 3381027892U, // <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5> + 3842794985U, // <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1> + 3408232554U, // <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7> + 2302640235U, // <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, 
LHS + 3700432998U, // <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS + 3765117785U, // <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2> + 2311276136U, // <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2> + 1237532774U, // <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS + 3700436278U, // <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS + 3381036084U, // <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5> + 3385018045U, // <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6> + 3385017560U, // <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7> + 1237532779U, // <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS + 3700441190U, // <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS + 3700442242U, // <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2> + 2754233960U, // <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2> + 2754233970U, // <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3> + 2765071997U, // <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5> + 3834021508U, // <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3> + 3842795152U, // <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6> + 3376402492U, // <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7> + 2754234015U, // <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3> + 2754234022U, // <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1> + 3827975855U, // <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1> + 2644625102U, // <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5> + 2289393766U, // <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS + 1691993806U, // <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5> + 2785052375U, // <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5> + 3854812897U, // <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6> + 3802942187U, // <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5> + 1692288754U, // <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5> + 3839846139U, // <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5> + 2709294052U, // <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2> + 2766251789U, // <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5> + 2765735702U, // <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5> + 3840141087U, // <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5> + 2705313078U, // <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS + 2712612217U, // <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2> + 3787017674U, // <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2> + 2765735747U, // <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5> + 3834021704U, // <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1> + 3834021714U, // <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2> + 2311308904U, // <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2> + 1237565542U, // <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS + 3834021744U, // <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5> + 3369124916U, // <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5> + 2248181690U, // <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7> + 3786354825U, // <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3> + 1237565547U, // <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS + 3700473958U, // <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS + 3700475014U, // <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6> + 2296718952U, // <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2> + 1228947558U, // <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS + 3700477238U, // <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS + 3834021836U, // <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7> + 2248951738U, // <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7> + 3370461105U, // <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7> + 1228947563U, // <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS + 3786355706U, // <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2> + 3783038037U, // <5,2,7,1>: Cost 4 vsldoi8 
<4,1,5,2>, <7,1,2,3> + 3365824104U, // <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2> + 2292080742U, // <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS + 3842131986U, // <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5> + 3371795508U, // <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5> + 3786356206U, // <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7> + 3786356332U, // <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7> + 2292080747U, // <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS + 2754234427U, // <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1> + 2705315630U, // <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS + 2296735336U, // <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2> + 1228963942U, // <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS + 1695311971U, // <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5> + 2705315994U, // <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS + 2769201269U, // <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5> + 3370477489U, // <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7> + 1695606919U, // <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5> + 3827976331U, // <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0> + 2754234518U, // <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2> + 3706472290U, // <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0> + 3700500630U, // <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2> + 2754234544U, // <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1> + 3376383766U, // <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5> + 3769770513U, // <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7> + 3376383930U, // <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7> + 2754234581U, // <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2> + 2311275414U, // <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0> + 2305967971U, // <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1> + 2692047787U, // <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3> + 2311276146U, // <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3> + 2311275418U, // <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4> + 3765789807U, // <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1> + 3765789939U, // <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7> + 2311276474U, // <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7> + 2696029585U, // <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3> + 2311288709U, // <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0> + 3765790243U, // <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5> + 3827976513U, // <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2> + 2765736268U, // <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4> + 2246248962U, // <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6> + 3765790563U, // <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1> + 3827976550U, // <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3> + 3842795887U, // <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3> + 2769054073U, // <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4> + 3827976575U, // <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1> + 3765790963U, // <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5> + 3839478162U, // <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2> + 2754234780U, // <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3> + 2771708327U, // <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5> + 3363137059U, // <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5> + 3375081320U, // <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6> + 3363137466U, // <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7> + 2772003275U, // <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5> + 2772077012U, // <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5> + 3765791714U, // <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0> + 2709965878U, // <5,3,4,2>: Cost 3 vsldoi8 
<4,2,5,3>, <4,2,5,3> + 2772298223U, // <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5> + 2772371960U, // <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5> + 2754234882U, // <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6> + 3839478282U, // <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5> + 3376416698U, // <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7> + 2754234909U, // <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6> + 2311308182U, // <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0> + 3765792421U, // <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5> + 2715938575U, // <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3> + 2311308914U, // <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3> + 2311308186U, // <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4> + 2248182354U, // <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5> + 3765792837U, // <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7> + 2311309242U, // <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7> + 2311308190U, // <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u> + 2632777830U, // <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS + 3706520372U, // <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1> + 2632779624U, // <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6> + 2632780290U, // <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6> + 2632781110U, // <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS + 2248952413U, // <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7> + 2302691176U, // <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6> + 2302691258U, // <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7> + 2632783662U, // <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS + 3365823382U, // <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0> + 3706529011U, // <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7> + 3706529641U, // <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7> + 3365824114U, // <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3> + 2774362859U, // <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5> + 3365824035U, // <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5> + 3383740183U, // <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6> + 3363833786U, // <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7> + 2774657807U, // <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5> + 2632794214U, // <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS + 2754235166U, // <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2> + 2632796010U, // <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u> + 2632796676U, // <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u> + 2632797494U, // <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS + 2754235206U, // <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6> + 2302691176U, // <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6> + 2302707642U, // <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7> + 2754235229U, // <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2> + 3765133325U, // <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4> + 2705326182U, // <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS + 3718489806U, // <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5> + 3718490624U, // <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4> + 2709307730U, // <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5> + 2302641870U, // <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5> + 3376383695U, // <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6> + 3384351018U, // <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7> + 2705326749U, // <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS + 2305971057U, // <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0> + 3765134171U, // <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4> + 3766461338U, // <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4> + 3766461437U, // <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4> + 
2311277776U, // <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4> + 2754235362U, // <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0> + 3783050483U, // <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7> + 3385019036U, // <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7> + 2311276241U, // <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u> + 3718504550U, // <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS + 3783050787U, // <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5> + 3773097576U, // <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2> + 2705327822U, // <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5> + 3773097767U, // <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4> + 2765737014U, // <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3> + 3779069882U, // <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7> + 3376401052U, // <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7> + 2245881370U, // <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1> + 3779070102U, // <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2> + 3363135525U, // <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1> + 3779070284U, // <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4> + 3779070364U, // <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3> + 2705328640U, // <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4> + 2307311310U, // <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5> + 3866021012U, // <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7> + 3363138204U, // <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7> + 2707983172U, // <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4> + 2708646805U, // <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4> + 2709310438U, // <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4> + 3779071030U, // <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3> + 2710637704U, // <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4> + 2754235600U, // <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4> + 1704676570U, // <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5> + 3779071358U, // <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7> + 2713292236U, // <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4> + 1704897781U, // <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5> + 2626871398U, // <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS + 2626872471U, // <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5> + 2765737230U, // <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3> + 3700615318U, // <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2> + 2626874678U, // <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS + 1174441270U, // <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS + 1680493878U, // <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 3385051804U, // <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7> + 1680493896U, // <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2248952722U, // <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1> + 2302692152U, // <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1> + 3382406107U, // <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2> + 3700623874U, // <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6> + 2248953040U, // <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4> + 1175211318U, // <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS + 3376432280U, // <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6> + 2729218934U, // <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5> + 1175211561U, // <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS + 3787035642U, // <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2> + 3365822501U, // <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1> + 3808933085U, // <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4> + 3784381707U, // <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5> + 2713294182U, // <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6> + 2309998286U, // <5,4,7,5>: Cost 3 
vmrglw <4,6,5,7>, <2,3,4,5> + 3383740111U, // <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6> + 3787036239U, // <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5> + 2731873960U, // <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4> + 2626895974U, // <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS + 2626897050U, // <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u> + 2644813518U, // <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5> + 2705327822U, // <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5> + 2626899254U, // <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS + 1707331102U, // <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5> + 1680494121U, // <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2737183024U, // <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4> + 1680494139U, // <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2302642684U, // <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0> + 1640218726U, // <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 3376384510U, // <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2> + 3376385078U, // <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3> + 2754236002U, // <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1> + 2717942242U, // <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5> + 2244907106U, // <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0> + 3376385406U, // <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7> + 1640219293U, // <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 2305969365U, // <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0> + 1237536282U, // <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 2713961366U, // <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0> + 3766469630U, // <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5> + 2782326455U, // <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5> + 2311277786U, // <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5> + 2311277058U, // <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6> + 3385017587U, // <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7> + 1237536282U, // <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 3376400892U, // <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0> + 3827977963U, // <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3> + 2302659070U, // <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2> + 2765737726U, // <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4> + 3839479558U, // <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3> + 2781073167U, // <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3> + 2713962426U, // <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7> + 3376401790U, // <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7> + 2769055531U, // <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4> + 2713962646U, // <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2> + 3765143786U, // <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5> + 3839479621U, // <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3> + 2289394603U, // <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3> + 2713963010U, // <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6> + 2313285150U, // <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5> + 3363138050U, // <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6> + 3363136755U, // <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7> + 2713963294U, // <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2> + 2713963410U, // <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1> + 3827978127U, // <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5> + 3839479704U, // <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5> + 3376417846U, // <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3> + 1637567706U, // <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5> + 1640222006U, // <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS + 2310640998U, // <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6> + 
3376418174U, // <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7> + 1640222238U, // <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5> + 1577091174U, // <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 2311310226U, // <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1> + 2713964303U, // <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3> + 2311311119U, // <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3> + 1577094454U, // <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,5,5,5>: Cost 1 vspltisw1 RHS + 2311309826U, // <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6> + 2311311447U, // <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7> + 296144182U, // <5,5,5,u>: Cost 1 vspltisw1 RHS + 2248953460U, // <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1> + 2326580114U, // <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1> + 2713965050U, // <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3> + 3700697602U, // <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6> + 2785644620U, // <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5> + 2781073495U, // <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7> + 1228950018U, // <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713965390U, // <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1> + 1228950018U, // <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713965562U, // <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2> + 3383741330U, // <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1> + 3718620878U, // <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5> + 3365823403U, // <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3> + 2713965926U, // <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6> + 2717947318U, // <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5> + 3365825026U, // <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6> + 2292081907U, // <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7> + 2713966210U, // <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2> + 1577091174U, // <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 1640224558U, // <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 2713966469U, // <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0> + 2713966524U, // <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1> + 1577094454U, // <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,5,u,5>: Cost 1 vspltisw1 RHS + 1228950018U, // <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713966848U, // <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1> + 296144182U, // <5,5,u,u>: Cost 1 vspltisw1 RHS + 2705342464U, // <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0> + 1631600742U, // <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 3773112493U, // <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2> + 2705342720U, // <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4> + 2705342802U, // <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5> + 3779084708U, // <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6> + 3779084790U, // <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7> + 2302643510U, // <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS + 1631601309U, // <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 3767141092U, // <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2> + 2705343284U, // <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1> + 2705343382U, // <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0> + 3779085282U, // <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4> + 2693399632U, // <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6> + 3767805089U, // <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6> + 2311279416U, // <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6> + 1237536054U, // <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS + 1237536055U, // <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS + 3773113789U, // <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, 
<2,0,1,2> + 3779085855U, // <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1> + 2699372136U, // <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2> + 2705344166U, // <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1> + 2699372329U, // <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6> + 2705344360U, // <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6> + 2705344442U, // <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7> + 2302659894U, // <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS + 2702026861U, // <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6> + 2705344662U, // <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2> + 3767142661U, // <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5> + 3773114689U, // <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2> + 2705344924U, // <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3> + 1631603202U, // <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6> + 3842945597U, // <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7> + 3779086962U, // <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1> + 2289397046U, // <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS + 1634257734U, // <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6> + 2644926566U, // <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS + 3779087306U, // <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3> + 2790142577U, // <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5> + 2644929026U, // <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6> + 2711317723U, // <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6> + 1631604022U, // <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 2712644989U, // <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6> + 2302676278U, // <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS + 1631604265U, // <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 3842945708U, // <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1> + 3767144133U, // <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1> + 2705346328U, // <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3> + 3779088207U, // <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4> + 2717290420U, // <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6> + 2705346574U, // <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6> + 2705346596U, // <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1> + 1237568822U, // <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS + 1237568823U, // <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS + 2650914918U, // <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS + 3364490949U, // <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1> + 2248954362U, // <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3> + 2302693144U, // <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3> + 2650918198U, // <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS + 2650918926U, // <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6> + 2302693390U, // <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6> + 1228950838U, // <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS + 1228950839U, // <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS + 497467494U, // <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS + 1571210036U, // <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1571210856U, // <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571211414U, // <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497470774U, // <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS + 1571213316U, // <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1571213818U, // <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1571214956U, // <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7> + 497473326U, // <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS + 497475686U, // <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631606574U, // <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 1571219048U, // <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571219606U, // <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497478967U, // 
<5,6,u,4>: Cost 1 vsldoi4 RHS, RHS + 1631606938U, // <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 1571222010U, // <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1228967222U, // <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS + 497481518U, // <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS + 3768475648U, // <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0> + 2694733926U, // <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 3718711395U, // <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5> + 3384349178U, // <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3> + 2694734162U, // <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5> + 3384347884U, // <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5> + 3730658026U, // <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0> + 3718714362U, // <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2> + 2694734493U, // <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2311278690U, // <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0> + 2305970923U, // <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1> + 3768476566U, // <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0> + 2311279098U, // <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3> + 2311278694U, // <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4> + 3768476783U, // <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1> + 2694735091U, // <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7> + 2311279426U, // <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7> + 2696062357U, // <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7> + 3383701602U, // <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0> + 3768477219U, // <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5> + 3768477288U, // <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2> + 2309960186U, // <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3> + 3383701606U, // <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4> + 3768477545U, // <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7> + 3766486970U, // <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7> + 3383702338U, // <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7> + 2309960186U, // <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3> + 3768477846U, // <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2> + 3768477975U, // <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5> + 3786393932U, // <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4> + 3768478108U, // <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3> + 2795599115U, // <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5> + 3385037470U, // <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5> + 3780422309U, // <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7> + 3848107301U, // <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4> + 2795894063U, // <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5> + 2795967800U, // <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5> + 3768478690U, // <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0> + 3718744163U, // <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5> + 3784404107U, // <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7> + 2796262748U, // <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5> + 2694737206U, // <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2712653182U, // <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7> + 2713316815U, // <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7> + 2694737449U, // <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2311311458U, // <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0> + 3768479433U, // <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5> + 3768479521U, // <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3> + 2311311866U, // <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3> + 2311311462U, // <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4> + 2248185270U, // <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5> + 
2718625879U, // <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7> + 2311312194U, // <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7> + 2311311466U, // <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u> + 2248954874U, // <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2> + 3322696778U, // <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1> + 2248955028U, // <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3> + 2656963074U, // <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6> + 2248955238U, // <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6> + 2248955329U, // <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7> + 2656965360U, // <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6> + 2248955500U, // <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7> + 2248955522U, // <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2> + 3718766694U, // <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS + 3724739827U, // <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7> + 3718768739U, // <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5> + 3365826337U, // <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3> + 2798253647U, // <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5> + 3365826258U, // <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5> + 3730715377U, // <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7> + 2310665836U, // <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7> + 2798548595U, // <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5> + 2311336034U, // <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0> + 2694739758U, // <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2248955028U, // <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3> + 2311336442U, // <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3> + 2311336038U, // <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4> + 2694740122U, // <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2656981746U, // <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u> + 2311336770U, // <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7> + 2694740325U, // <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2705358848U, // <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0> + 1631617126U, // <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 2310607866U, // <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2> + 2302640284U, // <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS + 2754238189U, // <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1> + 2305296114U, // <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5> + 2244907106U, // <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0> + 2302643528U, // <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS + 1631617693U, // <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 2627133542U, // <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS + 1237536282U, // <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 1680496430U, // <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 1237532828U, // <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS + 2693416018U, // <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u> + 2756892486U, // <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0> + 2694743284U, // <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u> + 1237536072U, // <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS + 1680496484U, // <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2311288709U, // <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0> + 2245883694U, // <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS + 2699388520U, // <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2> + 2754238344U, // <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3> + 2699388715U, // <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u> + 2757408666U, // <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3> + 2705360826U, // <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7> + 2302659912U, // <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS + 2754238389U, // 
<5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3> + 2754238396U, // <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1> + 3827980229U, // <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1> + 2644625102U, // <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5> + 2289393820U, // <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS + 1631619588U, // <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u> + 2785056749U, // <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5> + 3363138077U, // <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6> + 2289397064U, // <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS + 1634274120U, // <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u> + 1634937753U, // <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u> + 1728272410U, // <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5> + 2710006843U, // <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u> + 2765740076U, // <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5> + 1637592285U, // <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u> + 1631620406U, // <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS + 2712661375U, // <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u> + 2302676296U, // <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS + 1631620649U, // <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS + 1577091174U, // <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 1174443822U, // <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS + 2766035058U, // <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3> + 1237565596U, // <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS + 1577094454U, // <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,u,5,5>: Cost 1 vspltisw1 RHS + 1680496794U, // <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 1237568840U, // <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS + 296144182U, // <5,u,5,u>: Cost 1 vspltisw1 RHS + 2633146470U, // <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS + 1175213870U, // <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS + 2633148309U, // <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6> + 1228947612U, // <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS + 2633149750U, // <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS + 1175214234U, // <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS + 1228950018U, // <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 1228950856U, // <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS + 1228947617U, // <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS + 497614950U, // <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS + 1571357492U, // <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1571358312U, // <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571358870U, // <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497618248U, // <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS + 1571360772U, // <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1571361274U, // <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1571361786U, // <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2> + 497620782U, // <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS + 497623142U, // <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631622958U, // <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 1680496997U, // <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 1228963996U, // <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS + 497626441U, // <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS + 296144182U, // <5,u,u,5>: Cost 1 vspltisw1 RHS + 1680497037U, // <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 1228967240U, // <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS + 497628974U, // <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS + 2772451328U, // <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0> + 2772451338U, // <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1> + 3771146417U, // <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6> + 3383095739U, // <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3> + 3846193189U, // <6,0,0,4>: Cost 4 
vsldoi12 <3,4,5,6>, <0,0,4,1> + 3724832803U, // <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0> + 3383095985U, // <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6> + 3383096067U, // <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7> + 2772451401U, // <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1> + 2651095142U, // <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS + 2251612262U, // <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS + 1698709606U, // <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2651097602U, // <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6> + 2651098422U, // <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS + 2651099172U, // <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1> + 2657071869U, // <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1> + 3724841978U, // <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2> + 1698709660U, // <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2252292096U, // <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0> + 1178550374U, // <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS + 3826655418U, // <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6> + 3777783485U, // <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6> + 2252292434U, // <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5> + 3785746280U, // <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6> + 2252292593U, // <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2> + 3736794583U, // <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2> + 1178550941U, // <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS + 3375153152U, // <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0> + 2772451584U, // <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4> + 3777784163U, // <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0> + 3846193426U, // <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4> + 2712005122U, // <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6> + 3724857382U, // <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3> + 3802335864U, // <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7> + 3801672410U, // <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6> + 2772451647U, // <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4> + 3383123968U, // <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0> + 2772451666U, // <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5> + 3773803577U, // <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6> + 3724864002U, // <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6> + 3846193517U, // <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5> + 2712005935U, // <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0> + 3327009265U, // <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2> + 3383126648U, // <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7> + 2772451729U, // <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5> + 3373178880U, // <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0> + 2254266470U, // <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS + 3785748248U, // <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3> + 3790393190U, // <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0> + 3328000338U, // <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5> + 3785748494U, // <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6> + 3785748516U, // <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1> + 3379153528U, // <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7> + 2254267037U, // <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS + 2254897152U, // <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0> + 1181155430U, // <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS + 3785748923U, // <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3> + 3785749042U, // <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5> + 2254897490U, // <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5> + 3785749169U, // <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6> + 2724614962U, // 
<6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0> + 3787739982U, // <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1> + 1181155997U, // <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS + 1235664896U, // <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235666598U, // <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 3712943720U, // <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2> + 2639202936U, // <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7> + 2639203638U, // <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS + 2309409236U, // <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5> + 3712946517U, // <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0> + 2309409400U, // <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1235666605U, // <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u> + 1235673088U, // <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235674790U, // <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 1698710173U, // <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2639211129U, // <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u> + 2639211830U, // <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS + 2712008858U, // <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS + 2657129220U, // <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u> + 2309417592U, // <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1698710227U, // <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 3775799296U, // <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0> + 2702057574U, // <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS + 3373143763U, // <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2> + 3695045122U, // <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6> + 3775799634U, // <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5> + 3383091538U, // <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5> + 3368493233U, // <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6> + 3362522319U, // <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7> + 2702058141U, // <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS + 3834250027U, // <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1> + 2772452148U, // <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1> + 3832038210U, // <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6> + 3373150660U, // <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3> + 3834250067U, // <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5> + 3373146450U, // <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5> + 3826656102U, // <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6> + 3362530511U, // <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7> + 2772452148U, // <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1> + 2669092966U, // <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS + 2252292916U, // <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1> + 2252293014U, // <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0> + 2772452246U, // <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0> + 2669096246U, // <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS + 3846194091U, // <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3> + 2702059450U, // <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7> + 3870081978U, // <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0> + 2702059633U, // <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1> + 3775801494U, // <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2> + 3777128723U, // <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1> + 3775801702U, // <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3> + 3775801756U, // <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3> + 3775801858U, // <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6> + 3375153490U, // <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5> + 3826656265U, // <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7> + 3775802051U, // <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1> + 3775802142U, // <6,1,3,u>: Cost 4 vsldoi8 
<2,u,6,1>, <3,u,1,2> + 3846194206U, // <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1> + 3846194219U, // <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5> + 3846194228U, // <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5> + 3846194236U, // <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4> + 3846194246U, // <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5> + 2760508496U, // <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6> + 3368526001U, // <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6> + 3870082144U, // <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4> + 2760729707U, // <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6> + 2714668660U, // <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1> + 3834619005U, // <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6> + 3834692742U, // <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6> + 3846194317U, // <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4> + 3834840216U, // <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6> + 3834913953U, // <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6> + 2719977570U, // <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0> + 3367208143U, // <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7> + 2719977724U, // <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1> + 2669125734U, // <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS + 2254897972U, // <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1> + 2254898070U, // <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0> + 3775803929U, // <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7> + 2669129014U, // <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS + 2322006354U, // <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5> + 2725950264U, // <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6> + 3793720142U, // <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1> + 2254898556U, // <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0> + 2627330150U, // <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS + 1235664906U, // <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235667094U, // <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2309406894U, // <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3> + 2627333430U, // <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS + 1235665234U, // <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309406897U, // <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309407222U, // <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235664913U, // <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 2627338342U, // <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS + 1235673098U, // <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235675286U, // <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2772452732U, // <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0> + 2627341622U, // <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS + 1235673426U, // <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309415089U, // <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309415414U, // <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235673105U, // <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 3324683725U, // <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0> + 2725290086U, // <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS + 3771162801U, // <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6> + 2309349478U, // <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS + 3730951478U, // <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS + 3840738784U, // <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1> + 3842655721U, // <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1> + 3736925671U, // <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0> + 2309349483U, // <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS + 3367840468U, // <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0> + 3325355551U, // <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1> + 3373147752U, // <6,2,1,2>: Cost 4 
vmrglw <2,u,6,1>, <2,2,2,2> + 2299404390U, // <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS + 3701099830U, // <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS + 3767846054U, // <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2> + 3826656825U, // <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0> + 3373147838U, // <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7> + 2299404395U, // <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS + 2657222758U, // <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS + 3771164219U, // <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2> + 2766481000U, // <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2> + 2772452978U, // <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3> + 2657226038U, // <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS + 3790407528U, // <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6> + 2252294074U, // <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7> + 2252294148U, // <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0> + 2772453023U, // <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3> + 2772453030U, // <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1> + 3834250930U, // <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4> + 2765596349U, // <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6> + 2301411430U, // <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS + 2772453070U, // <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5> + 2765817560U, // <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6> + 2252933050U, // <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7> + 2796340968U, // <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4> + 2766038771U, // <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6> + 3725008998U, // <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS + 3368530217U, // <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1> + 3840222989U, // <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5> + 2309382246U, // <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS + 3725012278U, // <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS + 2766481193U, // <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6> + 3842656049U, // <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5> + 3327010820U, // <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0> + 2766702404U, // <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6> + 3713073254U, // <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS + 3789082310U, // <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2> + 3840665439U, // <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6> + 2766997352U, // <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6> + 3713076534U, // <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS + 3791736842U, // <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2> + 3373180605U, // <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6> + 3793064108U, // <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2> + 2767366037U, // <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6> + 3701137510U, // <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS + 3701138647U, // <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6> + 2254898792U, // <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2> + 1248264294U, // <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS + 3701140790U, // <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS + 3725029435U, // <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6> + 2254899130U, // <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7> + 2725294981U, // <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2> + 1248264299U, // <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS + 2633375846U, // <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS + 2309407468U, // <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235666536U, // <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 161923174U, // <6,2,7,3>: Cost 1 vmrglw RHS, LHS + 2633379126U, // <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS + 
2309407796U, // <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5> + 2309408445U, // <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309407960U, // <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 161923179U, // <6,2,7,u>: Cost 1 vmrglw RHS, LHS + 2633384038U, // <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS + 2309415660U, // <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235674728U, // <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 161931366U, // <6,2,u,3>: Cost 1 vmrglw RHS, LHS + 2633387318U, // <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS + 2769135725U, // <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6> + 2309416637U, // <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309416152U, // <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 161931371U, // <6,2,u,u>: Cost 1 vmrglw RHS, LHS + 3777806336U, // <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0> + 2704064614U, // <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS + 3765862577U, // <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6> + 3843393708U, // <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6> + 2250516994U, // <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6> + 3725054014U, // <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0> + 3383093096U, // <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6> + 3368495034U, // <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7> + 2704065181U, // <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS + 2251622550U, // <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2> + 3777807156U, // <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1> + 3765863348U, // <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3> + 3373147762U, // <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3> + 3834251525U, // <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5> + 3373147683U, // <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5> + 3391727545U, // <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6> + 2299406266U, // <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7> + 2251622550U, // <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2> + 2252294294U, // <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2> + 3326036198U, // <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1> + 3771836045U, // <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3> + 2252294556U, // <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3> + 2252294658U, // <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6> + 3840739677U, // <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3> + 2704066490U, // <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7> + 3368511418U, // <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7> + 2252294942U, // <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2> + 3707158630U, // <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS + 3765864692U, // <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6> + 2704066918U, // <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3> + 2772453788U, // <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3> + 2772453799U, // <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5> + 3789752888U, // <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6> + 3840739770U, // <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6> + 2301413306U, // <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7> + 2775108043U, // <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5> + 2651340902U, // <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS + 3846195674U, // <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2> + 3845974503U, // <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6> + 2651343362U, // <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6> + 2651344182U, // <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS + 1698712066U, // <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6> + 3383125864U, // <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6> + 3368527802U, // <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, 
<2,6,3,7> + 1698933277U, // <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6> + 3373179798U, // <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0> + 3707176179U, // <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7> + 2716012312U, // <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3> + 3373180530U, // <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3> + 2254309890U, // <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6> + 3785773070U, // <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6> + 3840739932U, // <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6> + 2299439034U, // <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7> + 2719994110U, // <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3> + 2254899350U, // <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2> + 3328641254U, // <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1> + 2633443257U, // <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6> + 2254899612U, // <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3> + 2254899714U, // <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6> + 3785773772U, // <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6> + 2725966648U, // <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6> + 2322007994U, // <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7> + 2254899998U, // <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2> + 1559707750U, // <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS + 2633450292U, // <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1> + 1559709626U, // <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7> + 1235666546U, // <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1559711030U, // <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS + 2309408291U, // <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5> + 2633454152U, // <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0> + 1235666874U, // <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1559713582U, // <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS + 1559715942U, // <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS + 2633458484U, // <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1> + 1559717819U, // <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u> + 1235674738U, // <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1559719222U, // <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS + 1701366598U, // <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6> + 2633462353U, // <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0> + 1235675066U, // <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1559721774U, // <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS + 3785777152U, // <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0> + 2712035430U, // <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS + 3771179185U, // <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6> + 3846196096U, // <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1> + 3785777490U, // <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5> + 2250517814U, // <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS + 3324259703U, // <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0> + 3383092458U, // <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7> + 2712035997U, // <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS + 3325356946U, // <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1> + 3785777972U, // <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1> + 3846196170U, // <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3> + 3325365380U, // <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0> + 3852168155U, // <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2> + 2251615542U, // <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS + 3325357432U, // <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1> + 3870084088U, // <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4> + 2251615785U, // <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS + 2252295058U, // <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1> + 3771180605U, // 
<6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4> + 3785778792U, // <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2> + 3777816253U, // <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6> + 2252295376U, // <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4> + 1178553654U, // <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS + 2252295545U, // <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2> + 3326037448U, // <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0> + 1178553897U, // <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS + 3785779350U, // <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2> + 3383118648U, // <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1> + 3777816935U, // <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4> + 3785779612U, // <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3> + 2712037890U, // <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6> + 2252754230U, // <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS + 3784452764U, // <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7> + 3801705178U, // <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6> + 2252754473U, // <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS + 3787770770U, // <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1> + 3383126840U, // <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1> + 3327380534U, // <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3> + 3784453265U, // <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4> + 2253630672U, // <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4> + 2778426587U, // <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6> + 3383128789U, // <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6> + 3381799580U, // <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7> + 2778647798U, // <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6> + 2651422822U, // <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS + 3701277928U, // <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5> + 3701278650U, // <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7> + 2651425282U, // <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6> + 2651426102U, // <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS + 2651426892U, // <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5> + 1698712886U, // <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 3725169658U, // <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2> + 1698712904U, // <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2254900114U, // <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1> + 3389115192U, // <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1> + 3785781727U, // <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3> + 3785781810U, // <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5> + 2254900432U, // <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4> + 1181158710U, // <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS + 2254900605U, // <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6> + 3787772750U, // <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1> + 1181158953U, // <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS + 2639495270U, // <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS + 2639496090U, // <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4> + 3707267011U, // <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7> + 2639497884U, // <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7> + 1237658832U, // <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235666638U, // <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 3713241753U, // <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0> + 2309409436U, // <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1235666641U, // <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u> + 2639503462U, // <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS + 2639504282U, // <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4> + 3701303226U, // <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7> + 2639506077U, // <6,4,u,3>: Cost 3 vsldoi4 
<3,6,4,u>, <3,6,4,u> + 1235676368U, // <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235674830U, // <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 1698713129U, // <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2309417628U, // <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1698713147U, // <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 3775832064U, // <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0> + 2702090342U, // <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS + 3775832241U, // <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6> + 3719227906U, // <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6> + 3775832402U, // <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5> + 3385085146U, // <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5> + 2309351938U, // <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6> + 3376459134U, // <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7> + 2702090909U, // <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS + 3719233546U, // <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1> + 3775832884U, // <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1> + 3775832982U, // <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0> + 3846196909U, // <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4> + 3719236984U, // <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1> + 3856150209U, // <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6> + 3834252997U, // <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1> + 3870084817U, // <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4> + 3769861532U, // <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5> + 2645500006U, // <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS + 3719242548U, // <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1> + 3775833704U, // <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2> + 3775833766U, // <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1> + 2645503353U, // <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2> + 2252296196U, // <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5> + 2702092218U, // <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7> + 3719246842U, // <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2> + 2702092405U, // <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5> + 3775834262U, // <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2> + 3777161495U, // <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5> + 3775834470U, // <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3> + 3775834524U, // <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3> + 3775834626U, // <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6> + 3385109722U, // <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5> + 2309376514U, // <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6> + 3775834819U, // <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1> + 2309376514U, // <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6> + 3719258214U, // <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS + 3385117586U, // <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1> + 3327242008U, // <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3> + 3719260674U, // <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6> + 3719261563U, // <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4> + 2702093622U, // <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS + 2309384706U, // <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6> + 3870085060U, // <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4> + 2702093865U, // <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS + 3719266406U, // <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS + 3789106889U, // <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5> + 3785789208U, // <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3> + 3373183950U, // <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3> + 2717355964U, // <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5> + 2791772164U, // <6,5,5,5>: Cost 3 
vsldoi12 <6,6,6,6>, <5,5,5,5> + 2772455438U, // <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6> + 3373183549U, // <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7> + 2720010496U, // <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5> + 2772455460U, // <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1> + 2322008978U, // <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1> + 3840225335U, // <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2> + 2772455490U, // <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4> + 2772455500U, // <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5> + 2254901252U, // <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5> + 2772455520U, // <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7> + 2785874024U, // <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6> + 2772455532U, // <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1> + 2627625062U, // <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS + 1235667858U, // <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309409278U, // <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309407659U, // <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2627628342U, // <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS + 1235668186U, // <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235667458U, // <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309407987U, // <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235667460U, // <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 2627633254U, // <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS + 1235676050U, // <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309417470U, // <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309415851U, // <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2627636534U, // <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS + 1235676378U, // <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235675650U, // <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309416179U, // <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235675652U, // <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 2309352751U, // <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0> + 1650917478U, // <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 2250584570U, // <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3> + 3846197554U, // <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1> + 2724659538U, // <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5> + 3725275225U, // <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0> + 2791772493U, // <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1> + 2309352758U, // <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS + 1650918045U, // <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 3325358368U, // <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1> + 2299406449U, // <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1> + 2724660118U, // <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0> + 3373148518U, // <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3> + 3834253712U, // <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5> + 3373147953U, // <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5> + 2323297080U, // <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6> + 2299407670U, // <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS + 2299407671U, // <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS + 2252296489U, // <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1> + 3326038394U, // <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1> + 1178554874U, // <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2724660902U, // <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1> + 2252296817U, // <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5> + 3840741864U, // <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3> + 2252296976U, // <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2> + 2785874426U, // <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3> + 1178554874U, // <6,6,2,u>: Cost 2 
vmrghw <6,2,7,3>, <6,2,7,3> + 2724661398U, // <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2> + 3375154665U, // <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1> + 3375154909U, // <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2> + 2301413734U, // <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3> + 2772455986U, // <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5> + 3375154993U, // <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5> + 2323313464U, // <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6> + 2301414710U, // <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS + 2301414711U, // <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS + 2724662162U, // <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1> + 3326939559U, // <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1> + 2253271546U, // <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3> + 3383127346U, // <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3> + 2309385523U, // <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4> + 1650920758U, // <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 2724662653U, // <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6> + 2309385526U, // <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS + 1650921001U, // <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 3725312102U, // <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS + 3373180393U, // <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1> + 3791769368U, // <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3> + 3373181286U, // <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3> + 3725315382U, // <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS + 2299439221U, // <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5> + 2724663394U, // <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0> + 2299440438U, // <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS + 2299440439U, // <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS + 1583808614U, // <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 2322010445U, // <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1> + 2254574074U, // <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3> + 2322010609U, // <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3> + 1583811894U, // <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 2322010773U, // <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5> + 363253046U, // <6,6,6,6>: Cost 1 vspltisw2 RHS + 1248267574U, // <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS + 363253046U, // <6,6,6,u>: Cost 1 vspltisw2 RHS + 2309410095U, // <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0> + 2309408233U, // <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1> + 2311402373U, // <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2> + 2309409126U, // <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3> + 2309410099U, // <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4> + 2309408561U, // <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5> + 1237660472U, // <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6> + 161926454U, // <6,6,7,7>: Cost 1 vmrglw RHS, RHS + 161926455U, // <6,6,7,u>: Cost 1 vmrglw RHS, RHS + 1583808614U, // <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 1650923310U, // <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 1178554874U, // <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2309417318U, // <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3> + 1583811894U, // <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 1650923674U, // <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 363253046U, // <6,6,u,6>: Cost 1 vspltisw2 RHS + 161934646U, // <6,6,u,7>: Cost 1 vmrglw RHS, RHS + 161934647U, // <6,6,u,u>: Cost 1 vmrglw RHS, RHS + 1638318080U, // <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564576358U, // <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712060077U, // <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2712060156U, // <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0> + 1638318418U, // <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 
1577865314U, // <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0> + 2712060406U, // <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2651608058U, // <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2> + 564576925U, // <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS + 2712060643U, // <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1> + 1638318900U, // <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1638318998U, // <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0> + 3766559753U, // <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7> + 2712060971U, // <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5> + 2712061039U, // <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2712061135U, // <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7> + 3373148612U, // <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7> + 1638319484U, // <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0> + 2712061373U, // <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 2712061471U, // <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1> + 1638319720U, // <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638319782U, // <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712061709U, // <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 2712061800U, // <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6> + 1638320058U, // <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7> + 2252297836U, // <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7> + 1638320187U, // <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1> + 1638320278U, // <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712062182U, // <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2712062256U, // <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3> + 1638320540U, // <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638320642U, // <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712062546U, // <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 2712062584U, // <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7> + 2712062659U, // <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1> + 1638320926U, // <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638321042U, // <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712062922U, // <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712063029U, // <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2712063108U, // <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0> + 1638321360U, // <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564579638U, // <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712063357U, // <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6> + 2712063439U, // <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7> + 564579881U, // <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS + 2712063560U, // <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2> + 2714054287U, // <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712063742U, // <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 3373181295U, // <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3> + 2712063924U, // <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6> + 1638322180U, // <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1638322274U, // <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0> + 3373181380U, // <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7> + 1640313092U, // <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0> + 2712064289U, // <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2> + 2712064423U, // <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1> + 1638322682U, // <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 2712064562U, // <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5> + 2712064653U, // <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6> + 2712064747U, // <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1> + 1638323000U, // <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6> + 1638323022U, // <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 1638323168U, // <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3> + 1237659746U, // <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0> + 2309411158U, // <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1> + 
2639718330U, // <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7> + 1235669498U, // <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3> + 1237659750U, // <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4> + 2309411243U, // <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5> + 1583895362U, // <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7> + 1235669826U, // <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7> + 1235669503U, // <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u> + 1638323923U, // <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2> + 564582190U, // <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS + 1638324101U, // <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0> + 1638324156U, // <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1> + 1638324287U, // <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6> + 564582554U, // <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS + 1638324432U, // <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7> + 1235678018U, // <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7> + 564582757U, // <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS + 1638326272U, // <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564584550U, // <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712068269U, // <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2309349532U, // <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS + 1638326610U, // <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1577939051U, // <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0> + 2712068598U, // <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2309352776U, // <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS + 564585117U, // <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS + 2712068835U, // <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1> + 1638327092U, // <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1698715438U, // <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2299404444U, // <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS + 2712069163U, // <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5> + 2712069231U, // <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2712069327U, // <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7> + 2299407688U, // <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS + 1698715492U, // <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2712069565U, // <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 1178556206U, // <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS + 1638327912U, // <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638327974U, // <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712069901U, // <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 1178556570U, // <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS + 1638328250U, // <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7> + 2252298496U, // <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1> + 1638328379U, // <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1> + 1638328470U, // <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712070374U, // <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2704107883U, // <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u> + 1638328732U, // <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638328834U, // <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712070738U, // <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 2712070776U, // <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7> + 2301414728U, // <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS + 1638329118U, // <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638329234U, // <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712071114U, // <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712071221U, // <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2309382300U, // <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS + 1638329552U, // <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564587831U, // <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712071545U, // <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2> + 2309385544U, // <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS + 564588073U, // <6,u,4,u>: Cost 1 vsldoi8 
RHS, RHS + 2712071752U, // <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2> + 2714062479U, // <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712071934U, // <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 2299437212U, // <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS + 2712072116U, // <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6> + 1638330372U, // <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1698715802U, // <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2299440456U, // <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS + 1698715820U, // <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 1583808614U, // <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 1181161262U, // <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS + 1638330874U, // <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 1248264348U, // <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS + 1583811894U, // <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 1181161626U, // <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS + 363253046U, // <6,u,6,6>: Cost 1 vspltisw2 RHS + 1638331214U, // <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 363253046U, // <6,u,6,u>: Cost 1 vspltisw2 RHS + 1560076390U, // <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS + 1235664969U, // <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1> + 1560078311U, // <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7> + 161923228U, // <6,u,7,3>: Cost 1 vmrglw RHS, LHS + 1560079670U, // <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS + 1235665297U, // <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5> + 1235667485U, // <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6> + 161926472U, // <6,u,7,7>: Cost 1 vmrglw RHS, RHS + 161923233U, // <6,u,7,u>: Cost 1 vmrglw RHS, LHS + 1560084582U, // <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS + 564590382U, // <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS + 1560086504U, // <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u> + 161931420U, // <6,u,u,3>: Cost 1 vmrglw RHS, LHS + 1560087862U, // <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS + 564590746U, // <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS + 363253046U, // <6,u,u,6>: Cost 1 vspltisw2 RHS + 161934664U, // <6,u,u,7>: Cost 1 vmrglw RHS, RHS + 161931425U, // <6,u,u,u>: Cost 1 vmrglw RHS, LHS + 1705426944U, // <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0> + 1705426954U, // <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1> + 3713550266U, // <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7> + 2316063892U, // <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3> + 2779168805U, // <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1> + 2663698530U, // <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0> + 2657727309U, // <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0> + 2316064220U, // <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7> + 1705427017U, // <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1> + 1583988838U, // <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS + 2779168859U, // <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1> + 631685222U, // <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS + 2639817411U, // <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1> + 1583992118U, // <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS + 2657734660U, // <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5> + 1583993678U, // <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1> + 2657735672U, // <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0> + 631685276U, // <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS + 2779168933U, // <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3> + 2767667377U, // <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6> + 2718713448U, // <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2> + 2718713510U, // <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1> + 3841409228U, // <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6> + 3852910802U, // <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3> + 2718713786U, // 
<7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7> + 3847160036U, // <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3> + 2767667440U, // <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6> + 2718714006U, // <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2> + 2779169020U, // <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0> + 3852910853U, // <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0> + 2718714268U, // <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3> + 2718714370U, // <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6> + 2718714461U, // <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7> + 2706770608U, // <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0> + 3847160114U, // <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0> + 2779169083U, // <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0> + 2718714770U, // <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1> + 1705427282U, // <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5> + 3713583034U, // <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7> + 3713583814U, // <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4> + 2779169133U, // <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5> + 1644973366U, // <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS + 2657760081U, // <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4> + 2259468868U, // <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4> + 1705427345U, // <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5> + 2718715508U, // <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1> + 2260123750U, // <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS + 3792457451U, // <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3> + 3852911024U, // <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0> + 2718715836U, // <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5> + 2718715908U, // <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5> + 1644974178U, // <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0> + 3792457853U, // <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0> + 1646301444U, // <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0> + 2720706901U, // <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0> + 2779169270U, // <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7> + 2718716410U, // <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3> + 2722697800U, // <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0> + 3852911121U, // <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7> + 3852911130U, // <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7> + 2718716728U, // <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6> + 2718716750U, // <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1> + 2779169333U, // <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7> + 2718716922U, // <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2> + 1187872870U, // <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS + 2718717076U, // <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3> + 3847160408U, // <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6> + 2718717286U, // <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6> + 2718717377U, // <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7> + 2718717404U, // <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7> + 2718717478U, // <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0> + 1187873437U, // <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS + 1584046182U, // <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS + 1705427602U, // <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1> + 631685789U, // <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS + 2639874762U, // <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u> + 1584049462U, // <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS + 1644976282U, // <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS + 1584051029U, // <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u> + 2718718208U, // <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1> + 631685843U, // <7,0,u,u>: Cost 1 vsldoi12 
RHS, LHS + 2721374218U, // <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1> + 2779169507U, // <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1> + 2779169516U, // <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1> + 3852911348U, // <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0> + 2669743414U, // <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS + 2316058962U, // <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5> + 2316059044U, // <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6> + 2669745146U, // <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2> + 2779169570U, // <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1> + 2779169579U, // <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1> + 1705427764U, // <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779169598U, // <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2> + 3713632972U, // <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1> + 2779169619U, // <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5> + 2779169628U, // <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5> + 2657809239U, // <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1> + 3835290474U, // <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1> + 1705427764U, // <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779169660U, // <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1> + 2779169671U, // <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3> + 2779169680U, // <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3> + 1705427862U, // <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0> + 2779169700U, // <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5> + 2779169707U, // <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3> + 2657817432U, // <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2> + 2803057594U, // <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0> + 1705427907U, // <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0> + 3776538827U, // <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1> + 2319400970U, // <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1> + 2316085398U, // <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2> + 3852911591U, // <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0> + 3852911600U, // <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0> + 2319401298U, // <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5> + 3833668617U, // <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7> + 3367265487U, // <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7> + 2319400977U, // <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u> + 2724031378U, // <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1> + 2779169835U, // <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5> + 2779169844U, // <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5> + 3852911672U, // <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0> + 2669776182U, // <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS + 2779169872U, // <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6> + 3835290712U, // <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5> + 2669778278U, // <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6> + 2779169898U, // <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5> + 2779169903U, // <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1> + 3835585661U, // <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6> + 3841410182U, // <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6> + 3852911753U, // <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0> + 2779169943U, // <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5> + 2318754130U, // <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5> + 2718724195U, // <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1> + 3859178670U, // <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1> + 2779169975U, // <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1> + 2720715094U, // <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1> + 2761549007U, // <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7> + 2779170008U, // <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7> + 
3835438305U, // <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7> + 3835512042U, // <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7> + 2761843955U, // <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7> + 3835659516U, // <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7> + 2803057918U, // <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0> + 2762065166U, // <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7> + 2669797478U, // <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS + 2322087946U, // <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1> + 2317448186U, // <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2> + 3395829934U, // <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3> + 2669800758U, // <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS + 2322088274U, // <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5> + 3375923377U, // <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6> + 2731996780U, // <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7> + 2322087953U, // <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u> + 2779170146U, // <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1> + 1705427764U, // <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779170164U, // <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1> + 1705428348U, // <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0> + 2779170186U, // <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5> + 2763171221U, // <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7> + 2657866590U, // <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u> + 2803058080U, // <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0> + 1705428393U, // <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0> + 3713695846U, // <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS + 2779170237U, // <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2> + 2779170245U, // <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1> + 1242316902U, // <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS + 3713699126U, // <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS + 3852912096U, // <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1> + 2767668713U, // <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1> + 2256488426U, // <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1> + 1242316907U, // <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS + 3852912132U, // <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1> + 3852912141U, // <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1> + 3852912149U, // <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0> + 2779170335U, // <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1> + 3852912172U, // <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5> + 3840747062U, // <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6> + 3841410617U, // <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0> + 3795125538U, // <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0> + 2779170380U, // <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1> + 2779170389U, // <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1> + 3852912222U, // <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1> + 1705428584U, // <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1705428594U, // <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3> + 2779170429U, // <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5> + 3852912259U, // <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2> + 2767668880U, // <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6> + 3841336981U, // <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2> + 1705428639U, // <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3> + 1705428646U, // <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1> + 2779170479U, // <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1> + 2767668925U, // <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6> + 1245659238U, // <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS + 1705428686U, // <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5> + 2779170519U, // <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5> + 2657899362U, // <7,2,3,6>: 
Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3> + 2319406574U, // <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7> + 1705428718U, // <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1> + 3713728614U, // <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS + 3852912388U, // <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5> + 2779170573U, // <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5> + 1242349670U, // <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS + 3713731894U, // <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS + 2779170601U, // <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6> + 2767669041U, // <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5> + 3389834456U, // <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7> + 1242349675U, // <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS + 3852912456U, // <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1> + 3852912466U, // <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2> + 3852912475U, // <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2> + 2779170664U, // <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6> + 3852912496U, // <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5> + 3792474116U, // <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5> + 2718732388U, // <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2> + 3841337228U, // <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6> + 2779170709U, // <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6> + 2640003174U, // <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS + 2721386920U, // <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2> + 2767595441U, // <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7> + 1693927354U, // <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7> + 2640006454U, // <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS + 3841558476U, // <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7> + 2657923941U, // <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6> + 3841337310U, // <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7> + 1694296039U, // <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7> + 2803058666U, // <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1> + 3852912632U, // <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6> + 2322089576U, // <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2> + 1248346214U, // <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS + 3841337362U, // <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5> + 3395830836U, // <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5> + 2261616570U, // <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7> + 3371943857U, // <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7> + 1248346219U, // <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS + 1705429051U, // <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1> + 2779170884U, // <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1> + 1705428584U, // <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1695254620U, // <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7> + 1705429091U, // <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5> + 2779170924U, // <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5> + 2767669361U, // <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1> + 2803058809U, // <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0> + 1695623305U, // <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7> + 2779170955U, // <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0> + 1705429142U, // <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2> + 2634057732U, // <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0> + 2779170983U, // <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1> + 2779170992U, // <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1> + 3852912829U, // <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5> + 2657948520U, // <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0> + 2316060602U, // <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7> + 1705429205U, // <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2> + 3852912860U, // <7,3,1,0>: Cost 4 vsldoi12 
RHS, <3,1,0,0> + 2779171046U, // <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1> + 2779171057U, // <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3> + 3852912887U, // <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0> + 3852912896U, // <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0> + 3852912905U, // <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0> + 3835291923U, // <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1> + 3841411356U, // <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1> + 2779171111U, // <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3> + 2779171120U, // <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3> + 3852912952U, // <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2> + 2779171137U, // <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2> + 2779171144U, // <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0> + 2779171156U, // <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3> + 3852912989U, // <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3> + 2767669606U, // <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3> + 2767669615U, // <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3> + 2779171189U, // <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0> + 2779171198U, // <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0> + 3852913032U, // <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1> + 2704140655U, // <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3> + 1705429404U, // <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2779171238U, // <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4> + 3852913070U, // <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3> + 2657973099U, // <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3> + 2767669700U, // <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7> + 1705429404U, // <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2779171280U, // <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1> + 2779171290U, // <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2> + 2634090504U, // <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4> + 2779171311U, // <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5> + 2779171319U, // <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4> + 1705429506U, // <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6> + 2722057593U, // <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2> + 2316093370U, // <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7> + 1705429533U, // <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6> + 3852913185U, // <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1> + 3795799695U, // <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1> + 3852913203U, // <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1> + 3852913214U, // <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3> + 3852913225U, // <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5> + 2779171410U, // <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5> + 2718740581U, // <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3> + 3841411685U, // <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6> + 2720067847U, // <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3> + 2773420664U, // <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7> + 3847236225U, // <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7> + 1648316922U, // <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3> + 2773641875U, // <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7> + 2773715612U, // <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7> + 3847531173U, // <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7> + 2722059024U, // <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2> + 2767669943U, // <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7> + 1652298720U, // <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3> + 2767669955U, // <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1> + 3841411788U, // <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1> + 2767669978U, // <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6> + 2722059546U, // <7,3,7,3>: Cost 3 
vsldoi8 <6,2,7,3>, <7,3,6,2> + 2767669995U, // <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5> + 3852913396U, // <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5> + 2722059758U, // <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7> + 2302183354U, // <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7> + 2767670027U, // <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1> + 2774747930U, // <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7> + 1705429790U, // <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2> + 1660262316U, // <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3> + 1705429404U, // <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2775042878U, // <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7> + 1705429830U, // <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6> + 2779171660U, // <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3> + 2767670101U, // <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3> + 1705429853U, // <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2> + 2718744576U, // <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0> + 1645002854U, // <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS + 3852913527U, // <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1> + 3852913536U, // <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1> + 2316061904U, // <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4> + 1705429906U, // <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1> + 2658022257U, // <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0> + 2256489928U, // <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0> + 1707420589U, // <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1> + 3852913590U, // <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1> + 2718745396U, // <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1> + 2779171786U, // <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3> + 3852913616U, // <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0> + 3852913627U, // <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2> + 2779171810U, // <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0> + 3792487631U, // <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7> + 3394456220U, // <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7> + 2779171837U, // <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0> + 3852913673U, // <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3> + 3852913682U, // <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3> + 2718746216U, // <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2> + 2718746278U, // <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1> + 2779171885U, // <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3> + 2779171893U, // <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2> + 2718746554U, // <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7> + 3847457864U, // <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3> + 2779171921U, // <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3> + 2718746774U, // <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2> + 3852913762U, // <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2> + 3852913772U, // <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3> + 2718747036U, // <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3> + 2718747138U, // <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6> + 2779171972U, // <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0> + 2706803380U, // <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4> + 3847457946U, // <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4> + 2781162655U, // <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0> + 2718747538U, // <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1> + 3852913842U, // <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1> + 3852913852U, // <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2> + 2316096696U, // <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3> + 1705430224U, // <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4> + 1705430234U, // <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5> + 2658055029U, // <7,4,4,6>: Cost 
3 vsldoi4 <6,7,4,4>, <6,7,4,4> + 2316097024U, // <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7> + 1707420917U, // <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5> + 1584316518U, // <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS + 2658059060U, // <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1> + 2640144314U, // <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7> + 2640145131U, // <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5> + 1584319798U, // <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS + 2779172134U, // <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0> + 631688502U, // <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS + 2658063354U, // <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2> + 631688520U, // <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS + 3852914001U, // <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7> + 3852914010U, // <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7> + 2718749178U, // <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3> + 2722730572U, // <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4> + 2723394205U, // <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4> + 2779172221U, // <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6> + 2718749496U, // <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6> + 2718749518U, // <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1> + 2779172249U, // <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7> + 2718749690U, // <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2> + 3847458214U, // <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2> + 2718749880U, // <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3> + 3847458236U, // <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6> + 2718750004U, // <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1> + 1187876150U, // <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS + 2718750208U, // <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7> + 2718750286U, // <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4> + 1187876393U, // <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS + 1584341094U, // <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS + 1645008686U, // <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS + 2640168890U, // <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7> + 2640169710U, // <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u> + 1584344374U, // <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS + 1705430554U, // <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1> + 631688745U, // <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS + 2718750976U, // <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1> + 631688763U, // <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS + 2646147174U, // <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS + 2779172424U, // <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2> + 3852914258U, // <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3> + 3852914268U, // <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4> + 2779172450U, // <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1> + 2316061914U, // <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5> + 2316061186U, // <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6> + 2646152186U, // <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2> + 2779172486U, // <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1> + 2781163151U, // <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1> + 2321378194U, // <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1> + 3852914339U, // <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3> + 3852914350U, // <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5> + 2781163191U, // <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5> + 3852914363U, // <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0> + 3835588297U, // <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5> + 3835588306U, // <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5> + 2781163223U, // <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1> + 3852914400U, // <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1> + 
2781163243U, // <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3> + 3852914419U, // <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2> + 2779172606U, // <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4> + 3780552497U, // <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5> + 2781163279U, // <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3> + 2779172632U, // <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3> + 3835588385U, // <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3> + 2779172650U, // <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3> + 3852914481U, // <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1> + 2319403922U, // <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1> + 2319404409U, // <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2> + 3852914510U, // <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3> + 3779226131U, // <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5> + 2319404250U, // <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5> + 2319403522U, // <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6> + 3852914547U, // <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4> + 2319403524U, // <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u> + 2646179942U, // <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS + 2316094354U, // <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1> + 3852914582U, // <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3> + 3852914592U, // <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4> + 2646183372U, // <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4> + 2779172788U, // <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6> + 2316093954U, // <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6> + 2646185318U, // <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6> + 2779172815U, // <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6> + 2781163475U, // <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1> + 2781163484U, // <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1> + 3852914662U, // <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2> + 3852914672U, // <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3> + 2781163515U, // <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5> + 1705431044U, // <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779172878U, // <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6> + 3835588632U, // <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7> + 1705431044U, // <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779172900U, // <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1> + 2781163571U, // <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7> + 3852914743U, // <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2> + 2779172930U, // <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4> + 2779172940U, // <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5> + 2781163607U, // <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7> + 2779172960U, // <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7> + 1705431138U, // <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0> + 1705578603U, // <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0> + 2646204518U, // <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS + 2322090898U, // <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1> + 3719947880U, // <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2> + 3719948438U, // <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2> + 2646207951U, // <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7> + 2322091226U, // <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5> + 2322090498U, // <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6> + 2646210156U, // <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7> + 2646210350U, // <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS + 2779173062U, // <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1> + 2779173072U, // <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2> + 2319404409U, // <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2> + 2779173092U, // <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4> + 2779173101U, // <7,5,u,4>: Cost 3 
vsldoi12 RHS, <5,u,4,4> + 1705431044U, // <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779173118U, // <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3> + 1705578756U, // <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0> + 1707421965U, // <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0> + 3852914966U, // <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0> + 2779173153U, // <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2> + 2256491002U, // <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3> + 3852914994U, // <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1> + 3852915003U, // <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1> + 2316062652U, // <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5> + 2316063544U, // <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6> + 1242320182U, // <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS + 1242320183U, // <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS + 3852915048U, // <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1> + 3377866217U, // <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1> + 3852915068U, // <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3> + 3833672072U, // <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6> + 3852915088U, // <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5> + 3395122056U, // <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5> + 3389813560U, // <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6> + 2779173287U, // <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1> + 2779320752U, // <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1> + 2658181222U, // <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS + 3852915140U, // <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3> + 2257973754U, // <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3> + 3841413589U, // <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2> + 2658184502U, // <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS + 3852915176U, // <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3> + 2658186117U, // <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2> + 1705431546U, // <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3> + 1705579011U, // <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3> + 3714015334U, // <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS + 3777243425U, // <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6> + 2319405957U, // <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2> + 3375229286U, // <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3> + 2779173426U, // <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5> + 3375228721U, // <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5> + 2319405880U, // <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6> + 1245662518U, // <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS + 1245662519U, // <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS + 3852915291U, // <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1> + 3389834729U, // <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1> + 2259472890U, // <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3> + 3852915321U, // <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4> + 3852915330U, // <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4> + 2779173517U, // <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6> + 2316096312U, // <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6> + 1242352950U, // <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS + 1242352951U, // <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS + 3852915372U, // <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1> + 3835294392U, // <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4> + 3852915395U, // <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6> + 3852915404U, // <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6> + 3852915412U, // <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5> + 3377899313U, // <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5> + 2718765160U, // <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6> + 2779173611U, // <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1> + 2779321076U, // <7,6,5,u>: 
Cost 3 vsldoi12 RHS, <6,5,u,1> + 2658213990U, // <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS + 3852915462U, // <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1> + 2718765562U, // <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3> + 3714042622U, // <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6> + 2658217270U, // <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS + 2724074224U, // <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6> + 1705431864U, // <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6> + 1705431874U, // <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7> + 1705579339U, // <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7> + 1705431886U, // <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1> + 2779173719U, // <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1> + 2779173729U, // <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2> + 2779173736U, // <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0> + 1705431926U, // <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5> + 2779173759U, // <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5> + 2779173765U, // <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2> + 1248349494U, // <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS + 1705431958U, // <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1> + 1705579423U, // <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1> + 2779173801U, // <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2> + 2779321266U, // <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2> + 2779321273U, // <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0> + 1705579463U, // <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5> + 2779173841U, // <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6> + 1705431864U, // <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6> + 1705432032U, // <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3> + 1705579495U, // <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1> + 1242320994U, // <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0> + 1705432058U, // <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2> + 3841414146U, // <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1> + 2316063226U, // <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3> + 2779173908U, // <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1> + 2658242658U, // <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0> + 2658243468U, // <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0> + 2316063554U, // <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7> + 1705432121U, // <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2> + 3852915777U, // <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1> + 2779173962U, // <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1> + 2779173973U, // <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3> + 3389813242U, // <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3> + 3852915813U, // <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1> + 3852915821U, // <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0> + 3835294839U, // <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1> + 2329343596U, // <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7> + 2779174027U, // <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3> + 2803061908U, // <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3> + 3852915869U, // <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3> + 2779174053U, // <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2> + 2779174060U, // <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0> + 2803061944U, // <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3> + 3852915905U, // <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3> + 2767672522U, // <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3> + 2791855315U, // <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3> + 2768999644U, // <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3> + 2779174115U, // <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1> + 3852915948U, // <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1> + 3841414394U, // <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6> + 1245663738U, // 
<7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 2779174155U, // <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5> + 3852915988U, // <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5> + 2706827959U, // <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7> + 2319405890U, // <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7> + 1245663738U, // <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 2779174200U, // <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5> + 3852916030U, // <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2> + 3714099130U, // <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7> + 2316095994U, // <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3> + 1242353766U, // <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4> + 1705432422U, // <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6> + 2658276240U, // <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4> + 2316096322U, // <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7> + 1705432449U, // <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6> + 3852916101U, // <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1> + 3854906765U, // <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0> + 3852916121U, // <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3> + 3389846010U, // <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3> + 3852916141U, // <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5> + 2779174326U, // <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5> + 2779174337U, // <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7> + 2329376364U, // <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7> + 2779321811U, // <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7> + 2658287718U, // <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS + 3852916197U, // <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7> + 2779174382U, // <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7> + 2316112378U, // <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3> + 2658290998U, // <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS + 3852916233U, // <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7> + 1651004226U, // <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7> + 2779174420U, // <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0> + 1652331492U, // <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7> + 1590526054U, // <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS + 2328728623U, // <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1> + 2724746451U, // <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3> + 2322092538U, // <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3> + 1590529334U, // <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS + 2328728951U, // <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5> + 2724746770U, // <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7> + 430361910U, // <7,7,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,7,7,u>: Cost 1 vspltisw3 RHS + 1242320994U, // <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0> + 1705580162U, // <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2> + 2779321996U, // <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3> + 1245663738U, // <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 1242353766U, // <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4> + 1705580202U, // <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6> + 1662949620U, // <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7> + 430361910U, // <7,7,u,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,7,u,u>: Cost 1 vspltisw3 RHS + 1705426944U, // <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0> + 1705432787U, // <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2> + 2316060885U, // <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2> + 1242316956U, // <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS + 2779174637U, // <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1> + 1182750874U, // <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS + 2316061213U, // <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6> + 1242320200U, // <7,u,0,7>: Cost 2 
vmrglw <5,6,7,0>, RHS + 1705432850U, // <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2> + 1584578662U, // <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS + 1705427764U, // <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 631691054U, // <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS + 2640407307U, // <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1> + 1584581942U, // <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS + 2779174726U, // <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0> + 1584583574U, // <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1> + 2779322201U, // <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1> + 631691108U, // <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS + 2779174763U, // <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1> + 2779174774U, // <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3> + 1705428584U, // <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1705432965U, // <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0> + 2779174801U, // <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3> + 2779174810U, // <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3> + 2767673251U, // <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3> + 1705580460U, // <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3> + 1705433010U, // <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0> + 1705433020U, // <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1> + 2779174853U, // <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1> + 2767673299U, // <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6> + 1245659292U, // <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS + 1705433060U, // <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5> + 2779174893U, // <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5> + 2706836152U, // <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u> + 1245662536U, // <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS + 1705433092U, // <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1> + 2779174925U, // <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1> + 1185732398U, // <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS + 2316093653U, // <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2> + 1242349724U, // <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS + 1705430224U, // <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4> + 1705433151U, // <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6> + 2316093981U, // <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6> + 1242352968U, // <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS + 1705433178U, // <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6> + 1584611430U, // <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS + 2781165670U, // <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0> + 2640439226U, // <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7> + 2640440079U, // <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5> + 1584614710U, // <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS + 1705431044U, // <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 631691418U, // <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS + 2779322525U, // <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1> + 631691436U, // <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS + 2779175087U, // <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1> + 2779175102U, // <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7> + 1648357887U, // <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u> + 1705433296U, // <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7> + 2779175127U, // <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5> + 2779175138U, // <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7> + 1651012419U, // <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u> + 1705580788U, // <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7> + 1705433341U, // <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7> + 1705580800U, // <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1> + 1187878702U, // <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS + 2768042263U, // <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6> + 1248346268U, // <7,u,7,3>: Cost 2 
vmrglw <6,6,7,7>, LHS + 1705580840U, // <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5> + 1187879066U, // <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS + 2779322679U, // <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2> + 430361910U, // <7,u,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,u,7,u>: Cost 1 vspltisw3 RHS + 1705433425U, // <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1> + 1705433435U, // <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2> + 631691621U, // <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS + 1705433451U, // <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0> + 1705433465U, // <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5> + 1705433475U, // <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6> + 631691661U, // <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS + 430361910U, // <7,u,u,7>: Cost 1 vspltisw3 RHS + 631691675U, // <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS + 202162278U, // <u,0,0,0>: Cost 1 vspltisw0 LHS + 1678598154U, // <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1> + 2634500154U, // <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0> + 2289596269U, // <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3> + 1548815670U, // <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS + 2663698530U, // <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0> + 2658390942U, // <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0> + 2289596597U, // <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7> + 202162278U, // <u,0,0,u>: Cost 1 vspltisw0 LHS + 1560764518U, // <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS + 115720294U, // <u,0,1,1>: Cost 1 vmrghw LHS, LHS + 604856427U, // <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS + 2634508438U, // <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2> + 1560767798U, // <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS + 2652426438U, // <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1> + 1584657311U, // <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1> + 2658399226U, // <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2> + 604856476U, // <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696889850U, // <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0> + 1190174822U, // <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS + 2692245096U, // <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2> + 2692245158U, // <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1> + 2263916882U, // <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5> + 2299709908U, // <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5> + 2692245434U, // <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7> + 2701535281U, // <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0> + 1190175389U, // <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS + 1209237504U, // <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1209239206U, // <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 2704189813U, // <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0> + 2692245916U, // <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3> + 2282981033U, // <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4> + 2664386658U, // <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0> + 2691877496U, // <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7> + 2664388218U, // <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3> + 1209239213U, // <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 2289623040U, // <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0> + 1678598482U, // <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5> + 2634532926U, // <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4> + 2235580672U, // <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4> + 1143619922U, // <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1618505014U, // <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS + 2658423714U, // <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4> + 2713259464U, // <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0> + 1683243409U, // <u,0,4,u>: Cost 2 
vsldoi12 LHS, <0,4,u,5> + 1192443904U, // <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 118702182U, // <u,0,5,1>: Cost 1 vmrghw RHS, LHS + 2266185901U, // <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2> + 2640513816U, // <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5> + 1192444242U, // <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2718789636U, // <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5> + 1645047915U, // <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0> + 2664404604U, // <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5> + 118702749U, // <u,0,5,u>: Cost 1 vmrghw RHS, LHS + 2302910464U, // <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0> + 1192886374U, // <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS + 2718790138U, // <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3> + 2722771537U, // <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0> + 2266628434U, // <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5> + 2248950180U, // <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6> + 2718790456U, // <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6> + 2718790478U, // <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1> + 1192886941U, // <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS + 1235812352U, // <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235814054U, // <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 2728080601U, // <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0> + 2640530202U, // <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7> + 2640530742U, // <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS + 2309556692U, // <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5> + 2730735133U, // <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0> + 2309556856U, // <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1235814061U, // <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u> + 202162278U, // <u,0,u,0>: Cost 1 vspltisw0 LHS + 120365158U, // <u,0,u,1>: Cost 1 vmrghw LHS, LHS + 604856989U, // <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS + 2692249532U, // <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1> + 1560825142U, // <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS + 1618507930U, // <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS + 1584714662U, // <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u> + 2309565048U, // <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 604857043U, // <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS + 1611210825U, // <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1> + 1616519270U, // <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS + 2287605459U, // <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2> + 2640546588U, // <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0> + 2622631222U, // <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS + 2289590610U, // <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5> + 2664436630U, // <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1> + 2664437376U, // <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0> + 1616519889U, // <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1> + 1548894866U, // <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1> + 269271142U, // <u,1,1,1>: Cost 1 vspltisw1 LHS + 1189462934U, // <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 2622638230U, // <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2> + 1548897590U, // <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS + 2756985692U, // <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5> + 2658472872U, // <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1> + 2287614142U, // <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7> + 269271142U, // <u,1,1,u>: Cost 1 vspltisw1 LHS + 1566818406U, // <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS + 2756985735U, // <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3> + 1148371862U, // <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1566821686U, // <u,1,2,4>: Cost 2 
vsldoi4 <3,u,1,2>, RHS + 2756985771U, // <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3> + 2690262970U, // <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7> + 1590711938U, // <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 2282979337U, // <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1209237514U, // <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1209239702U, // <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2282979502U, // <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2282979341U, // <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4> + 1209237842U, // <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2282979505U, // <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287625423U, // <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1209237521U, // <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 1635101613U, // <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1> + 2289623050U, // <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1> + 2289625238U, // <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2> + 2640579360U, // <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4> + 2622663990U, // <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS + 1616522550U, // <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS + 2664469398U, // <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1> + 2664470148U, // <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4> + 1616522793U, // <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS + 1548927638U, // <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5> + 1192444724U, // <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1192444822U, // <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 2622670998U, // <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2> + 1548930358U, // <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS + 1210728786U, // <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 2714153058U, // <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0> + 2670449658U, // <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2> + 1548932910U, // <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS + 2622677655U, // <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6> + 2756986063U, // <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7> + 2302912662U, // <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2> + 3696421014U, // <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2> + 2622680374U, // <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS + 2756986099U, // <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7> + 2714153784U, // <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6> + 1651692438U, // <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1> + 1652356071U, // <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1> + 2628657254U, // <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS + 1235812362U, // <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235814550U, // <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2309554350U, // <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3> + 2628660534U, // <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS + 1235812690U, // <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309554353U, // <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309554678U, // <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235812369U, // <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 1548952217U, // <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u> + 269271142U, // <u,1,u,1>: Cost 1 vspltisw1 LHS + 1209280662U, // <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1548954934U, // <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS + 1209278802U, // <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2283020465U, // <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 1590761096U, // <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 2702876672U, // <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0> + 
1629134950U, // <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS + 2289591912U, // <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2> + 1215848550U, // <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS + 2702877010U, // <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5> + 2289222708U, // <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5> + 2779178473U, // <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1> + 2726249024U, // <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0> + 1215848555U, // <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS + 2690933539U, // <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2> + 2628683124U, // <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1> + 1189463656U, // <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1213866086U, // <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS + 2628685110U, // <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS + 2263205736U, // <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6> + 1189463994U, // <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2263205866U, // <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1213866091U, // <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS + 1556938854U, // <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 2697569869U, // <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2> + 336380006U, // <u,2,2,2>: Cost 1 vspltisw2 LHS + 1678599794U, // <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3> + 1556942134U, // <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 2295138061U, // <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5> + 2702878650U, // <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7> + 2300229831U, // <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7> + 336380006U, // <u,2,2,u>: Cost 1 vspltisw2 LHS + 475243165U, // <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS + 1548985140U, // <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1209239144U, // <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2> + 135495782U, // <u,2,3,3>: Cost 1 vmrglw LHS, LHS + 475245878U, // <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS + 1596764164U, // <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1596764666U, // <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1596765178U, // <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 135495787U, // <u,2,3,u>: Cost 1 vmrglw LHS, LHS + 2708851630U, // <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2> + 2217362979U, // <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5> + 2289624680U, // <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2> + 1215881318U, // <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS + 2726767824U, // <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4> + 1629138230U, // <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS + 2779178801U, // <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5> + 2726251976U, // <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0> + 1215881323U, // <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS + 2628714598U, // <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS + 2628715896U, // <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5> + 1192445544U, // <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1213898854U, // <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS + 2628717878U, // <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS + 2726768644U, // <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5> + 1192445882U, // <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2266187754U, // <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1> + 1213898859U, // <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS + 2634694758U, // <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS + 2721460657U, // <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2> + 2296940136U, // <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2> + 1678600122U, // <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7> + 2634698038U, // <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS + 3370682125U, // <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5> + 1157056442U, // 
<u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2725442455U, // <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2> + 1678600167U, // <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7> + 1653027897U, // <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2> + 2309554924U, // <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235813992U, // <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 162070630U, // <u,2,7,3>: Cost 1 vmrglw RHS, LHS + 2634706230U, // <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS + 2309555252U, // <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5> + 2309555901U, // <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309555416U, // <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 162070635U, // <u,2,7,u>: Cost 1 vmrglw RHS, LHS + 475284130U, // <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS + 1549026100U, // <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 336380006U, // <u,2,u,2>: Cost 1 vspltisw2 LHS + 135536742U, // <u,2,u,3>: Cost 1 vmrglw LHS, LHS + 475286838U, // <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS + 1629141146U, // <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS + 1194108858U, // <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 1596806138U, // <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 135536747U, // <u,2,u,u>: Cost 1 vmrglw LHS, LHS + 1611890688U, // <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 538149020U, // <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685632685U, // <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2685632764U, // <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611891026U, // <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2733408722U, // <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7> + 2658612153U, // <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0> + 2289592250U, // <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7> + 538149533U, // <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS + 1189464214U, // <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 1611891508U, // <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611891606U, // <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 1189464476U, // <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 1189464578U, // <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2690278511U, // <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2690278607U, // <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 2287609786U, // <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7> + 1611892092U, // <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 2685634042U, // <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0> + 2685634079U, // <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1> + 1611892328U, // <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2> + 1611892390U, // <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 2685634371U, // <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5> + 2685634453U, // <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6> + 1611892666U, // <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 2300225466U, // <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7> + 1611892795U, // <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1> + 1209238422U, // <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 2282980247U, // <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1> + 1561004120U, // <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3> + 403488870U, // <u,3,3,3>: Cost 1 vspltisw3 LHS + 1209238426U, // <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 2282980899U, // <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5> + 2282985598U, // <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6> + 1209239482U, // <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 403488870U, // <u,3,3,u>: Cost 1 vspltisw3 LHS + 1555038310U, // <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS + 1555039616U, // <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4> + 2628781672U, // <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2> + 2289624690U, // <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3> + 1555041590U, 
// <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS + 538152246U, // <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS + 2658644925U, // <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4> + 2289625018U, // <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7> + 538152489U, // <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS + 1192446102U, // <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2733411983U, // <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1> + 2634762330U, // <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5> + 1192446364U, // <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1192446466U, // <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 1659670532U, // <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1659670626U, // <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0> + 2287642554U, // <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7> + 1659670788U, // <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0> + 2634768486U, // <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS + 2733412775U, // <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1> + 1648390659U, // <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3> + 2634770973U, // <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6> + 2634771766U, // <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS + 2733413099U, // <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1> + 1659671352U, // <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659671374U, // <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1652372457U, // <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3> + 1561034854U, // <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS + 2634777396U, // <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1> + 1561036892U, // <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7> + 1235814002U, // <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1561038134U, // <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS + 2309555747U, // <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5> + 2309556072U, // <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6> + 1235814330U, // <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1561040686U, // <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS + 1611896531U, // <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2> + 538154798U, // <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS + 1611896712U, // <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3> + 403488870U, // <u,3,u,3>: Cost 1 vspltisw3 LHS + 1611896895U, // <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6> + 538155162U, // <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS + 1611897040U, // <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7> + 1209280442U, // <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 538155365U, // <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS + 1165118354U, // <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1618534502U, // <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 2634795102U, // <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0> + 2686451968U, // <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4> + 2692276562U, // <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5> + 1705438098U, // <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1> + 2658685890U, // <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0> + 2256489928U, // <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0> + 1618535069U, // <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 1189464978U, // <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2692277044U, // <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1> + 1618535367U, // <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4> + 2640775992U, // <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1> + 1189465296U, // <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 115723574U, // <u,4,1,5>: Cost 1 vmrghw LHS, RHS + 2263207289U, // <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2> + 2664666780U, // <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1> + 115723817U, // <u,4,1,u>: Cost 1 vmrghw LHS, RHS + 2263919506U, // <u,4,2,0>: Cost 3 vmrghw 
<u,2,3,0>, <4,0,5,1> + 2222115812U, // <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2> + 2692277864U, // <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2> + 2692277926U, // <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1> + 2324114640U, // <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4> + 1190178102U, // <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS + 2692278202U, // <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7> + 2701568053U, // <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4> + 1190178345U, // <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS + 2692278422U, // <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2> + 2282981552U, // <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1> + 2704222585U, // <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4> + 2692278684U, // <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3> + 1257016528U, // <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1209239246U, // <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 2691910300U, // <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7> + 2664683166U, // <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3> + 1209239249U, // <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 1573027942U, // <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS + 2634826695U, // <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4> + 2634827874U, // <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4> + 2289629073U, // <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3> + 229035318U, // <u,4,4,4>: Cost 1 vspltisw0 RHS + 1618537782U, // <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS + 2658718662U, // <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4> + 2289629401U, // <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7> + 229035318U, // <u,4,4,u>: Cost 1 vspltisw0 RHS + 1561092198U, // <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS + 2628863370U, // <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5> + 1561094243U, // <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5> + 2634836118U, // <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2> + 1561095478U, // <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS + 118705462U, // <u,4,5,5>: Cost 1 vmrghw RHS, RHS + 604859702U, // <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS + 2658726906U, // <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2> + 604859720U, // <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS + 2266631058U, // <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1> + 2302692152U, // <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1> + 2718822906U, // <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3> + 2722804309U, // <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4> + 2723467942U, // <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4> + 1192889654U, // <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS + 2718823224U, // <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6> + 2718823246U, // <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1> + 1192889897U, // <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS + 2640822374U, // <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS + 2640823194U, // <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4> + 2728113373U, // <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4> + 2640825150U, // <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7> + 1235815632U, // <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235814094U, // <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 2730767905U, // <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4> + 2309556892U, // <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1235814097U, // <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u> + 1561116774U, // <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS + 1618540334U, // <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 1561118822U, // <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u> + 2692282300U, // <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1> + 229035318U, // <u,4,u,4>: 
Cost 1 vspltisw0 RHS + 120368438U, // <u,4,u,5>: Cost 1 vmrghw LHS, RHS + 604859945U, // <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS + 2309565084U, // <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 604859963U, // <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS + 2690293760U, // <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0> + 1616552038U, // <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS + 2640840434U, // <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5> + 2640841536U, // <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0> + 1613381970U, // <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5> + 2316135642U, // <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5> + 2289592834U, // <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6> + 2664732324U, // <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0> + 1616552661U, // <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5> + 1573077094U, // <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS + 1237536282U, // <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 2690294678U, // <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0> + 2646821014U, // <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2> + 1573080602U, // <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1> + 1189466116U, // <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1189466210U, // <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 2646823930U, // <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2> + 1573082926U, // <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS + 2640855142U, // <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS + 2697594448U, // <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5> + 2690295400U, // <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2> + 1625179890U, // <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5> + 2699585347U, // <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5> + 2781171471U, // <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3> + 2690295738U, // <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7> + 3775318070U, // <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5> + 1628498055U, // <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5> + 2287627234U, // <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1257016210U, // <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2646836942U, // <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5> + 2287625131U, // <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287627238U, // <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1257016538U, // <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1209240066U, // <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287625459U, // <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1209240068U, // <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 2640871526U, // <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS + 2316168082U, // <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1> + 2640873202U, // <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5> + 2640874308U, // <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4> + 1637788917U, // <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5> + 1616555318U, // <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS + 2287638591U, // <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6> + 2664765096U, // <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4> + 1616555561U, // <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS + 1573109862U, // <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS + 2646852404U, // <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1> + 2646853224U, // <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2> + 2287646618U, // <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3> + 1573113374U, // <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5> + 296144182U, // <u,5,5,5>: Cost 1 vspltisw1 RHS + 1192448098U, // <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 2287646946U, // <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7> + 296144182U, // 
<u,5,5,u>: Cost 1 vspltisw1 RHS + 1567146086U, // <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS + 2628945300U, // <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6> + 2634917997U, // <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6> + 1567148870U, // <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6> + 1567149366U, // <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS + 2781171799U, // <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7> + 1228950018U, // <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 2628952166U, // <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS + 1235815314U, // <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309556734U, // <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309555115U, // <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2628955446U, // <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS + 1235815642U, // <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235814914U, // <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309555443U, // <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235814916U, // <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 1567162470U, // <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS + 1616557870U, // <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS + 2690299781U, // <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0> + 1567165256U, // <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u> + 1567165750U, // <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS + 296144182U, // <u,5,u,5>: Cost 1 vspltisw1 RHS + 1209281026U, // <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2705563648U, // <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0> + 1631821926U, // <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS + 2262462970U, // <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3> + 2646886941U, // <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6> + 2705563986U, // <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5> + 2316062652U, // <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5> + 2316137272U, // <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6> + 1215851830U, // <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS + 1215851831U, // <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS + 2634948710U, // <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS + 2705564468U, // <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1> + 1189466618U, // <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2263208498U, // <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5> + 2693620843U, // <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6> + 2652868860U, // <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1> + 1189466936U, // <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1213869366U, // <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS + 1213869367U, // <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS + 2658844774U, // <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS + 3771344465U, // <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6> + 1178554874U, // <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2698929907U, // <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6> + 2699593540U, // <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6> + 2700257173U, // <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6> + 2705565626U, // <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7> + 1226485046U, // <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS + 1226485047U, // <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS + 2705565846U, // <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2> + 2330756585U, // <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1> + 2330756829U, // <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2> + 2282981734U, // <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 1631824413U, // <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6> + 
2652885246U, // <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3> + 1257018168U, // <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135499062U, // <u,6,3,7>: Cost 1 vmrglw LHS, RHS + 135499063U, // <u,6,3,u>: Cost 1 vmrglw LHS, RHS + 2646917222U, // <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS + 2217365931U, // <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5> + 2790167156U, // <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u> + 2646919709U, // <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6> + 2711538934U, // <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6> + 1631825206U, // <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS + 2316170040U, // <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6> + 1215884598U, // <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS + 1215884599U, // <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS + 2634981478U, // <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS + 2266190247U, // <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1192448506U, // <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2266190386U, // <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2634984758U, // <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS + 2652901632U, // <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5> + 1192448824U, // <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1213902134U, // <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS + 1213902135U, // <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS + 1583808614U, // <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 2322010445U, // <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1> + 2718839290U, // <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3> + 2670823965U, // <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6> + 1583811894U, // <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 2724147961U, // <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6> + 363253046U, // <u,6,6,6>: Cost 1 vspltisw2 RHS + 1229172022U, // <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS + 363253046U, // <u,6,6,u>: Cost 1 vspltisw2 RHS + 499458150U, // <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS + 1573200692U, // <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1573201512U, // <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1573202070U, // <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 499461673U, // <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS + 1573203972U, // <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1235817272U, // <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6> + 162073910U, // <u,6,7,7>: Cost 1 vmrglw RHS, RHS + 162073911U, // <u,6,7,u>: Cost 1 vmrglw RHS, RHS + 499466342U, // <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631827758U, // <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS + 1573209704U, // <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1573210262U, // <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 499469866U, // <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS + 1631828122U, // <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS + 363253046U, // <u,6,u,6>: Cost 1 vspltisw2 RHS + 135540022U, // <u,6,u,7>: Cost 1 vmrglw LHS, RHS + 135540023U, // <u,6,u,u>: Cost 1 vmrglw LHS, RHS + 1638465536U, // <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564723814U, // <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712207533U, // <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2712207612U, // <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0> + 1638465874U, // <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1579192580U, // <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0> + 2712207862U, // <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2316137282U, // <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7> + 564724381U, // <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS + 1189467130U, // <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 1638466356U, // <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1638466454U, // <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0> + 2311500282U, 
// <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3> + 1189467494U, // <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2712208495U, // <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2694956302U, // <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7> + 1189467756U, // <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1638466940U, // <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0> + 2712208829U, // <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 2712208927U, // <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1> + 1638467176U, // <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638467238U, // <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712209165U, // <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 2712209256U, // <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6> + 1627187175U, // <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7> + 2324116290U, // <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7> + 1628514441U, // <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7> + 1638467734U, // <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712209638U, // <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2700929387U, // <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u> + 1638467996U, // <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638468098U, // <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712210002U, // <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 1585189856U, // <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3> + 1257018178U, // <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1638468382U, // <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638468498U, // <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712210378U, // <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712210485U, // <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2712210564U, // <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0> + 1638468816U, // <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564727112U, // <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712210809U, // <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2> + 2712210888U, // <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0> + 564727337U, // <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS + 1192449018U, // <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2714201743U, // <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712211198U, // <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 2311533050U, // <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3> + 1192449382U, // <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 1638469636U, // <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1638469730U, // <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0> + 1192449644U, // <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1638469892U, // <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0> + 2712211745U, // <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2> + 2712211879U, // <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1> + 1638470138U, // <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 2712212018U, // <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5> + 2712212109U, // <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6> + 2712212203U, // <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1> + 1638470456U, // <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6> + 1638470478U, // <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 1638470559U, // <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1> + 1235816546U, // <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0> + 2309558371U, // <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1> + 2641045434U, // <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7> + 1235816954U, // <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3> + 1235816550U, // <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4> + 2309558375U, // <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5> + 1585222628U, // <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7> + 430361910U, // <u,7,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <u,7,7,u>: Cost 1 vspltisw3 RHS + 1638471379U, // 
<u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2> + 564729646U, // <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS + 1638471557U, // <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0> + 1638471612U, // <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1> + 1638471743U, // <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6> + 564730010U, // <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS + 1638471888U, // <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7> + 430361910U, // <u,7,u,7>: Cost 1 vspltisw3 RHS + 564730213U, // <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS + 202162278U, // <u,u,0,0>: Cost 1 vspltisw0 LHS + 538189985U, // <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685673645U, // <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 1215848604U, // <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS + 1611931986U, // <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 1579266317U, // <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0> + 2289592861U, // <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6> + 1215851848U, // <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS + 538190493U, // <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS + 1549411025U, // <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1> + 115726126U, // <u,u,1,1>: Cost 1 vmrghw LHS, LHS + 604862254U, // <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS + 1213866140U, // <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS + 1549413686U, // <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS + 115726490U, // <u,u,1,5>: Cost 1 vmrghw LHS, RHS + 1585247207U, // <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1> + 1213869384U, // <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS + 604862308U, // <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS + 1567334502U, // <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS + 1190180654U, // <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS + 336380006U, // <u,u,2,2>: Cost 1 vspltisw2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1567337782U, // <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS + 1190181018U, // <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS + 1611933626U, // <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 1226485064U, // <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 475685587U, // <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS + 1209239278U, // <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1> + 1209239765U, // <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2> + 135495836U, // <u,u,3,3>: Cost 1 vmrglw LHS, LHS + 475688246U, // <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS + 1209239282U, // <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5> + 1209240093U, // <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135499080U, // <u,u,3,7>: Cost 1 vmrglw LHS, RHS + 135495841U, // <u,u,3,u>: Cost 1 vmrglw LHS, LHS + 1555406950U, // <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS + 1555408301U, // <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4> + 2289625301U, // <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2> + 1215881372U, // <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS + 229035318U, // <u,u,4,4>: Cost 1 vspltisw0 RHS + 538193206U, // <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS + 2289625629U, // <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6> + 1215884616U, // <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS + 538193449U, // <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS + 1549443797U, // <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5> + 118708014U, // <u,u,5,1>: Cost 1 vmrghw RHS, LHS + 1561389191U, // <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5> + 1213898908U, // <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS + 1549446454U, // <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS + 118708378U, // <u,u,5,5>: Cost 1 vmrghw RHS, RHS + 604862618U, // <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS + 1213902152U, // <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS + 604862636U, // <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS + 1567367270U, // <u,u,6,0>: Cost 2 
vsldoi4 <3,u,u,6>, LHS + 1192892206U, // <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS + 1638478330U, // <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 1679046864U, // <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7> + 1567370550U, // <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS + 1192892570U, // <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS + 363253046U, // <u,u,6,6>: Cost 1 vspltisw2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 499605606U, // <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS + 1235812425U, // <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1> + 1561405577U, // <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7> + 162070684U, // <u,u,7,3>: Cost 1 vmrglw RHS, LHS + 499609147U, // <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS + 1235812753U, // <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5> + 1235814941U, // <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6> + 162073928U, // <u,u,7,7>: Cost 1 vmrglw RHS, RHS + 162070689U, // <u,u,7,u>: Cost 1 vmrglw RHS, LHS + 475726552U, // <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS + 538195758U, // <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS + 604862821U, // <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 475729206U, // <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS + 538196122U, // <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS + 604862861U, // <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp new file mode 100644 index 0000000..934bdf6 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -0,0 +1,1041 @@ +//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetRegisterInfo +// class. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCRegisterInfo.h"
+#include "PPC.h"
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "reginfo"
+
+#define GET_REGINFO_TARGET_DESC
+#include "PPCGenRegisterInfo.inc"
+
+static cl::opt<bool>
+EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
+         cl::desc("Enable use of a base pointer for complex stack frames"));
+
+static cl::opt<bool>
+AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false),
+         cl::desc("Force the use of a base pointer in every function"));
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
+  : PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
+                       TM.isPPC64() ? 0 : 1,
+                       TM.isPPC64() ? 0 : 1),
+    TM(TM) {
+  ImmToIdxMap[PPC::LD] = PPC::LDX;    ImmToIdxMap[PPC::STD] = PPC::STDX;
+  ImmToIdxMap[PPC::LBZ] = PPC::LBZX;  ImmToIdxMap[PPC::STB] = PPC::STBX;
+  ImmToIdxMap[PPC::LHZ] = PPC::LHZX;  ImmToIdxMap[PPC::LHA] = PPC::LHAX;
+  ImmToIdxMap[PPC::LWZ] = PPC::LWZX;  ImmToIdxMap[PPC::LWA] = PPC::LWAX;
+  ImmToIdxMap[PPC::LFS] = PPC::LFSX;  ImmToIdxMap[PPC::LFD] = PPC::LFDX;
+  ImmToIdxMap[PPC::STH] = PPC::STHX;  ImmToIdxMap[PPC::STW] = PPC::STWX;
+  ImmToIdxMap[PPC::STFS] = PPC::STFSX; ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+  ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+  ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
+
+  // 64-bit
+  ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
+  ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
+  ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
+  ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
+  ImmToIdxMap[PPC::ADDI8] = PPC::ADD8;
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *
+PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                       const {
+  // Note that PPCInstrInfo::FoldImmediate also directly uses this Kind value
+  // when it checks for ZERO folding.
+  if (Kind == 1) {
+    if (TM.isPPC64())
+      return &PPC::G8RC_NOX0RegClass;
+    return &PPC::GPRC_NOR0RegClass;
+  }
+
+  if (TM.isPPC64())
+    return &PPC::G8RCRegClass;
+  return &PPC::GPRCRegClass;
+}
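The ImmToIdxMap built in the constructor above pairs each D-form (base register plus signed 16-bit displacement) memory opcode with its X-form (base register plus index register) twin, so that a frame access whose final offset overflows the displacement field can be switched to the indexed encoding. A minimal stand-alone sketch of that lookup; the opcode numbers, fitsInSigned16, and selectOpcode below are illustrative stand-ins, not code from this file:

#include <cstdint>
#include <cstdio>
#include <map>

// Opcodes are plain ints here; in the real file they are PPC::LWZ, PPC::LWZX, ...
static bool fitsInSigned16(int64_t Offset) {
  return Offset >= -32768 && Offset <= 32767; // D-form displacement range
}

static unsigned selectOpcode(const std::map<unsigned, unsigned> &ImmToIdx,
                             unsigned DFormOp, int64_t Offset) {
  if (fitsInSigned16(Offset))
    return DFormOp;                           // keep the immediate form
  auto It = ImmToIdx.find(DFormOp);
  return It != ImmToIdx.end() ? It->second : DFormOp; // fall back if unmapped
}

int main() {
  enum { LWZ = 1, LWZX = 2 };                 // hypothetical opcode numbers
  std::map<unsigned, unsigned> ImmToIdx{{LWZ, LWZX}};
  std::printf("%u %u\n", selectOpcode(ImmToIdx, LWZ, 512),
              selectOpcode(ImmToIdx, LWZ, 70000)); // prints "1 2"
}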
+
+const MCPhysReg*
+PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) {
+    if (Subtarget.hasVSX())
+      return CSR_64_AllRegs_VSX_SaveList;
+    if (Subtarget.hasAltivec())
+      return CSR_64_AllRegs_Altivec_SaveList;
+    return CSR_64_AllRegs_SaveList;
+  }
+
+  if (Subtarget.isDarwinABI())
+    return TM.isPPC64()
+               ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_SaveList
+                                         : CSR_Darwin64_SaveList)
+               : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_SaveList
+                                         : CSR_Darwin32_SaveList);
+
+  // On PPC64, we might need to save r2 (but only if it is not reserved).
+  bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+
+  return TM.isPPC64()
+             ? (Subtarget.hasAltivec()
+                    ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
+                              : CSR_SVR464_Altivec_SaveList)
+                    : (SaveR2 ? CSR_SVR464_R2_SaveList : CSR_SVR464_SaveList))
+             : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_SaveList
+                                       : CSR_SVR432_SaveList);
+}
+
+const uint32_t *
+PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                      CallingConv::ID CC) const {
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  if (CC == CallingConv::AnyReg) {
+    if (Subtarget.hasVSX())
+      return CSR_64_AllRegs_VSX_RegMask;
+    if (Subtarget.hasAltivec())
+      return CSR_64_AllRegs_Altivec_RegMask;
+    return CSR_64_AllRegs_RegMask;
+  }
+
+  if (Subtarget.isDarwinABI())
+    return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_RegMask
+                                                  : CSR_Darwin64_RegMask)
+                        : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
+                                                  : CSR_Darwin32_RegMask);
+
+  return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
+                                                : CSR_SVR464_RegMask)
+                      : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
+                                                : CSR_SVR432_RegMask);
+}
+
+const uint32_t*
+PPCRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
+  for (unsigned PseudoReg : {PPC::ZERO, PPC::ZERO8, PPC::RM})
+    Mask[PseudoReg / 32] &= ~(1u << (PseudoReg % 32));
+}
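adjustStackMapLiveOutMask clears the pseudo-registers (ZERO, ZERO8, RM) out of a stackmap live-out mask; the mask is an array of 32-bit words in which register N is bit N % 32 of word N / 32. A small self-contained check of that indexing, with a made-up register number purely for illustration:

#include <cstdint>
#include <cstdio>

// Register N lives at bit (N % 32) of word (N / 32); clearing it marks the
// register as not live-out.
static void clearRegFromMask(uint32_t *Mask, unsigned Reg) {
  Mask[Reg / 32] &= ~(1u << (Reg % 32));
}

int main() {
  uint32_t Mask[4] = {~0u, ~0u, ~0u, ~0u};   // pretend 128 registers, all set
  clearRegFromMask(Mask, 70);                // hypothetical register number
  std::printf("word 2 = 0x%08x\n", Mask[2]); // bit 6 of word 2 is now clear
}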
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+
+  // The ZERO register is not really a register, but the representation of r0
+  // when used in instructions that treat r0 as the constant 0.
+  Reserved.set(PPC::ZERO);
+  Reserved.set(PPC::ZERO8);
+
+  // The FP register is also not really a register, but is the representation
+  // of the frame pointer register used by ISD::FRAMEADDR.
+  Reserved.set(PPC::FP);
+  Reserved.set(PPC::FP8);
+
+  // The BP register is also not really a register, but is the representation
+  // of the base pointer register used by setjmp.
+  Reserved.set(PPC::BP);
+  Reserved.set(PPC::BP8);
+
+  // The counter registers must be reserved so that counter-based loops can
+  // be correctly formed (and the mtctr instructions are not DCE'd).
+  Reserved.set(PPC::CTR);
+  Reserved.set(PPC::CTR8);
+
+  Reserved.set(PPC::R1);
+  Reserved.set(PPC::LR);
+  Reserved.set(PPC::LR8);
+  Reserved.set(PPC::RM);
+
+  if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
+    Reserved.set(PPC::VRSAVE);
+
+  // The SVR4 ABI reserves r2 and r13.
+  if (Subtarget.isSVR4ABI()) {
+    Reserved.set(PPC::R2);  // System-reserved register
+    Reserved.set(PPC::R13); // Small Data Area pointer register
+  }
+
+  // On PPC64, r13 is the thread pointer. Never allocate this register.
+  if (TM.isPPC64()) {
+    Reserved.set(PPC::R13);
+
+    Reserved.set(PPC::X1);
+    Reserved.set(PPC::X13);
+
+    if (TFI->needsFP(MF))
+      Reserved.set(PPC::X31);
+
+    if (hasBasePointer(MF))
+      Reserved.set(PPC::X30);
+
+    // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
+    if (Subtarget.isSVR4ABI()) {
+      // We only reserve r2 if we need to use the TOC pointer. If we have no
+      // explicit uses of the TOC pointer (meaning we're a leaf function with
+      // no constant-pool loads, etc.) and we have no potential uses inside an
+      // inline asm block, then we can treat r2 as an ordinary callee-saved
+      // register.
+      const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+      if (FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
+        Reserved.set(PPC::X2);
+      else
+        Reserved.reset(PPC::R2);
+    }
+  }
+
+  if (TFI->needsFP(MF))
+    Reserved.set(PPC::R31);
+
+  if (hasBasePointer(MF)) {
+    if (Subtarget.isSVR4ABI() && !TM.isPPC64() &&
+        TM.getRelocationModel() == Reloc::PIC_)
+      Reserved.set(PPC::R29);
+    else
+      Reserved.set(PPC::R30);
+  }
+
+  if (Subtarget.isSVR4ABI() && !TM.isPPC64() &&
+      TM.getRelocationModel() == Reloc::PIC_)
+    Reserved.set(PPC::R30);
+
+  // Reserve Altivec registers when Altivec is unavailable.
+  if (!Subtarget.hasAltivec())
+    for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
+         IE = PPC::VRRCRegClass.end(); I != IE; ++I)
+      Reserved.set(*I);
+
+  return Reserved;
+}
+
+unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                              MachineFunction &MF) const {
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+  const unsigned DefaultSafety = 1;
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case PPC::G8RC_NOX0RegClassID:
+  case PPC::GPRC_NOR0RegClassID:
+  case PPC::G8RCRegClassID:
+  case PPC::GPRCRegClassID: {
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
+    return 32 - FP - DefaultSafety;
+  }
+  case PPC::F8RCRegClassID:
+  case PPC::F4RCRegClassID:
+  case PPC::QFRCRegClassID:
+  case PPC::QSRCRegClassID:
+  case PPC::QBRCRegClassID:
+  case PPC::VRRCRegClassID:
+  case PPC::VFRCRegClassID:
+  case PPC::VSLRCRegClassID:
+  case PPC::VSHRCRegClassID:
+    return 32 - DefaultSafety;
+  case PPC::VSRCRegClassID:
+  case PPC::VSFRCRegClassID:
+  case PPC::VSSRCRegClassID:
+    return 64 - DefaultSafety;
+  case PPC::CRRCRegClassID:
+    return 8 - DefaultSafety;
+  }
+}
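The GPR cases of getRegPressureLimit return the 32 general-purpose registers, minus one when a frame pointer pins r31/x31, minus a fixed safety margin of one. A plain-C++ restatement of that arithmetic, purely illustrative and not code from this file:

#include <cstdio>

static unsigned gprPressureLimit(bool HasFP) {
  const unsigned NumGPRs = 32;     // GPR class size on PowerPC
  const unsigned DefaultSafety = 1; // headroom kept for the scavenger
  return NumGPRs - (HasFP ? 1 : 0) - DefaultSafety;
}

int main() {
  std::printf("no FP: %u, with FP: %u\n",
              gprPressureLimit(false), gprPressureLimit(true)); // 31, 30
}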
+
+const TargetRegisterClass *
+PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                           const MachineFunction &MF) const {
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  if (Subtarget.hasVSX()) {
+    // With VSX, we can inflate various sub-register classes to the full VSX
+    // register set.
+
+    if (RC == &PPC::F8RCRegClass)
+      return &PPC::VSFRCRegClass;
+    else if (RC == &PPC::VRRCRegClass)
+      return &PPC::VSRCRegClass;
+    else if (RC == &PPC::F4RCRegClass && Subtarget.hasP8Vector())
+      return &PPC::VSSRCRegClass;
+  }
+
+  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame. The sequence of code will be in the general form
+///
+///   addi   R0, SP, \#frameSize  ; get the address of the previous frame
+///   stwux  R0, SP, Rnegsize     ; add and update the SP with the negated size
+///   addi   Rnew, SP, \#maxCallFrameSize ; get the top of the allocation
+///
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  // Determine whether 64-bit pointers are used.
+  bool LP64 = TM.isPPC64();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Get the maximum call stack size.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  // Get the total frame size.
+  unsigned FrameSize = MFI->getStackSize();
+
+  // Get stack alignments.
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+  unsigned TargetAlign = TFI->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
+         "Maximum call-frame size not sufficiently aligned");
+
+  // Determine the previous frame's address. If FrameSize can't be
+  // represented as 16 bits or we need special alignment, then we load the
+  // previous frame's address from 0(SP). Why not do an addis of the hi?
+  // Because R0 is our only safe tmp register and addi/addis treat R0 as zero.
+  // Constructing the constant and adding would take 3 instructions.
+  // Fortunately, a frame greater than 32K is rare.
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+  if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+      .addReg(PPC::R31)
+      .addImm(FrameSize);
+  } else if (LP64) {
+    BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+      .addImm(0)
+      .addReg(PPC::X1);
+  } else {
+    BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
+      .addImm(0)
+      .addReg(PPC::R1);
+  }
+
+  bool KillNegSizeReg = MI.getOperand(1).isKill();
+  unsigned NegSizeReg = MI.getOperand(1).getReg();
+
+  // Grow the stack and update the stack pointer link, then determine the
+  // address of new allocated space.
+  if (LP64) {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
+    BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::X1)
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+      .addReg(PPC::X1)
+      .addImm(maxCallFrameSize);
+  } else {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
+    BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::R1)
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+      .addReg(PPC::R1)
+      .addImm(maxCallFrameSize);
+  }
+
+  // Discard the DYNALLOC instruction.
+  MBB.erase(II);
+}
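When MaxAlign exceeds the target stack alignment, the LI/AND pair above masks the negated allocation size with ~(MaxAlign-1); on two's-complement values that rounds the negative size toward minus infinity, i.e. it rounds the allocation's magnitude up to a multiple of MaxAlign. A stand-alone check of that identity (assuming MaxAlign is a power of two, as stack alignments are):

#include <cstdint>
#include <cstdio>

// NegSize & ~(MaxAlign-1) clears the low bits of a negative size, which in
// two's complement makes it *more* negative, i.e. the allocation grows to
// the next multiple of MaxAlign.
static int64_t roundNegSize(int64_t NegSize, uint64_t MaxAlign) {
  return NegSize & ~int64_t(MaxAlign - 1);
}

int main() {
  // Allocating 40 bytes with 16-byte dynamic alignment: -40 & ~15 == -48.
  std::printf("%lld\n", (long long)roundNegSize(-40, 16));
}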
+ BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg) + .addImm(~(MaxAlign-1)); + + unsigned NegSizeReg1 = NegSizeReg; + NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC); + BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg) + .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg)) + .addReg(NegSizeReg1, RegState::Kill); + KillNegSizeReg = true; + } + + BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1) + .addReg(Reg, RegState::Kill) + .addReg(PPC::X1) + .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg()) + .addReg(PPC::X1) + .addImm(maxCallFrameSize); + } else { + if (MaxAlign > TargetAlign) { + unsigned UnalNegSizeReg = NegSizeReg; + NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC); + + // Unfortunately, there is no andi, only andi., and we can't insert that + // here because we might clobber cr0 while it is live. + BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg) + .addImm(~(MaxAlign-1)); + + unsigned NegSizeReg1 = NegSizeReg; + NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC); + BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg) + .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg)) + .addReg(NegSizeReg1, RegState::Kill); + KillNegSizeReg = true; + } + + BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1) + .addReg(Reg, RegState::Kill) + .addReg(PPC::R1) + .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg()) + .addReg(PPC::R1) + .addImm(maxCallFrameSize); + } + + // Discard the DYNALLOC instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerDynamicAreaOffset( + MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + // Get the instruction info. + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + DebugLoc dl = MI.getDebugLoc(); + BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + .addImm(maxCallFrameSize); + MBB.erase(II); +} + +/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of +/// reserving a whole register (R0), we scrounge for one here. This generates +/// code like this: +/// +/// mfcr rA ; Move the conditional register into GPR rA. +/// rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot. +/// stw rA, FI ; Store rA to the frame. +/// +void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; SPILL_CR <SrcReg>, <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = TM.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + unsigned SrcReg = MI.getOperand(0).getReg(); + + // We need to store the CR in the low 4-bits of the saved value. 
First, issue + // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + // If the saved register wasn't CR0, shift the bits left so that they are in + // CR0's slot. + if (SrcReg != PPC::CR0) { + unsigned Reg1 = Reg; + Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + + // rlwinm rA, rA, ShiftBits, 0, 31. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) + .addReg(Reg1, RegState::Kill) + .addImm(getEncodingValue(SrcReg) * 4) + .addImm(0) + .addImm(31); + } + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW)) + .addReg(Reg, RegState::Kill), + FrameIndex); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CR <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = TM.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(MI.definesRegister(DestReg) && + "RESTORE_CR does not define its destination"); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ), + Reg), FrameIndex); + + // If the reloaded register isn't CR0, shift the bits right so that they are + // in the right CR's slot. + if (DestReg != PPC::CR0) { + unsigned Reg1 = Reg; + Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + + unsigned ShiftBits = getEncodingValue(DestReg)*4; + // rlwinm r11, r11, 32-ShiftBits, 0, 31. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) + .addReg(Reg1, RegState::Kill).addImm(32-ShiftBits).addImm(0) + .addImm(31); + } + + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), DestReg) + .addReg(Reg, RegState::Kill); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; SPILL_CRBIT <SrcReg>, <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = TM.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + unsigned SrcReg = MI.getOperand(0).getReg(); + + BuildMI(MBB, II, dl, TII.get(TargetOpcode::KILL), + getCRFromCRBit(SrcReg)) + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) + .addReg(getCRFromCRBit(SrcReg)); + + // If the saved register wasn't CR0LT, shift the bits left so that the bit to + // store is the first one. 
Mask all but that bit. + unsigned Reg1 = Reg; + Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + + // rlwinm rA, rA, ShiftBits, 0, 0. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) + .addReg(Reg1, RegState::Kill) + .addImm(getEncodingValue(SrcReg)) + .addImm(0).addImm(0); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW)) + .addReg(Reg, RegState::Kill), + FrameIndex); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CRBIT <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = TM.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(MI.definesRegister(DestReg) && + "RESTORE_CRBIT does not define its destination"); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ), + Reg), FrameIndex); + + BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg); + + unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO) + .addReg(getCRFromCRBit(DestReg)); + + unsigned ShiftBits = getEncodingValue(DestReg); + // rlwimi r11, r10, 32-ShiftBits, ..., ... + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO) + .addReg(RegO, RegState::Kill) + .addReg(Reg, RegState::Kill) + .addImm(ShiftBits ? 32 - ShiftBits : 0) + .addImm(ShiftBits) + .addImm(ShiftBits); + + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), + getCRFromCRBit(DestReg)) + .addReg(RegO, RegState::Kill) + // Make sure we have a use dependency all the way through this + // sequence of instructions. We can't have the other bits in the CR + // modified in between the mfocrf and the mtocrf. + .addReg(getCRFromCRBit(DestReg), RegState::Implicit); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; SPILL_VRSAVE <SrcReg>, <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); + unsigned SrcReg = MI.getOperand(0).getReg(); + + BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + addFrameReference( + BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill), + FrameIndex); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. 
+ MachineInstr &MI = *II; // ; <DestReg> = RESTORE_VRSAVE <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(MI.definesRegister(DestReg) && + "RESTORE_VRSAVE does not define its destination"); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ), + Reg), FrameIndex); + + BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg) + .addReg(Reg, RegState::Kill); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4 + // ABI, return true to prevent allocating an additional frame slot. + // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0 + // is arbitrary and will be subsequently ignored. For 32-bit, we have + // previously created the stack slot if needed, so return its FrameIdx. + if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) { + if (TM.isPPC64()) + FrameIdx = 0; + else { + const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + FrameIdx = FI->getCRSpillFrameIndex(); + } + return true; + } + return false; +} + +// Figure out if the offset in the instruction must be a multiple of 4. +// This is true for instructions like "STD". +static bool usesIXAddr(const MachineInstr &MI) { + unsigned OpC = MI.getOpcode(); + + switch (OpC) { + default: + return false; + case PPC::LWA: + case PPC::LWA_32: + case PPC::LD: + case PPC::STD: + return true; + } +} + +// Return the OffsetOperandNo given the FIOperandNum (and the instruction). +static unsigned getOffsetONFromFION(const MachineInstr &MI, + unsigned FIOperandNum) { + // Take into account whether it's an add or mem instruction + unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2; + if (MI.isInlineAsm()) + OffsetOperandNo = FIOperandNum - 1; + else if (MI.getOpcode() == TargetOpcode::STACKMAP || + MI.getOpcode() == TargetOpcode::PATCHPOINT) + OffsetOperandNo = FIOperandNum + 1; + + return OffsetOperandNo; +} + +void +PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + // Get the instruction info. + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc dl = MI.getDebugLoc(); + + unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum); + + // Get the frame index. + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + + // Get the frame pointer save index. Users of this index are primarily + // DYNALLOC instructions. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + int FPSI = FI->getFramePointerSaveIndex(); + // Get the instruction opcode. 
+  unsigned OpC = MI.getOpcode();
+
+  if (OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8) {
+    lowerDynamicAreaOffset(II);
+    return;
+  }
+
+  // Special case for dynamic alloca.
+  if (FPSI && FrameIndex == FPSI &&
+      (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+    lowerDynamicAlloc(II);
+    return;
+  }
+
+  // Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc.
+  if (OpC == PPC::SPILL_CR) {
+    lowerCRSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_CR) {
+    lowerCRRestore(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::SPILL_CRBIT) {
+    lowerCRBitSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_CRBIT) {
+    lowerCRBitRestore(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::SPILL_VRSAVE) {
+    lowerVRSAVESpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_VRSAVE) {
+    lowerVRSAVERestore(II, FrameIndex);
+    return;
+  }
+
+  // Replace the FrameIndex with the base register, GPR1 (SP) or GPR31 (FP).
+  MI.getOperand(FIOperandNum).ChangeToRegister(
+      FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false);
+
+  // Figure out if the offset in the instruction must be a multiple of 4
+  // (i.e., its encoding is shifted right two bits).
+  bool isIXAddr = usesIXAddr(MI);
+
+  // If the instruction is not present in ImmToIdxMap, then it has no immediate
+  // form (and must be r+r).
+  bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
+                   OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
+
+  // Now add the frame object offset to the offset from r1.
+  int Offset = MFI->getObjectOffset(FrameIndex);
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
+
+  // If we're not using a Frame Pointer that has been set to the value of the
+  // SP before having the stack size subtracted from it, then add the stack
+  // size to Offset to get the correct offset.
+  // Naked functions have stack size 0, although getStackSize may not reflect
+  // that because we didn't call all the pieces that compute it for naked
+  // functions.
+  if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
+    if (!(hasBasePointer(MF) && FrameIndex < 0))
+      Offset += MFI->getStackSize();
+  }
+
+  // If we can, encode the offset directly into the instruction. If this is a
+  // normal PPC "ri" instruction, any 16-bit value can be safely encoded. If
+  // this is a PPC64 "ix" instruction, only a 16-bit value with the low two
+  // bits clear can be encoded. This is extremely uncommon, because normally
+  // you only "std" to a stack slot that is at least 4-byte aligned, but it
+  // can happen in invalid code.
+  assert(OpC != PPC::DBG_VALUE &&
+         "This should be handled in a target-independent way");
+  if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) ||
+                     OpC == TargetOpcode::STACKMAP ||
+                     OpC == TargetOpcode::PATCHPOINT)) {
+    MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // The offset doesn't fit into the instruction's immediate field, so
+  // scavenge a register to build the offset in.
+
+  bool is64Bit = TM.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
+  unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC),
+           SReg = MF.getRegInfo().createVirtualRegister(RC);
+
+  // Insert a set of rA with the full offset value before the ld, st, or add.
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+    .addImm(Offset >> 16);
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
+    .addReg(SRegHi, RegState::Kill)
+    .addImm(Offset);
+
+  // Convert into indexed form of the instruction:
+  //
+  //   sth  0:rA, 1:imm, 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+  //   addi 0:rA, 1:rB,  2:imm  ==> add  0:rA, 1:rB, 2:r0
+  unsigned OperandBase;
+
+  if (noImmForm)
+    OperandBase = 1;
+  else if (OpC != TargetOpcode::INLINEASM) {
+    assert(ImmToIdxMap.count(OpC) &&
+           "No indexed form of load or store available!");
+    unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+    MI.setDesc(TII.get(NewOpcode));
+    OperandBase = 1;
+  } else {
+    OperandBase = OffsetOperandNo;
+  }
+
+  unsigned StackReg = MI.getOperand(FIOperandNum).getReg();
+  MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
+  MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
+}
+
+unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+
+  if (!TM.isPPC64())
+    return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
+  else
+    return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
+}
+
+unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  if (!hasBasePointer(MF))
+    return getFrameRegister(MF);
+
+  if (TM.isPPC64())
+    return PPC::X30;
+
+  if (Subtarget.isSVR4ABI() &&
+      TM.getRelocationModel() == Reloc::PIC_)
+    return PPC::R29;
+
+  return PPC::R30;
+}
+
+bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  if (!EnableBasePointer)
+    return false;
+  if (AlwaysBasePointer)
+    return true;
+
+  // If we need to realign the stack, then the stack pointer can no longer
+  // serve as an offset into the caller's stack space. As a result, we need a
+  // base pointer.
+  return needsStackRealignment(MF);
+}
+
+/// Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool PPCRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  assert(Offset < 0 && "Local offset must be negative");
+
+  // It's the load/store FI references that cause issues, as it can be
+  // difficult to materialize the offset if it won't fit in the literal field.
+  // Estimate based on the size of the local frame and some conservative
+  // assumptions about the rest of the stack frame (note, this is pre-regalloc,
+  // so we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores that have
+  // an r+i form. Return false for everything else.
+  unsigned OpC = MI->getOpcode();
+  if (!ImmToIdxMap.count(OpC))
+    return false;
+
+  // Don't generate a new virtual base register just to add zero to it.
+  if ((OpC == PPC::ADDI || OpC == PPC::ADDI8) &&
+      MI->getOperand(2).getImm() == 0)
+    return false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+  unsigned StackEst = TFI->determineFrameLayout(MF, false, true);
+
+  // If we likely don't need a stack frame, then we probably don't need a
+  // virtual base register either.
+  if (!StackEst)
+    return false;
+
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relative to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += StackEst;
+
+  // The frame pointer will point to the end of the stack, so estimate the
+  // offset as the difference between the object offset and the FP location.
+  return !isFrameOffsetLegal(MI, getBaseRegister(MF), Offset);
+}
+
+/// Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+void PPCRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  unsigned ADDriOpc = TM.isPPC64() ? PPC::ADDI8 : PPC::ADDI;
+
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL; // Defaults to "unknown".
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MachineFunction &MF = *MBB->getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset);
+}
+
+void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                        int64_t Offset) const {
+  unsigned FIOperandNum = 0;
+  while (!MI.getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false);
+  unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
+  MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const MCInstrDesc &MCID = MI.getDesc();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.constrainRegClass(BaseReg,
+                        TII.getRegClass(MCID, FIOperandNum, this, MF));
+}
+
+bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                         unsigned BaseReg,
+                                         int64_t Offset) const {
+  unsigned FIOperandNum = 0;
+  while (!MI->getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
+  Offset += MI->getOperand(OffsetOperandNo).getImm();
+
+  return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
+         MI->getOpcode() == TargetOpcode::STACKMAP ||
+         MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+         (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 0000000..b15fde8
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,144 @@
+//===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H +#define LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H + +#include "PPC.h" +#include "llvm/ADT/DenseMap.h" + +#define GET_REGINFO_HEADER +#include "PPCGenRegisterInfo.inc" + +namespace llvm { + +inline static unsigned getCRFromCRBit(unsigned SrcReg) { + unsigned Reg = 0; + if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT || + SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN) + Reg = PPC::CR0; + else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT || + SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN) + Reg = PPC::CR1; + else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT || + SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN) + Reg = PPC::CR2; + else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT || + SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN) + Reg = PPC::CR3; + else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT || + SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN) + Reg = PPC::CR4; + else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT || + SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN) + Reg = PPC::CR5; + else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT || + SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN) + Reg = PPC::CR6; + else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT || + SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN) + Reg = PPC::CR7; + + assert(Reg != 0 && "Invalid CR bit register"); + return Reg; +} + +class PPCRegisterInfo : public PPCGenRegisterInfo { + DenseMap<unsigned, unsigned> ImmToIdxMap; + const PPCTargetMachine &TM; + +public: + PPCRegisterInfo(const PPCTargetMachine &TM); + + /// getPointerRegClass - Return the register class to use to hold pointers. + /// This is used for addressing modes. + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; + + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; + + /// Code Generation virtual methods... + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const override; + const uint32_t *getNoPreservedMask() const override; + + void adjustStackMapLiveOutMask(uint32_t *Mask) const override; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + /// We require the register scavenger. 
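+  /// (When eliminateFrameIndex has to rebuild an out-of-range offset with a
+  /// lis/ori pair, the scavenger is what supplies the scratch GPR for that
+  /// sequence after register allocation.)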
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { + return true; + } + + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { + return true; + } + + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override { + return true; + } + + void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const; + void lowerCRSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerCRRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerCRBitSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerCRBitRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerVRSAVESpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerVRSAVERestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + + bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, + int &FrameIdx) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + // Support for virtual base registers. + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const override; + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + int64_t Offset) const override; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + + // Base pointer (stack realignment) support. + unsigned getBaseRegister(const MachineFunction &MF) const; + bool hasBasePointer(const MachineFunction &MF) const; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td new file mode 100644 index 0000000..e5f363c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -0,0 +1,363 @@ +//===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +let Namespace = "PPC" in { +def sub_lt : SubRegIndex<1>; +def sub_gt : SubRegIndex<1, 1>; +def sub_eq : SubRegIndex<1, 2>; +def sub_un : SubRegIndex<1, 3>; +def sub_32 : SubRegIndex<32>; +def sub_64 : SubRegIndex<64>; +def sub_128 : SubRegIndex<128>; +} + + +class PPCReg<string n> : Register<n> { + let Namespace = "PPC"; +} + +// We identify all our registers with a 5-bit ID, for consistency's sake. 
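+// (So r3, f3, and v3 all share HWEncoding 3; the 10-bit SPR numbers and the
+// extra VSX bit are layered on top of this common 5-bit core below.)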
+
+// GPR - One of the 32 32-bit general-purpose registers
class GPR<bits<5> num, string n> : PPCReg<n> {
  let HWEncoding{4-0} = num;
}
+def FP   : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
+def FP8  : GP8<FP, "**FRAME POINTER**">;
+
+// Representations of the base pointer used by setjmp.
+def BP   : GPR<0 /* arbitrary */, "**BASE POINTER**">;
+def BP8  : GP8<BP, "**BASE POINTER**">;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
+
+// Condition registers
+let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
+}
+
+// Link register
+def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
+
+// Count register
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
+
+// VRsave register
+def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
+
+// Carry bit. In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1); this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">, DwarfRegNum<[76]>;
+
+// FP rounding mode: bits 30 and 31 of the FP status and control register.
+// This is not allocated as a normal register; it appears only in Uses and
+// Defs. The ABI says it needs to be preserved by a function, but this is not
+// achieved by saving and restoring it as with most registers; it has to be
+// done in code. To make this work, all the return and call instructions are
+// described as Uses of RM, so instructions that do nothing but change RM
+// will not get deleted.
+def RM: PPCReg<"**ROUNDING MODE**">;
+
+/// Register classes
+// Allocate volatiles first, then nonvolatiles in reverse order, since
+// stmw/lmw save from rN to r31.
+def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
+                                                (sequence "R%u", 30, 13),
+                                                R31, R0, R1, FP, BP)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored,
+  // so put it at the end of the list.
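+  // ((sub GPRC, R2), R2) keeps the order above but moves r2 to the very end,
+  // so the allocator only reaches for it once every other GPR is in use.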
+ let AltOrders = [(add (sub GPRC, R2), R2)]; + let AltOrderSelect = [{ + const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); + return S.isPPC64() && S.isSVR4ABI(); + }]; +} + +def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12), + (sequence "X%u", 30, 14), + X31, X13, X0, X1, FP8, BP8)> { + // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so + // put it at the end of the list. + let AltOrders = [(add (sub G8RC, X2), X2)]; + let AltOrderSelect = [{ + const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); + return S.isPPC64() && S.isSVR4ABI(); + }]; +} + +// For some instructions r0 is special (representing the value 0 instead of +// the value in the r0 register), and we use these register subclasses to +// prevent r0 from being allocated for use by those instructions. +def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)> { + // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so + // put it at the end of the list. + let AltOrders = [(add (sub GPRC_NOR0, R2), R2)]; + let AltOrderSelect = [{ + const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); + return S.isPPC64() && S.isSVR4ABI(); + }]; +} + +def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> { + // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so + // put it at the end of the list. + let AltOrders = [(add (sub G8RC_NOX0, X2), X2)]; + let AltOrderSelect = [{ + const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>(); + return S.isPPC64() && S.isSVR4ABI(); + }]; +} + +// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4 +// ABI the size of the Floating-point register save area is determined by the +// allocated non-volatile register with the lowest register number, as FP +// register N is spilled to offset 8 * (32 - N) below the back chain word of the +// previous stack frame. By allocating non-volatiles in reverse order we make +// sure that the Floating-point register save area is always as small as +// possible because there aren't any unused spill slots. +def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13), + (sequence "F%u", 31, 14))>; +def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>; + +def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32], 128, + (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11, + V12, V13, V14, V15, V16, V17, V18, V19, V31, V30, + V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>; + +// VSX register classes (the allocation order mirrors that of the corresponding +// subregister classes). +def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128, + (add (sequence "VSL%u", 0, 13), + (sequence "VSL%u", 31, 14))>; +def VSHRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128, + (add VSH2, VSH3, VSH4, VSH5, VSH0, VSH1, VSH6, VSH7, + VSH8, VSH9, VSH10, VSH11, VSH12, VSH13, VSH14, + VSH15, VSH16, VSH17, VSH18, VSH19, VSH31, VSH30, + VSH29, VSH28, VSH27, VSH26, VSH25, VSH24, VSH23, + VSH22, VSH21, VSH20)>; +def VSRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128, + (add VSLRC, VSHRC)>; + +// Register classes for the 64-bit "scalar" VSX subregisters. 
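+// (VFRC alone covers only vs32-vs63, the halves that alias the Altivec
+// registers; VSFRC below unions it with F8RC to expose all 64 scalars.)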
+def VFRC : RegisterClass<"PPC", [f64], 64, + (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7, + VF8, VF9, VF10, VF11, VF12, VF13, VF14, + VF15, VF16, VF17, VF18, VF19, VF31, VF30, + VF29, VF28, VF27, VF26, VF25, VF24, VF23, + VF22, VF21, VF20)>; +def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>; + +// Register class for single precision scalars in VSX registers +def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>; + +// For QPX +def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13), + (sequence "QF%u", 31, 14))>; +def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>; +def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> { + // These are actually stored as floating-point values where a positive + // number is true and anything else (including NaN) is false. + let Size = 256; +} + +def CRBITRC : RegisterClass<"PPC", [i1], 32, + (add CR2LT, CR2GT, CR2EQ, CR2UN, + CR3LT, CR3GT, CR3EQ, CR3UN, + CR4LT, CR4GT, CR4EQ, CR4UN, + CR5LT, CR5GT, CR5EQ, CR5UN, + CR6LT, CR6GT, CR6EQ, CR6UN, + CR7LT, CR7GT, CR7EQ, CR7UN, + CR1LT, CR1GT, CR1EQ, CR1UN, + CR0LT, CR0GT, CR0EQ, CR0UN)> { + let Size = 32; +} + +def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6, + CR7, CR2, CR3, CR4)>; + +def CRRC0 : RegisterClass<"PPC", [i32], 32, (add CR0)>; + +// The CTR registers are not allocatable because they're used by the +// decrement-and-branch instructions, and thus need to stay live across +// multiple basic blocks. +def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> { + let isAllocatable = 0; +} +def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> { + let isAllocatable = 0; +} + +def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; +def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> { + let CopyCost = -1; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td new file mode 100644 index 0000000..d0954a1 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td @@ -0,0 +1,126 @@ +//===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for PowerPC +// +def IIC_IntSimple : InstrItinClass; +def IIC_IntGeneral : InstrItinClass; +def IIC_IntCompare : InstrItinClass; +def IIC_IntISEL : InstrItinClass; +def IIC_IntDivD : InstrItinClass; +def IIC_IntDivW : InstrItinClass; +def IIC_IntMFFS : InstrItinClass; +def IIC_IntMFVSCR : InstrItinClass; +def IIC_IntMTFSB0 : InstrItinClass; +def IIC_IntMTSRD : InstrItinClass; +def IIC_IntMulHD : InstrItinClass; +def IIC_IntMulHW : InstrItinClass; +def IIC_IntMulHWU : InstrItinClass; +def IIC_IntMulLI : InstrItinClass; +def IIC_IntRFID : InstrItinClass; +def IIC_IntRotateD : InstrItinClass; +def IIC_IntRotateDI : InstrItinClass; +def IIC_IntRotate : InstrItinClass; +def IIC_IntShift : InstrItinClass; +def IIC_IntTrapD : InstrItinClass; +def IIC_IntTrapW : InstrItinClass; +def IIC_BrB : InstrItinClass; +def IIC_BrCR : InstrItinClass; +def IIC_BrMCR : InstrItinClass; +def IIC_BrMCRX : InstrItinClass; +def IIC_LdStDCBA : InstrItinClass; +def IIC_LdStDCBF : InstrItinClass; +def IIC_LdStDCBI : InstrItinClass; +def IIC_LdStLoad : InstrItinClass; +def IIC_LdStLoadUpd : InstrItinClass; +def IIC_LdStLoadUpdX : InstrItinClass; +def IIC_LdStStore : InstrItinClass; +def IIC_LdStStoreUpd : InstrItinClass; +def IIC_LdStDSS : InstrItinClass; +def IIC_LdStICBI : InstrItinClass; +def IIC_LdStLD : InstrItinClass; +def IIC_LdStLDU : InstrItinClass; +def IIC_LdStLDUX : InstrItinClass; +def IIC_LdStLDARX : InstrItinClass; +def IIC_LdStLFD : InstrItinClass; +def IIC_LdStLFDU : InstrItinClass; +def IIC_LdStLFDUX : InstrItinClass; +def IIC_LdStLHA : InstrItinClass; +def IIC_LdStLHAU : InstrItinClass; +def IIC_LdStLHAUX : InstrItinClass; +def IIC_LdStLMW : InstrItinClass; +def IIC_LdStLVecX : InstrItinClass; +def IIC_LdStLWA : InstrItinClass; +def IIC_LdStLWARX : InstrItinClass; +def IIC_LdStSLBIA : InstrItinClass; +def IIC_LdStSLBIE : InstrItinClass; +def IIC_LdStSTD : InstrItinClass; +def IIC_LdStSTDCX : InstrItinClass; +def IIC_LdStSTDU : InstrItinClass; +def IIC_LdStSTDUX : InstrItinClass; +def IIC_LdStSTFD : InstrItinClass; +def IIC_LdStSTFDU : InstrItinClass; +def IIC_LdStSTVEBX : InstrItinClass; +def IIC_LdStSTWCX : InstrItinClass; +def IIC_LdStSync : InstrItinClass; +def IIC_SprISYNC : InstrItinClass; +def IIC_SprMFSR : InstrItinClass; +def IIC_SprMTMSR : InstrItinClass; +def IIC_SprMTSR : InstrItinClass; +def IIC_SprTLBSYNC : InstrItinClass; +def IIC_SprMFCR : InstrItinClass; +def IIC_SprMFCRF : InstrItinClass; +def IIC_SprMFMSR : InstrItinClass; +def IIC_SprMFSPR : InstrItinClass; +def IIC_SprMFTB : InstrItinClass; +def IIC_SprMTSPR : InstrItinClass; +def IIC_SprMTSRIN : InstrItinClass; +def IIC_SprRFI : InstrItinClass; +def IIC_SprSC : InstrItinClass; +def IIC_FPGeneral : InstrItinClass; +def IIC_FPAddSub : InstrItinClass; +def IIC_FPCompare : InstrItinClass; +def IIC_FPDivD : InstrItinClass; +def IIC_FPDivS : InstrItinClass; +def IIC_FPFused : InstrItinClass; +def IIC_FPRes : InstrItinClass; +def IIC_FPSqrtD : InstrItinClass; +def IIC_FPSqrtS : InstrItinClass; +def IIC_VecGeneral : InstrItinClass; +def IIC_VecFP : InstrItinClass; +def IIC_VecFPCompare : InstrItinClass; +def IIC_VecComplex : InstrItinClass; +def IIC_VecPerm : InstrItinClass; +def IIC_VecFPRound : InstrItinClass; +def IIC_VecVSL : InstrItinClass; +def IIC_VecVSR : InstrItinClass; +def IIC_SprMTMSRD : InstrItinClass; +def IIC_SprSLIE 
: InstrItinClass; +def IIC_SprSLBIE : InstrItinClass; +def IIC_SprSLBMTE : InstrItinClass; +def IIC_SprSLBMFEE : InstrItinClass; +def IIC_SprSLBIA : InstrItinClass; +def IIC_SprTLBIA : InstrItinClass; +def IIC_SprTLBIEL : InstrItinClass; +def IIC_SprTLBIE : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +include "PPCScheduleG3.td" +include "PPCSchedule440.td" +include "PPCScheduleG4.td" +include "PPCScheduleG4Plus.td" +include "PPCScheduleG5.td" +include "PPCScheduleP7.td" +include "PPCScheduleP8.td" +include "PPCScheduleA2.td" +include "PPCScheduleE500mc.td" +include "PPCScheduleE5500.td" diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td new file mode 100644 index 0000000..04a43bc --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td @@ -0,0 +1,607 @@ +//===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Primary reference: +// PowerPC 440x6 Embedded Processor Core User's Manual. +// IBM (as updated in) 2010. + +// The basic PPC 440 does not include a floating-point unit; the pipeline +// timings here are constructed to match the FP2 unit shipped with the +// PPC-440- and PPC-450-based Blue Gene (L and P) supercomputers. +// References: +// S. Chatterjee, et al. Design and exploitation of a high-performance +// SIMD floating-point unit for Blue Gene/L. +// IBM J. Res. & Dev. 49 (2/3) March/May 2005. +// also: +// Carlos Sosa and Brant Knudson. IBM System Blue Gene Solution: +// Blue Gene/P Application Development. +// IBM (as updated in) 2009. 
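+
+// Roughly speaking, these definitions are consumed at compiler-build time:
+// TableGen's subtarget emitter lowers the ProcessorItineraries below into
+// static tables in the generated PPCGenSubtargetInfo.inc, which the
+// instruction scheduler then consults; the FuncUnit and InstrStage records
+// never exist at run time as such.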
+ +//===----------------------------------------------------------------------===// +// Functional units on the PowerPC 440/450 chip sets +// +def P440_DISS1 : FuncUnit; // Issue unit 1 +def P440_DISS2 : FuncUnit; // Issue unit 2 +def P440_LRACC : FuncUnit; // Register access and dispatch for + // the simple integer (J-pipe) and + // load/store (L-pipe) pipelines +def P440_IRACC : FuncUnit; // Register access and dispatch for + // the complex integer (I-pipe) pipeline +def P440_FRACC : FuncUnit; // Register access and dispatch for + // the floating-point execution (F-pipe) pipeline +def P440_IEXE1 : FuncUnit; // Execution stage 1 for the I pipeline +def P440_IEXE2 : FuncUnit; // Execution stage 2 for the I pipeline +def P440_IWB : FuncUnit; // Write-back unit for the I pipeline +def P440_JEXE1 : FuncUnit; // Execution stage 1 for the J pipeline +def P440_JEXE2 : FuncUnit; // Execution stage 2 for the J pipeline +def P440_JWB : FuncUnit; // Write-back unit for the J pipeline +def P440_AGEN : FuncUnit; // Address generation for the L pipeline +def P440_CRD : FuncUnit; // D-cache access for the L pipeline +def P440_LWB : FuncUnit; // Write-back unit for the L pipeline +def P440_FEXE1 : FuncUnit; // Execution stage 1 for the F pipeline +def P440_FEXE2 : FuncUnit; // Execution stage 2 for the F pipeline +def P440_FEXE3 : FuncUnit; // Execution stage 3 for the F pipeline +def P440_FEXE4 : FuncUnit; // Execution stage 4 for the F pipeline +def P440_FEXE5 : FuncUnit; // Execution stage 5 for the F pipeline +def P440_FEXE6 : FuncUnit; // Execution stage 6 for the F pipeline +def P440_FWB : FuncUnit; // Write-back unit for the F pipeline + +def P440_LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used + // to make sure that no lwarx/stwcx. + // instructions are issued while another + // lwarx/stwcx. is in the L pipe. + +def P440_GPR_Bypass : Bypass; // The bypass for general-purpose regs. +def P440_FPR_Bypass : Bypass; // The bypass for floating-point regs. + +// Notes: +// Instructions are held in the FRACC, LRACC and IRACC pipeline +// stages until their source operands become ready. Exceptions: +// - Store instructions will hold in the AGEN stage +// - The integer multiply-accumulate instruction will hold in +// the IEXE1 stage +// +// For most I-pipe operations, the result is available at the end of +// the IEXE1 stage. Operations such as multiply and divide must +// continue to execute in IEXE2 and IWB. Divide resides in IWB for +// 33 cycles (multiply also calculates its result in IWB). For all +// J-pipe instructions, the result is available +// at the end of the JEXE1 stage. Loads have a 3-cycle latency +// (data is not available until after the LWB stage). +// +// The L1 cache hit latency is four cycles for floating point loads +// and three cycles for integer loads. +// +// The stwcx. instruction requires both the LRACC and the IRACC +// dispatch stages. It must be issued from DISS0. +// +// All lwarx/stwcx. instructions hold in LRACC if another +// uncommitted lwarx/stwcx. is in AGEN, CRD, or LWB. +// +// msync (a.k.a. sync) and mbar will hold in LWB until all load/store +// resources are empty. AGEN and CRD are held empty until the msync/mbar +// commits. +// +// Most floating-point instructions, computational and move, +// have a 5-cycle latency. Divide takes longer (30 cycles). Instructions that +// update the CR take 2 cycles. Stores take 3 cycles and, as mentioned above, +// loads take 4 cycles (for L1 hit). 
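+//
+// As a worked example of reading the entries below, IIC_LdStLoad occupies
+// one issue slot, then LRACC, AGEN, CRD, and two cycles of LWB; its operand
+// cycle list [5, 1, 1] marks the loaded result ready at cycle 5 while both
+// address operands are read at cycle 1, matching the load latency described
+// above.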
+ +// +// This file defines the itinerary class data for the PPC 440 processor. +// +//===----------------------------------------------------------------------===// + + +def PPC440Itineraries : ProcessorItineraries< + [P440_DISS1, P440_DISS2, P440_FRACC, P440_IRACC, P440_IEXE1, P440_IEXE2, + P440_IWB, P440_LRACC, P440_JEXE1, P440_JEXE2, P440_JWB, P440_AGEN, P440_CRD, + P440_LWB, P440_FEXE1, P440_FEXE2, P440_FEXE3, P440_FEXE4, P440_FEXE5, + P440_FEXE6, P440_FWB, P440_LWARX_Hold], + [P440_GPR_Bypass, P440_FPR_Bypass], [ + InstrItinData<IIC_IntSimple, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntISEL, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass, NoBypass]>, + InstrItinData<IIC_IntCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntDivW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<33, [P440_IWB]>], + [36, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMFFS, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [3, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [3, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMulHW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMulHWU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMulLI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntRotate, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + 
InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntShift, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntTrapW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [2, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrB, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrMCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrMCRX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBA, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBF, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLoad, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 1, 1], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 2, 1, 1], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 2, 1, 1], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStStore, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [1, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [2, 1, 1, 1], + 
[NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStICBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFD, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [2, 1, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLFD, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [5, 2, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [5, 2, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLHA, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLMW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P440_DISS1]>, + InstrStage<1, [P440_IRACC], 0>, + InstrStage<4, [P440_LWARX_Hold], 0>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTD, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [2, 1, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [2, 1, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDCX, 
[InstrStage<1, [P440_DISS1]>, + InstrStage<1, [P440_IRACC], 0>, + InstrStage<4, [P440_LWARX_Hold], 0>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [P440_DISS1]>, + InstrStage<1, [P440_IRACC], 0>, + InstrStage<4, [P440_LWARX_Hold], 0>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSync, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<3, [P440_AGEN], 1>, + InstrStage<2, [P440_CRD], 1>, + InstrStage<1, [P440_LWB]>]>, + InstrItinData<IIC_SprISYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC], 0>, + InstrStage<1, [P440_LRACC], 0>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_FEXE1], 0>, + InstrStage<1, [P440_AGEN], 0>, + InstrStage<1, [P440_JEXE1], 0>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_FEXE2], 0>, + InstrStage<1, [P440_CRD], 0>, + InstrStage<1, [P440_JEXE2], 0>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<6, [P440_FEXE3], 0>, + InstrStage<6, [P440_LWB], 0>, + InstrStage<6, [P440_JWB], 0>, + InstrStage<6, [P440_IWB]>]>, + InstrItinData<IIC_SprMFSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [2, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [2, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [5, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>]>, + InstrItinData<IIC_SprMFCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMFMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [3, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMFSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMFTB, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + 
InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprRFI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprSC, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_FPGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0], + [P440_FPR_Bypass, + P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPAddSub, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0], + [P440_FPR_Bypass, + P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0], + [P440_FPR_Bypass, P440_FPR_Bypass, + P440_FPR_Bypass]>, + InstrItinData<IIC_FPDivD, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<25, [P440_FWB]>], + [31, 0, 0], + [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPDivS, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<13, [P440_FWB]>], + [19, 0, 0], + [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPFused, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0, 0], + [P440_FPR_Bypass, + P440_FPR_Bypass, P440_FPR_Bypass, + P440_FPR_Bypass]>, + InstrItinData<IIC_FPRes, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0], + [P440_FPR_Bypass, P440_FPR_Bypass]> +]>; + +// ===---------------------------------------------------------------------===// +// PPC440 machine model for scheduling and other instruction cost heuristics. 
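+//
+// How to read the itinerary entries above (editorial gloss, not taken from
+// the 440 manual): each InstrItinData pairs an itinerary class with the
+// pipeline stages an instruction of that class occupies, an optional list
+// of operand-ready cycles (definitions first, then uses), and an optional
+// per-operand bypass list. For IIC_LdStLoad, for example, the operand
+// cycles [5, 1, 1] say the loaded value is ready five cycles after issue,
+// which matches the LoadLatency chosen below.
+//
+// A subtarget binds these itineraries through the machine model; a rough,
+// hypothetical sketch follows (the real processor table lives in PPC.td,
+// and the feature list here is illustrative only):
+//
+//   def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL]>;
+//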
+ +def PPC440Model : SchedMachineModel { + let IssueWidth = 2; // 2 instructions are dispatched per cycle. + let MinLatency = -1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 5; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + + let Itineraries = PPC440Itineraries; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td new file mode 100644 index 0000000..21a357a --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td @@ -0,0 +1,171 @@ +//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Primary reference: +// A2 Processor User's Manual. +// IBM (as updated in 2010). + +//===----------------------------------------------------------------------===// +// Functional units on the PowerPC A2 chip sets +// +def A2_XU : FuncUnit; // A2_XU pipeline +def A2_FU : FuncUnit; // A2_FU pipeline + +// +// This file defines the itinerary class data for the PPC A2 processor. +// +//===----------------------------------------------------------------------===// + + +def PPCA2Itineraries : ProcessorItineraries< + [A2_XU, A2_FU], [], [ + InstrItinData<IIC_IntSimple, [InstrStage<1, [A2_XU]>], + [1, 0, 0]>, + InstrItinData<IIC_IntGeneral, [InstrStage<1, [A2_XU]>], + [2, 0, 0]>, + InstrItinData<IIC_IntISEL, [InstrStage<1, [A2_XU]>], + [2, 0, 0, 0]>, + InstrItinData<IIC_IntCompare, [InstrStage<1, [A2_XU]>], + [2, 0, 0]>, + InstrItinData<IIC_IntDivW, [InstrStage<1, [A2_XU]>], + [39, 0, 0]>, + InstrItinData<IIC_IntDivD, [InstrStage<1, [A2_XU]>], + [71, 0, 0]>, + InstrItinData<IIC_IntMulHW, [InstrStage<1, [A2_XU]>], + [5, 0, 0]>, + InstrItinData<IIC_IntMulHWU, [InstrStage<1, [A2_XU]>], + [5, 0, 0]>, + InstrItinData<IIC_IntMulLI, [InstrStage<1, [A2_XU]>], + [6, 0, 0]>, + InstrItinData<IIC_IntRotate, [InstrStage<1, [A2_XU]>], + [2, 0, 0]>, + InstrItinData<IIC_IntRotateD, [InstrStage<1, [A2_XU]>], + [2, 0, 0]>, + InstrItinData<IIC_IntRotateDI, [InstrStage<1, [A2_XU]>], + [2, 0, 0]>, + InstrItinData<IIC_IntShift, [InstrStage<1, [A2_XU]>], + [2, 0, 0]>, + InstrItinData<IIC_IntTrapW, [InstrStage<1, [A2_XU]>], + [2, 0]>, + InstrItinData<IIC_IntTrapD, [InstrStage<1, [A2_XU]>], + [2, 0]>, + InstrItinData<IIC_BrB, [InstrStage<1, [A2_XU]>], + [6, 0, 0]>, + InstrItinData<IIC_BrCR, [InstrStage<1, [A2_XU]>], + [1, 0, 0]>, + InstrItinData<IIC_BrMCR, [InstrStage<1, [A2_XU]>], + [5, 0, 0]>, + InstrItinData<IIC_BrMCRX, [InstrStage<1, [A2_XU]>], + [1, 0, 0]>, + InstrItinData<IIC_LdStDCBA, [InstrStage<1, [A2_XU]>], + [1, 0, 0]>, + InstrItinData<IIC_LdStDCBF, [InstrStage<1, [A2_XU]>], + [1, 0, 0]>, + InstrItinData<IIC_LdStDCBI, [InstrStage<1, [A2_XU]>], + [1, 0, 0]>, + InstrItinData<IIC_LdStLoad, [InstrStage<1, [A2_XU]>], + [6, 0, 0]>, + InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [A2_XU]>], + [6, 8, 0, 0]>, + InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [A2_XU]>], + [6, 8, 0, 0]>, + InstrItinData<IIC_LdStLDU, [InstrStage<1, [A2_XU]>], + [6, 0, 0]>, + InstrItinData<IIC_LdStLDUX, [InstrStage<1, [A2_XU]>], + [6, 0, 0]>, + InstrItinData<IIC_LdStStore, [InstrStage<1, [A2_XU]>], + [0, 0, 0]>, + InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [A2_XU]>], + [2, 0, 0, 0]>, + 
InstrItinData<IIC_LdStICBI, [InstrStage<1, [A2_XU]>], + [16, 0, 0]>, + InstrItinData<IIC_LdStSTFD, [InstrStage<1, [A2_XU]>], + [0, 0, 0]>, + InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [A2_XU]>], + [2, 0, 0, 0]>, + InstrItinData<IIC_LdStLFD, [InstrStage<1, [A2_XU]>], + [7, 0, 0]>, + InstrItinData<IIC_LdStLFDU, [InstrStage<1, [A2_XU]>], + [7, 9, 0, 0]>, + InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [A2_XU]>], + [7, 9, 0, 0]>, + InstrItinData<IIC_LdStLHA, [InstrStage<1, [A2_XU]>], + [6, 0, 0]>, + InstrItinData<IIC_LdStLHAU, [InstrStage<1, [A2_XU]>], + [6, 8, 0, 0]>, + InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [A2_XU]>], + [6, 8, 0, 0]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [A2_XU]>], + [82, 0, 0]>, // L2 latency + InstrItinData<IIC_LdStSTD, [InstrStage<1, [A2_XU]>], + [0, 0, 0]>, + InstrItinData<IIC_LdStSTDU, [InstrStage<1, [A2_XU]>], + [2, 0, 0, 0]>, + InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [A2_XU]>], + [2, 0, 0, 0]>, + InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [A2_XU]>], + [82, 0, 0]>, // L2 latency + InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [A2_XU]>], + [82, 0, 0]>, // L2 latency + InstrItinData<IIC_LdStSync, [InstrStage<1, [A2_XU]>], + [6]>, + InstrItinData<IIC_SprISYNC, [InstrStage<1, [A2_XU]>], + [16]>, + InstrItinData<IIC_SprMTMSR, [InstrStage<1, [A2_XU]>], + [16, 0]>, + InstrItinData<IIC_SprMFCR, [InstrStage<1, [A2_XU]>], + [6, 0]>, + InstrItinData<IIC_SprMFCRF, [InstrStage<1, [A2_XU]>], + [1, 0]>, + InstrItinData<IIC_SprMFMSR, [InstrStage<1, [A2_XU]>], + [4, 0]>, + InstrItinData<IIC_SprMFSPR, [InstrStage<1, [A2_XU]>], + [6, 0]>, + InstrItinData<IIC_SprMFTB, [InstrStage<1, [A2_XU]>], + [4, 0]>, + InstrItinData<IIC_SprMTSPR, [InstrStage<1, [A2_XU]>], + [6, 0]>, + InstrItinData<IIC_SprRFI, [InstrStage<1, [A2_XU]>], + [16]>, + InstrItinData<IIC_SprSC, [InstrStage<1, [A2_XU]>], + [16]>, + InstrItinData<IIC_FPGeneral, [InstrStage<1, [A2_FU]>], + [6, 0, 0]>, + InstrItinData<IIC_FPAddSub, [InstrStage<1, [A2_FU]>], + [6, 0, 0]>, + InstrItinData<IIC_FPCompare, [InstrStage<1, [A2_FU]>], + [5, 0, 0]>, + InstrItinData<IIC_FPDivD, [InstrStage<1, [A2_FU]>], + [72, 0, 0]>, + InstrItinData<IIC_FPDivS, [InstrStage<1, [A2_FU]>], + [59, 0, 0]>, + InstrItinData<IIC_FPSqrtD, [InstrStage<1, [A2_FU]>], + [69, 0, 0]>, + InstrItinData<IIC_FPSqrtS, [InstrStage<1, [A2_FU]>], + [65, 0, 0]>, + InstrItinData<IIC_FPFused, [InstrStage<1, [A2_FU]>], + [6, 0, 0, 0]>, + InstrItinData<IIC_FPRes, [InstrStage<1, [A2_FU]>], + [6, 0]> +]>; + +// ===---------------------------------------------------------------------===// +// A2 machine model for scheduling and other instruction cost heuristics. + +def PPCA2Model : SchedMachineModel { + let IssueWidth = 1; // 1 instruction is dispatched per cycle. + let MinLatency = -1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 6; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 13; + + let Itineraries = PPCA2Itineraries; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td new file mode 100644 index 0000000..36b8517 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td @@ -0,0 +1,320 @@ +//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Freescale e500mc 32-bit +// Power processor. +// +// All information is derived from the "e500mc Core Reference Manual", +// Freescale Document Number E500MCRM, Rev. 1, 03/2012. +// +//===----------------------------------------------------------------------===// +// Relevant functional units in the Freescale e500mc core: +// +// * Decode & Dispatch +// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue +// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ). +def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1 +def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2 + +// * Execute +// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX. +// Some instructions can only execute in SFX0 but not SFX1. +// The CFX has a bypass path, allowing non-divide instructions to execute +// while a divide instruction is executed. +def E500_SFX0 : FuncUnit; // Simple unit 0 +def E500_SFX1 : FuncUnit; // Simple unit 1 +def E500_BU : FuncUnit; // Branch unit +def E500_CFX_DivBypass + : FuncUnit; // CFX divide bypass path +def E500_CFX_0 : FuncUnit; // CFX pipeline +def E500_LSU_0 : FuncUnit; // LSU pipeline +def E500_FPU_0 : FuncUnit; // FPU pipeline + +def E500_GPR_Bypass : Bypass; +def E500_FPR_Bypass : Bypass; +def E500_CR_Bypass : Bypass; + +def PPCE500mcItineraries : ProcessorItineraries< + [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass, + E500_CFX_0, E500_LSU_0, E500_FPU_0], + [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [ + InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntISEL, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass, + E500_CR_Bypass]>, + InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [5, 1, 1], // Latency = 1 or 2 + [E500_CR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_CFX_0], 0>, + InstrStage<14, [E500_CFX_DivBypass]>], + [17, 1, 1], // Latency=4..35, Repeat= 4..35 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<8, [E500_FPU_0]>], + [11], // Latency = 8 + [E500_FPR_Bypass]>, + InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<8, [E500_FPU_0]>], + [11, 1, 1], // Latency = 8 + [NoBypass, NoBypass, NoBypass]>, + InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + 
InstrStage<1, [E500_CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<2, [E500_SFX0]>], + [5, 1], // Latency = 2, Repeat rate = 2 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_BU]>], + [4, 1], // Latency = 1 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_BU]>], + [4, 1, 1], // Latency = 1 + [E500_CR_Bypass, + E500_CR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_BU]>], + [4, 1], // Latency = 1 + [E500_CR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_CR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3, Repeat rate = 1 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1, 1], // Latency = 3 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1, 1], // Latency = 3 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 
micro-ops + InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1, 1], // Latency = 4 + [E500_FPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1, 1], // Latency = 4 + [E500_FPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1, 1], // Latency = 4 + [E500_FPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1], // Latency = r+3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<3, [E500_LSU_0]>], + [6, 1, 1], // Latency = 3, Repeat rate = 3 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>]>, + InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_SFX0]>], + [7, 1], + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<2, [E500_SFX0, E500_SFX1]>], + [5, 1], // Latency = 2, Repeat rate = 4 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0]>], + [5, 1], + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0], 0>]>, + InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<5, [E500_SFX0]>], + [8, 1], + [E500_GPR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<5, [E500_SFX0]>], + [8, 1], + [E500_GPR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_SFX0]>], + [7, 1], // Latency = 4, Repeat rate = 4 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1], // Latency = 1, Repeat rate = 1 + [E500_GPR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_SFX0]>], + [7, 1], // Latency = 4, Repeat rate = 4 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1], // Latency = 1, 
Repeat rate = 1 + [E500_CR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0]>], + [4, 1], + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<2, [E500_FPU_0]>], + [11, 1, 1], // Latency = 8, Repeat rate = 2 + [E500_FPR_Bypass, + E500_FPR_Bypass, E500_FPR_Bypass]>, + InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_FPU_0]>], + [13, 1, 1], // Latency = 10, Repeat rate = 4 + [E500_FPR_Bypass, + E500_FPR_Bypass, E500_FPR_Bypass]>, + InstrItinData<IIC_FPCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<2, [E500_FPU_0]>], + [11, 1, 1], // Latency = 8, Repeat rate = 2 + [E500_CR_Bypass, + E500_FPR_Bypass, E500_FPR_Bypass]>, + InstrItinData<IIC_FPDivD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<68, [E500_FPU_0]>], + [71, 1, 1], // Latency = 68, Repeat rate = 68 + [E500_FPR_Bypass, + E500_FPR_Bypass, E500_FPR_Bypass]>, + InstrItinData<IIC_FPDivS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<38, [E500_FPU_0]>], + [41, 1, 1], // Latency = 38, Repeat rate = 38 + [E500_FPR_Bypass, + E500_FPR_Bypass, E500_FPR_Bypass]>, + InstrItinData<IIC_FPFused, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_FPU_0]>], + [13, 1, 1, 1], // Latency = 10, Repeat rate = 4 + [E500_FPR_Bypass, + E500_FPR_Bypass, E500_FPR_Bypass, + E500_FPR_Bypass]>, + InstrItinData<IIC_FPRes, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<38, [E500_FPU_0]>], + [41, 1], // Latency = 38, Repeat rate = 38 + [E500_FPR_Bypass, E500_FPR_Bypass]> +]>; + +// ===---------------------------------------------------------------------===// +// e500mc machine model for scheduling and other instruction cost heuristics. + +def PPCE500mcModel : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = -1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 5; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + + let Itineraries = PPCE500mcItineraries; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td new file mode 100644 index 0000000..7c2693e --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td @@ -0,0 +1,380 @@ +//===-- PPCScheduleE5500.td - e5500 Scheduling Defs --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Freescale e5500 64-bit +// Power processor. +// +// All information is derived from the "e5500 Core Reference Manual", +// Freescale Document Number e5500RM, Rev. 1, 03/2012. +// +//===----------------------------------------------------------------------===// +// Relevant functional units in the Freescale e5500 core +// (These are the same as for the e500mc) +// +// * Decode & Dispatch +// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue +// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ). +def E5500_DIS0 : FuncUnit; +def E5500_DIS1 : FuncUnit; + +// * Execute +// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX. 
+// The CFX has a bypass path, allowing non-divide instructions to execute +// while a divide instruction is being executed. +def E5500_SFX0 : FuncUnit; // Simple unit 0 +def E5500_SFX1 : FuncUnit; // Simple unit 1 +def E5500_BU : FuncUnit; // Branch unit +def E5500_CFX_DivBypass + : FuncUnit; // CFX divide bypass path +def E5500_CFX_0 : FuncUnit; // CFX pipeline stage 0 + +def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1 + +def E5500_LSU_0 : FuncUnit; // LSU pipeline +def E5500_FPU_0 : FuncUnit; // FPU pipeline + +def E5500_GPR_Bypass : Bypass; +def E5500_FPR_Bypass : Bypass; +def E5500_CR_Bypass : Bypass; + +def PPCE5500Itineraries : ProcessorItineraries< + [E5500_DIS0, E5500_DIS1, E5500_SFX0, E5500_SFX1, E5500_BU, + E5500_CFX_DivBypass, E5500_CFX_0, E5500_CFX_1, + E5500_LSU_0, E5500_FPU_0], + [E5500_CR_Bypass, E5500_GPR_Bypass, E5500_FPR_Bypass], [ + InstrItinData<IIC_IntSimple, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntISEL, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2, 2], // Latency = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass, + E5500_CR_Bypass]>, + InstrItinData<IIC_IntCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [6, 2, 2], // Latency = 1 or 2 + [E5500_CR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<26, [E5500_CFX_DivBypass]>], + [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntDivW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<16, [E5500_CFX_DivBypass]>], + [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMFFS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass]>, + InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<7, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 7 + [NoBypass, NoBypass, NoBypass]>, + InstrItinData<IIC_IntMulHD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<2, [E5500_CFX_1]>], + [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMulHW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<1, [E5500_CFX_1]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<1, [E5500_CFX_1]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMulLI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<2, [E5500_CFX_1]>], + [8, 2, 2], // Latency = 4 or 5, Repeat 
= 2 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntRotate, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntRotateD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_SFX0, E5500_SFX1]>], + [6, 2, 2], // Latency = 2, Repeat rate = 2 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntRotateDI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntShift, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_SFX0, E5500_SFX1]>], + [6, 2, 2], // Latency = 2, Repeat rate = 2 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntTrapW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_SFX0]>], + [6, 2], // Latency = 2, Repeat rate = 2 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_BrB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_BU]>], + [5, 2], // Latency = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_BrCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_BU]>], + [5, 2, 2], // Latency = 1 + [E5500_CR_Bypass, + E5500_CR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_BrMCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_BU]>], + [5, 2], // Latency = 1 + [E5500_CR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_BrMCRX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0]>], + [5, 2, 2], // Latency = 1 + [E5500_CR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoad, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLDARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<3, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 3 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + 
InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStStore, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStICBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLHA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLMW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<4, [E5500_LSU_0]>], + [8, 2], // Latency = r+3, Repeat rate = r+3 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<3, [E5500_LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 3 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // 
Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSync, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>]>, + InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_CFX_0]>], + [6, 2], // Latency = 2, Repeat rate = 4 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0], 0>]>, + InstrItinData<IIC_SprMFCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<5, [E5500_CFX_0]>], + [9, 2], // Latency = 5, Repeat rate = 5 + [E5500_GPR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<5, [E5500_CFX_0]>], + [9, 2], // Latency = 5, Repeat rate = 5 + [E5500_GPR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<4, [E5500_SFX0]>], + [8, 2], // Latency = 4, Repeat rate = 4 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0]>], + [5], // Latency = 1, Repeat rate = 1 + [E5500_GPR_Bypass]>, + InstrItinData<IIC_SprMFTB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<4, [E5500_CFX_0]>], + [8, 2], // Latency = 4, Repeat rate = 4 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5], // Latency = 1, Repeat rate = 1 + [E5500_GPR_Bypass]>, + InstrItinData<IIC_FPGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPAddSub, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_CR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<31, [E5500_FPU_0]>], + [39, 2, 2], // Latency = 35, Repeat rate = 31 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPDivS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<16, [E5500_FPU_0]>], + [24, 2, 2], // Latency = 20, Repeat rate = 16 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPFused, 
[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass, + E5500_FPR_Bypass]>, + InstrItinData<IIC_FPRes, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_FPU_0]>], + [12, 2], // Latency = 8, Repeat rate = 2 + [E5500_FPR_Bypass, E5500_FPR_Bypass]> +]>; + +// ===---------------------------------------------------------------------===// +// e5500 machine model for scheduling and other instruction cost heuristics. + +def PPCE5500Model : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = -1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 6; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + + let Itineraries = PPCE5500Itineraries; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td new file mode 100644 index 0000000..21efd8f --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td @@ -0,0 +1,80 @@ +//===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G3 (750) processor. +// +//===----------------------------------------------------------------------===// + +def G3_BPU : FuncUnit; // Branch unit +def G3_SLU : FuncUnit; // Store/load unit +def G3_SRU : FuncUnit; // special register unit +def G3_IU1 : FuncUnit; // integer unit 1 (simple) +def G3_IU2 : FuncUnit; // integer unit 2 (complex) +def G3_FPU1 : FuncUnit; // floating point unit 1 + +def G3Itineraries : ProcessorItineraries< + [G3_IU1, G3_IU2, G3_FPU1, G3_BPU, G3_SRU, G3_SLU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [G3_IU1, G3_IU2]>]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [G3_IU1, G3_IU2]>]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [G3_IU1, G3_IU2]>]>, + InstrItinData<IIC_IntDivW , [InstrStage<19, [G3_IU1]>]>, + InstrItinData<IIC_IntMFFS , [InstrStage<1, [G3_FPU1]>]>, + InstrItinData<IIC_IntMTFSB0 , [InstrStage<3, [G3_FPU1]>]>, + InstrItinData<IIC_IntMulHW , [InstrStage<5, [G3_IU1]>]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<6, [G3_IU1]>]>, + InstrItinData<IIC_IntMulLI , [InstrStage<3, [G3_IU1]>]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [G3_IU1, G3_IU2]>]>, + InstrItinData<IIC_IntShift , [InstrStage<1, [G3_IU1, G3_IU2]>]>, + InstrItinData<IIC_IntTrapW , [InstrStage<2, [G3_IU1, G3_IU2]>]>, + InstrItinData<IIC_BrB , [InstrStage<1, [G3_BPU]>]>, + InstrItinData<IIC_BrCR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_BrMCR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_BrMCRX , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_LdStDCBA , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G3_SLU]>]>, + InstrItinData<IIC_LdStDCBI , [InstrStage<3, [G3_SLU]>]>, + InstrItinData<IIC_LdStLoad , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStStore , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G3_SLU]>]>, + 
InstrItinData<IIC_LdStICBI , [InstrStage<3, [G3_SLU]>]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLFD , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLHA , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<34, [G3_SLU]>]>, + InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G3_SLU]>]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<8, [G3_SLU]>]>, + InstrItinData<IIC_LdStSync , [InstrStage<3, [G3_SLU]>]>, + InstrItinData<IIC_SprISYNC , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprMFSR , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_SprMTSR , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMFCR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_SprMFMSR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMFTB , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprRFI , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprSC , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_FPGeneral , [InstrStage<1, [G3_FPU1]>]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [G3_FPU1]>]>, + InstrItinData<IIC_FPCompare , [InstrStage<1, [G3_FPU1]>]>, + InstrItinData<IIC_FPDivD , [InstrStage<31, [G3_FPU1]>]>, + InstrItinData<IIC_FPDivS , [InstrStage<17, [G3_FPU1]>]>, + InstrItinData<IIC_FPFused , [InstrStage<2, [G3_FPU1]>]>, + InstrItinData<IIC_FPRes , [InstrStage<10, [G3_FPU1]>]> +]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td new file mode 100644 index 0000000..340773e --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td @@ -0,0 +1,96 @@ +//===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G4 (7400) processor. 
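+//
+// Editorial note: unlike the e500mc/e5500 descriptions earlier in this
+// patch, the G3 and G4 itineraries carry only functional-unit occupancy;
+// no OperandCycles or Bypass lists are given, so the scheduler falls back
+// on latencies derived from the stage cycles alone. Every entry therefore
+// takes the single-stage shape sketched below, where IIC_Example stands in
+// for a real itinerary class:
+//
+//   InstrItinData<IIC_Example, [InstrStage<2, [G4_SLU]>]>
+//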
+// +//===----------------------------------------------------------------------===// + +def G4_BPU : FuncUnit; // Branch unit +def G4_SLU : FuncUnit; // Store/load unit +def G4_SRU : FuncUnit; // special register unit +def G4_IU1 : FuncUnit; // integer unit 1 (simple) +def G4_IU2 : FuncUnit; // integer unit 2 (complex) +def G4_FPU1 : FuncUnit; // floating point unit 1 +def G4_VPU : FuncUnit; // vector permutation unit +def G4_VIU1 : FuncUnit; // vector integer unit 1 (simple) +def G4_VIU2 : FuncUnit; // vector integer unit 2 (complex) +def G4_VFPU : FuncUnit; // vector floating point unit + +def G4Itineraries : ProcessorItineraries< + [G4_IU1, G4_IU2, G4_SLU, G4_SRU, G4_BPU, G4_FPU1, + G4_VIU1, G4_VIU2, G4_VPU, G4_VFPU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntDivW , [InstrStage<19, [G4_IU1]>]>, + InstrItinData<IIC_IntMFFS , [InstrStage<3, [G4_FPU1]>]>, + InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_IntMTFSB0 , [InstrStage<3, [G4_FPU1]>]>, + InstrItinData<IIC_IntMulHW , [InstrStage<5, [G4_IU1]>]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<6, [G4_IU1]>]>, + InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4_IU1]>]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntShift , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_BrB , [InstrStage<1, [G4_BPU]>]>, + InstrItinData<IIC_BrCR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_BrMCR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_BrMCRX , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_LdStDCBF , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStDCBI , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLoad , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStStore , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStDSS , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStICBI , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLFD , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLHA , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<34, [G4_SLU]>]>, + InstrItinData<IIC_LdStLVecX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<5, [G4_SLU]>]>, + InstrItinData<IIC_LdStSync , [InstrStage<8, [G4_SLU]>]>, + InstrItinData<IIC_SprISYNC , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprMFSR , [InstrStage<3, [G4_SRU]>]>, + InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprTLBSYNC , [InstrStage<8, [G4_SRU]>]>, + InstrItinData<IIC_SprMFCR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMFMSR , 
[InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G4_SRU]>]>, + InstrItinData<IIC_SprMFTB , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprRFI , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprSC , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_FPGeneral , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPCompare , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPDivD , [InstrStage<31, [G4_FPU1]>]>, + InstrItinData<IIC_FPDivS , [InstrStage<17, [G4_FPU1]>]>, + InstrItinData<IIC_FPFused , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPRes , [InstrStage<10, [G4_FPU1]>]>, + InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_VecFP , [InstrStage<4, [G4_VFPU]>]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_VecComplex , [InstrStage<3, [G4_VIU2]>]>, + InstrItinData<IIC_VecPerm , [InstrStage<1, [G4_VPU]>]>, + InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4_VFPU]>]>, + InstrItinData<IIC_VecVSL , [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_VecVSR , [InstrStage<1, [G4_VIU1]>]> +]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td new file mode 100644 index 0000000..1d9f13f --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -0,0 +1,112 @@ +//===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G4+ (7450) processor. 
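+//
+// Editorial note: relative to the 7400 file above, this description adds
+// the 7450's two extra simple integer units (G4P_IU3/G4P_IU4), and the
+// entries below route most condition-register and SPR work through
+// G4P_IU2 rather than the special register unit. Simple integer classes
+// may claim any one of the four units for a cycle, e.g.:
+//
+//   InstrStage<1, [G4P_IU1, G4P_IU2, G4P_IU3, G4P_IU4]>
+//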
+// +//===----------------------------------------------------------------------===// + +def G4P_BPU : FuncUnit; // Branch unit +def G4P_SLU : FuncUnit; // Store/load unit +def G4P_SRU : FuncUnit; // special register unit +def G4P_IU1 : FuncUnit; // integer unit 1 (simple) +def G4P_IU2 : FuncUnit; // integer unit 2 (complex) +def G4P_IU3 : FuncUnit; // integer unit 3 (simple) +def G4P_IU4 : FuncUnit; // integer unit 4 (simple) +def G4P_FPU1 : FuncUnit; // floating point unit 1 +def G4P_VPU : FuncUnit; // vector permutation unit +def G4P_VIU1 : FuncUnit; // vector integer unit 1 (simple) +def G4P_VIU2 : FuncUnit; // vector integer unit 2 (complex) +def G4P_VFPU : FuncUnit; // vector floating point unit + +def G4PlusItineraries : ProcessorItineraries< + [G4P_IU1, G4P_IU2, G4P_IU3, G4P_IU4, G4P_BPU, G4P_SLU, G4P_FPU1, + G4P_VFPU, G4P_VIU1, G4P_VIU2, G4P_VPU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntDivW , [InstrStage<23, [G4P_IU2]>]>, + InstrItinData<IIC_IntMFFS , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_IntMFVSCR , [InstrStage<2, [G4P_VFPU]>]>, + InstrItinData<IIC_IntMTFSB0 , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_IntMulHW , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4P_IU2]>]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntShift , [InstrStage<2, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_BrB , [InstrStage<1, [G4P_BPU]>]>, + InstrItinData<IIC_BrCR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_BrMCR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_BrMCRX , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStDCBI , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLoad , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStStore , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStDSS , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStICBI , [InstrStage<3, [G4P_IU2]>]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLFD , [InstrStage<4, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<4, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<4, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLHA , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<37, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLVecX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLWA , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTD , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTDCX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTDU , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTDUX , 
[InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSync , [InstrStage<35, [G4P_SLU]>]>, + InstrItinData<IIC_SprISYNC , [InstrStage<0, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_SprMFSR , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTMSR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_SprMFCR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprMFMSR , [InstrStage<3, [G4P_IU2]>]>, + InstrItinData<IIC_SprMFSPR , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_SprMFTB , [InstrStage<5, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprRFI , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_SprSC , [InstrStage<0, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_FPGeneral , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPAddSub , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPCompare , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPDivD , [InstrStage<35, [G4P_FPU1]>]>, + InstrItinData<IIC_FPDivS , [InstrStage<21, [G4P_FPU1]>]>, + InstrItinData<IIC_FPFused , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPRes , [InstrStage<14, [G4P_FPU1]>]>, + InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4P_VIU1]>]>, + InstrItinData<IIC_VecFP , [InstrStage<4, [G4P_VFPU]>]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G4P_VFPU]>]>, + InstrItinData<IIC_VecComplex , [InstrStage<4, [G4P_VIU2]>]>, + InstrItinData<IIC_VecPerm , [InstrStage<2, [G4P_VPU]>]>, + InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4P_VIU1]>]>, + InstrItinData<IIC_VecVSL , [InstrStage<2, [G4P_VPU]>]>, + InstrItinData<IIC_VecVSR , [InstrStage<2, [G4P_VPU]>]> +]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td new file mode 100644 index 0000000..a3b73ab --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td @@ -0,0 +1,129 @@ +//===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G5 (970) processor. 
+// +//===----------------------------------------------------------------------===// + +def G5_BPU : FuncUnit; // Branch unit +def G5_SLU : FuncUnit; // Store/load unit +def G5_SRU : FuncUnit; // special register unit +def G5_IU1 : FuncUnit; // integer unit 1 (simple) +def G5_IU2 : FuncUnit; // integer unit 2 (complex) +def G5_FPU1 : FuncUnit; // floating point unit 1 +def G5_FPU2 : FuncUnit; // floating point unit 2 +def G5_VPU : FuncUnit; // vector permutation unit +def G5_VIU1 : FuncUnit; // vector integer unit 1 (simple) +def G5_VIU2 : FuncUnit; // vector integer unit 2 (complex) +def G5_VFPU : FuncUnit; // vector floating point unit + +def G5Itineraries : ProcessorItineraries< + [G5_IU1, G5_IU2, G5_SLU, G5_BPU, G5_FPU1, G5_FPU2, + G5_VFPU, G5_VIU1, G5_VIU2, G5_VPU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntGeneral , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntCompare , [InstrStage<3, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntDivD , [InstrStage<68, [G5_IU1]>]>, + InstrItinData<IIC_IntDivW , [InstrStage<36, [G5_IU1]>]>, + InstrItinData<IIC_IntMFFS , [InstrStage<6, [G5_IU2]>]>, + InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G5_VFPU]>]>, + InstrItinData<IIC_IntMTFSB0 , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_IntMulHD , [InstrStage<7, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntMulHW , [InstrStage<5, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<5, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntMulLI , [InstrStage<4, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntRFID , [InstrStage<1, [G5_IU2]>]>, + InstrItinData<IIC_IntRotateD , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntRotate , [InstrStage<4, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntShift , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntTrapD , [InstrStage<1, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntTrapW , [InstrStage<1, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_BrB , [InstrStage<1, [G5_BPU]>]>, + InstrItinData<IIC_BrCR , [InstrStage<4, [G5_BPU]>]>, + InstrItinData<IIC_BrMCR , [InstrStage<2, [G5_BPU]>]>, + InstrItinData<IIC_BrMCRX , [InstrStage<3, [G5_BPU]>]>, + InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLoad , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStStore , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStDSS , [InstrStage<10, [G5_SLU]>]>, + InstrItinData<IIC_LdStICBI , [InstrStage<40, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<4, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<4, [G5_SLU]>]>, + InstrItinData<IIC_LdStLD , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLDU , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLDUX , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLDARX , [InstrStage<11, [G5_SLU]>]>, + InstrItinData<IIC_LdStLFD , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLHA , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<64, [G5_SLU]>]>, + InstrItinData<IIC_LdStLVecX , 
[InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLWA , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLWARX , [InstrStage<11, [G5_SLU]>]>, + InstrItinData<IIC_LdStSLBIA , [InstrStage<40, [G5_SLU]>]>, // needs work + InstrItinData<IIC_LdStSLBIE , [InstrStage<2, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTD , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTDU , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTDUX , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTDCX , [InstrStage<11, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<11, [G5_SLU]>]>, + InstrItinData<IIC_LdStSync , [InstrStage<35, [G5_SLU]>]>, + InstrItinData<IIC_SprISYNC , [InstrStage<40, [G5_SLU]>]>, // needs work + InstrItinData<IIC_SprMFSR , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_SprMTMSR , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_SprMTSR , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_SprMFCR , [InstrStage<2, [G5_IU2]>]>, + InstrItinData<IIC_SprMFCRF , [InstrStage<2, [G5_IU2]>]>, + InstrItinData<IIC_SprMFMSR , [InstrStage<3, [G5_IU2]>]>, + InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G5_IU2]>]>, + InstrItinData<IIC_SprMFTB , [InstrStage<10, [G5_IU2]>]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<8, [G5_IU2]>]>, + InstrItinData<IIC_SprSC , [InstrStage<1, [G5_IU2]>]>, + InstrItinData<IIC_FPGeneral , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPAddSub , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPCompare , [InstrStage<8, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPDivD , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPDivS , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPFused , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPRes , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPSqrtD , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_FPSqrtS , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_VecGeneral , [InstrStage<2, [G5_VIU1]>]>, + InstrItinData<IIC_VecFP , [InstrStage<8, [G5_VFPU]>]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G5_VFPU]>]>, + InstrItinData<IIC_VecComplex , [InstrStage<5, [G5_VIU2]>]>, + InstrItinData<IIC_VecPerm , [InstrStage<3, [G5_VPU]>]>, + InstrItinData<IIC_VecFPRound , [InstrStage<8, [G5_VFPU]>]>, + InstrItinData<IIC_VecVSL , [InstrStage<2, [G5_VIU1]>]>, + InstrItinData<IIC_VecVSR , [InstrStage<3, [G5_VPU]>]> +]>; + +// ===---------------------------------------------------------------------===// +// G5 machine model for scheduling and other instruction cost heuristics. + +def G5Model : SchedMachineModel { + let IssueWidth = 4; // 4 (non-branch) instructions are dispatched per cycle. + let MinLatency = 0; // Out-of-order dispatch. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 16; + + let Itineraries = G5Itineraries; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td new file mode 100644 index 0000000..267f567 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td @@ -0,0 +1,396 @@ +//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the POWER7 processor. +// +//===----------------------------------------------------------------------===// + +// Primary reference: +// IBM POWER7 multicore server processor +// B. Sinharoy, et al. +// IBM J. Res. & Dev. (55) 3. May/June 2011. + +// Scheduling for the P7 involves tracking two types of resources: +// 1. The dispatch bundle slots +// 2. The functional unit resources + +// Dispatch units: +def P7_DU1 : FuncUnit; +def P7_DU2 : FuncUnit; +def P7_DU3 : FuncUnit; +def P7_DU4 : FuncUnit; +def P7_DU5 : FuncUnit; +def P7_DU6 : FuncUnit; + +def P7_LS1 : FuncUnit; // Load/Store pipeline 1 +def P7_LS2 : FuncUnit; // Load/Store pipeline 2 + +def P7_FX1 : FuncUnit; // FX pipeline 1 +def P7_FX2 : FuncUnit; // FX pipeline 2 + +// VS pipeline 1 (vector integer ops. always here) +def P7_VS1 : FuncUnit; // VS pipeline 1 +// VS pipeline 2 (128-bit stores and perms. here) +def P7_VS2 : FuncUnit; // VS pipeline 2 + +def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs) +def P7_BRU : FuncUnit; // BR unit + +// Notes: +// Each LSU pipeline can also execute FX add and logical instructions. +// Each LSU pipeline can complete a load or store in one cycle. +// +// Each store is broken into two parts, AGEN goes to the LSU while a +// "data steering" op. goes to the FXU or VSU. +// +// FX loads have a two cycle load-to-use latency (so one "bubble" cycle). +// VSU loads have a three cycle load-to-use latency (so two "bubble" cycles). +// +// Frequent FX ops. take only one cycle and results can be used again in the +// next cycle (there is a self-bypass). Getting results from the other FX +// pipeline takes an additional cycle. +// +// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles +// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops. +// Dispatch of an instruction to VS1 that uses four single prec. inputs +// (either to a float or XC op) prevents dispatch in that cycle to VS2 of any +// floating point instruction. +// +// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles +// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline +// (unlike on the POWER6). +// +// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP +// share the same write-back, and have a 5-cycle latency difference, so the +// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP +// op. has been dispatched to VS1. +// +// Three cycles after an L1 cache hit, a dependent VSU instruction can issue. +// +// Instruction dispatch groups have (at most) four non-branch instructions, and +// two branches. Unlike on the POWER4/5, a branch does not automatically +// end the dispatch group, but a second branch must be the last in the group.
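// A note on reading the itinerary entries that follow (an editorial sketch,
// not part of the upstream file): each InstrStage<N, [units], T> occupies any
// one unit from the list for N cycles, and the optional third operand T is
// the number of cycles after which the next stage may begin -- the 0 used
// throughout below makes the dispatch-slot stage and the functional-unit
// stage overlap in the same cycle. The trailing integer list gives
// per-operand cycles: the first entry is when the definition becomes
// available, the rest are when the uses are read. A hypothetical entry for a
// 4-cycle multiply on made-up units MY_DU/MY_FX would look like:
//
//   InstrItinData<IIC_IntMulHW, [InstrStage<1, [MY_DU], 0>,
//                                InstrStage<1, [MY_FX]>],
//                 [4, 1, 1]>
//
// i.e. both sources are read in cycle 1 and the result is ready in cycle 4.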
+ +def P7Itineraries : ProcessorItineraries< + [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6, + P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2, + P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntISEL, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_FX1, P7_FX2], 0>, + InstrStage<1, [P7_BRU]>], + [1, 1, 1, 1]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + // FIXME: Add record-form itinerary data. + InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<36, [P7_FX1, P7_FX2]>], + [36, 1, 1]>, + InstrItinData<IIC_IntDivD , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<68, [P7_FX1, P7_FX2]>], + [68, 1, 1]>, + InstrItinData<IIC_IntMulHW , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntMulLI , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntRotateD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntShift , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntTrapW , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1]>, + InstrItinData<IIC_IntTrapD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1]>, + InstrItinData<IIC_BrB , [InstrStage<1, [P7_DU5, P7_DU6], 0>, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData<IIC_BrCR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_CRU]>], + [3, 1, 1]>, + InstrItinData<IIC_BrMCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU5, P7_DU6], 0>, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 2, 1, 1]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStLDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 2, 1, 1]>, + InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P7_DU1], 0>, + 
InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLFD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLHA , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 4, 1, 1]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 4, 1, 1]>, + InstrItinData<IIC_LdStLWA , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLMW , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStStore , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P7_DU1, 
P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_VS2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_CRU]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1]>, // mtcr + InstrItinData<IIC_SprMFCR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_CRU]>], + [6, 1]>, + InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_CRU]>], + [3, 1]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_FX1]>], + [4, 1]>, // mtctr + InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, + InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [8, 1, 1]>, + InstrItinData<IIC_FPDivD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [33, 1, 1]>, + InstrItinData<IIC_FPDivS , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [27, 1, 1]>, + InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [44, 1, 1]>, + InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [32, 1, 1]>, + InstrItinData<IIC_FPFused , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1, 1]>, + InstrItinData<IIC_FPRes , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, + InstrItinData<IIC_VecGeneral , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData<IIC_VecVSL , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData<IIC_VecVSR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData<IIC_VecFP , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecFPRound , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecComplex , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [7, 1, 1]>, + InstrItinData<IIC_VecPerm , [InstrStage<1, [P7_DU1, P7_DU2], 0>, + InstrStage<1, [P7_VS2]>], + [3, 1, 1]> +]>; + +// ===---------------------------------------------------------------------===// +// P7 machine model for scheduling and other instruction cost heuristics. + +def P7Model : SchedMachineModel { + let IssueWidth = 6; // 4 (non-branch) instructions are dispatched per cycle. + // Note that the dispatch bundle size is 6 (including + // branches), but the total internal issue bandwidth per + // cycle (from all queues) is 8. 
+ + let MinLatency = 0; // Out-of-order dispatch. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 16; + + // Try to make sure we have at least 10 dispatch groups in a loop. + let LoopMicroOpBufferSize = 40; + + let Itineraries = P7Itineraries; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td new file mode 100644 index 0000000..69e6d05 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td @@ -0,0 +1,405 @@ +//===-- PPCScheduleP8.td - PPC P8 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the POWER8 processor. +// +//===----------------------------------------------------------------------===// + +// Scheduling for the P8 involves tracking two types of resources: +// 1. The dispatch bundle slots +// 2. The functional unit resources + +// Dispatch units: +def P8_DU1 : FuncUnit; +def P8_DU2 : FuncUnit; +def P8_DU3 : FuncUnit; +def P8_DU4 : FuncUnit; +def P8_DU5 : FuncUnit; +def P8_DU6 : FuncUnit; +def P8_DU7 : FuncUnit; // Only branch instructions will use DU7,DU8 +def P8_DU8 : FuncUnit; + +// 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU). + +def P8_LU1 : FuncUnit; // Loads or fixed-point operations 1 +def P8_LU2 : FuncUnit; // Loads or fixed-point operations 2 + +// Load/Store pipelines can handle Stores, fixed-point loads, and simple +// fixed-point operations. +def P8_LSU1 : FuncUnit; // Load/Store pipeline 1 +def P8_LSU2 : FuncUnit; // Load/Store pipeline 2 + +// Fixed Point unit +def P8_FXU1 : FuncUnit; // FX pipeline 1 +def P8_FXU2 : FuncUnit; // FX pipeline 2 + +// The Floating-Point Unit (FPU) and Vector Media Extension (VMX) units +// are combined on P7 and newer into a Vector Scalar Unit (VSU). +// The P8 instruction latency documents still refer to the unit as the +// FPU, so keep in mind that FPU==VSU. +// In contrast to the P7, the VMX units on P8 are symmetric, so no need to +// split vector integer ops or 128-bit load/store/perms to the specific units.
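// To make the symmetry point concrete (an illustrative sketch, not upstream
// code): the P7 file above pins permutes to a single pipe, roughly
//
//   InstrItinData<IIC_VecPerm, [..., InstrStage<1, [P7_VS2]>], ...>
//
// while on the P8 the same class can name either pipe as an alternative:
//
//   InstrItinData<IIC_VecPerm, [..., InstrStage<1, [P8_FPU1, P8_FPU2]>], ...>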
+def P8_FPU1 : FuncUnit; // VS pipeline 1 +def P8_FPU2 : FuncUnit; // VS pipeline 2 + +def P8_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs) +def P8_BRU : FuncUnit; // BR unit + +def P8Itineraries : ProcessorItineraries< + [P8_DU1, P8_DU2, P8_DU3, P8_DU4, P8_DU5, P8_DU6, P8_DU7, P8_DU8, + P8_LU1, P8_LU2, P8_LSU1, P8_LSU2, P8_FXU1, P8_FXU2, + P8_FPU1, P8_FPU2, P8_CRU, P8_BRU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2, + P8_LU1, P8_LU2, + P8_LSU1, P8_LSU2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2, P8_LU1, + P8_LU2, P8_LSU1, P8_LSU2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntISEL, [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2], 0>, + InstrStage<1, [P8_BRU]>], + [1, 1, 1, 1]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntDivW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<15, [P8_FXU1, P8_FXU2]>], + [15, 1, 1]>, + InstrItinData<IIC_IntDivD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<23, [P8_FXU1, P8_FXU2]>], + [23, 1, 1]>, + InstrItinData<IIC_IntMulHW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntMulLI , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntRotateD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntShift , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntTrapW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [1, 1]>, + InstrItinData<IIC_IntTrapD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [1, 1]>, + InstrItinData<IIC_BrB , [InstrStage<1, [P8_DU7, P8_DU8], 0>, + InstrStage<1, [P8_BRU]>], + [3, 1, 1]>, + // FIXME - the Br* groups below are not branch related, so should probably + // be renamed. + // IIC_BrCR consists of the cr* instructions. (crand,crnor,creqv, etc). + // and should be 'First' in dispatch. + InstrItinData<IIC_BrCR , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_CRU]>], + [3, 1, 1]>, + // IIC_BrMCR consists of the mcrf instruction. + InstrItinData<IIC_BrMCR , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_CRU]>], + [3, 1, 1]>, + // IIC_BrMCRX consists of mcrxr (obsolete instruction) and mtcrf, which + // should be first in the dispatch group. 
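// How the dispatch-group constraints are encoded below (an editorial sketch,
// not upstream text): a stage list naming only P8_DU1 models "must be first
// in the group"; a cracked op reserves two slots with back-to-back zero-delay
// stages on P8_DU1 and P8_DU2; a "first+last" op walks all of P8_DU1..P8_DU6,
// consuming the entire group. A hypothetical cracked op would start:
//
//   InstrItinData<IIC_Hypothetical, [InstrStage<1, [P8_DU1], 0>,
//                                    InstrStage<1, [P8_DU2], 0>,
//                                    ...]>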
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLoad , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [2, 2, 1, 1]>, + // Update-Indexed form loads/stores are no longer first and last in the + // dispatch group. They are simply cracked, so require DU1,DU2. + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStLDU , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [2, 2, 1, 1]>, + InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLFD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LU1, P8_LU2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LU1, P8_LU2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLHA , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2, + P8_LU1, P8_LU2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [4, 4, 1, 1]>, + // first+last in dispatch group.
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_DU3], 0>, + InstrStage<1, [P8_DU4], 0>, + InstrStage<1, [P8_DU5], 0>, + InstrStage<1, [P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [4, 4, 1, 1]>, + InstrItinData<IIC_LdStLWA , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2]>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_DU3], 0>, + InstrStage<1, [P8_DU4], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2]>], + [3, 1, 1]>, + // first+last + InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_DU3], 0>, + InstrStage<1, [P8_DU4], 0>, + InstrStage<1, [P8_DU5], 0>, + InstrStage<1, [P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLMW , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2, + P8_LU1, P8_LU2]>], + [2, 1, 1]>, +// Stores are dual-issued from the issue queue, so may only take up one +// dispatch slot. The instruction will be broken into two IOPS. The agen +// op is issued to the LSU, and the data op (register fetch) is issued +// to either the LU (GPR store) or the VSU (FPR store). + InstrItinData<IIC_LdStStore , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2]>, + InstrStage<1, [P8_LU1, P8_LU2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LU1, P8_LU2, + P8_LSU1, P8_LSU2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LU1, P8_LU2, + P8_LSU1, P8_LSU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [2, 1, 1, 1]>, + // First+last + InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_DU3], 0>, + InstrStage<1, [P8_DU4], 0>, + InstrStage<1, [P8_DU5], 0>, + InstrStage<1, [P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_DU3], 0>, + InstrStage<1, [P8_DU4], 0>, + InstrStage<1, [P8_DU5], 0>, + InstrStage<1, [P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2], 0>, + InstrStage<1, [P8_LU1, P8_LU2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_DU2], 0>, + InstrStage<1, [P8_DU3], 0>, + InstrStage<1, [P8_DU4], 0>, + InstrStage<1, [P8_DU5], 0>, +
InstrStage<1, [P8_DU6], 0>, + InstrStage<1, [P8_LSU1, P8_LSU2], 0>, + InstrStage<1, [P8_LU1, P8_LU2]>], + [1, 1, 1]>, + InstrItinData<IIC_SprMFCR , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_CRU]>], + [6, 1]>, + InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_CRU]>], + [3, 1]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FXU1, P8_FXU2]>], + [4, 1]>, // mtctr + InstrItinData<IIC_FPGeneral , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [5, 1, 1]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [5, 1, 1]>, + InstrItinData<IIC_FPCompare , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [8, 1, 1]>, + InstrItinData<IIC_FPDivD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [33, 1, 1]>, + InstrItinData<IIC_FPDivS , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [27, 1, 1]>, + InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [44, 1, 1]>, + InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [32, 1, 1]>, + InstrItinData<IIC_FPFused , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [5, 1, 1, 1]>, + InstrItinData<IIC_FPRes , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [5, 1, 1]>, + InstrItinData<IIC_VecGeneral , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [2, 1, 1]>, + InstrItinData<IIC_VecVSL , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [2, 1, 1]>, + InstrItinData<IIC_VecVSR , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [2, 1, 1]>, + InstrItinData<IIC_VecFP , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecFPRound , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecComplex , [InstrStage<1, [P8_DU1], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [7, 1, 1]>, + InstrItinData<IIC_VecPerm , [InstrStage<1, [P8_DU1, P8_DU2], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [3, 1, 1]> +]>; + +// ===---------------------------------------------------------------------===// +// P8 machine model for scheduling and other instruction cost heuristics. +// P8 has an 8 insn dispatch group (6 non-branch, 2 branch) and can issue up +// to 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU). + +def P8Model : SchedMachineModel { + let IssueWidth = 8; // up to 8 instructions dispatched per cycle. + // up to six non-branch instructions. + // up to two branches in a dispatch group. + + let MinLatency = 0; // Out-of-order dispatch. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 16; + + // Try to make sure we have at least 10 dispatch groups in a loop.
+ let LoopMicroOpBufferSize = 60; + + let Itineraries = P8Itineraries; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp new file mode 100644 index 0000000..c0fcb6c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -0,0 +1,245 @@ +//===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPC specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#include "PPCSubtarget.h" +#include "PPC.h" +#include "PPCRegisterInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" +#include <cstdlib> + +using namespace llvm; + +#define DEBUG_TYPE "ppc-subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "PPCGenSubtargetInfo.inc" + +static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness", +cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden); + +static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned", + cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), + cl::Hidden); + +PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + initSubtargetFeatures(CPU, FS); + return *this; +} + +PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, + const std::string &FS, const PPCTargetMachine &TM) + : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT), + IsPPC64(TargetTriple.getArch() == Triple::ppc64 || + TargetTriple.getArch() == Triple::ppc64le), + TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)), + InstrInfo(*this), TLInfo(TM, *this) {} + +void PPCSubtarget::initializeEnvironment() { + StackAlignment = 16; + DarwinDirective = PPC::DIR_NONE; + HasMFOCRF = false; + Has64BitSupport = false; + Use64BitRegs = false; + UseCRBits = false; + UseSoftFloat = false; + HasAltivec = false; + HasSPE = false; + HasQPX = false; + HasVSX = false; + HasP8Vector = false; + HasP8Altivec = false; + HasP8Crypto = false; + HasFCPSGN = false; + HasFSQRT = false; + HasFRE = false; + HasFRES = false; + HasFRSQRTE = false; + HasFRSQRTES = false; + HasRecipPrec = false; + HasSTFIWX = false; + HasLFIWAX = false; + HasFPRND = false; + HasFPCVT = false; + HasISEL = false; + HasPOPCNTD = false; + HasBPERMD = false; + HasExtDiv = false; + HasCMPB = false; + HasLDBRX = false; + IsBookE = false; + HasOnlyMSYNC = false; + IsPPC4xx = false; + IsPPC6xx = false; + IsE500 = false; + FeatureMFTB = false; + DeprecatedDST = false; + HasLazyResolverStubs = false; + HasICBT = false; + HasInvariantFunctionDescriptors = false; + HasPartwordAtomics = false; + HasDirectMove = false; + IsQPXStackUnaligned = false; + HasHTM = false; + HasFusion = false; + HasFloat128 = false; +} + +void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { + // Determine default and user specified characteristics + std::string CPUName = 
CPU; + if (CPUName.empty()) { + // If cross-compiling with -march=ppc64le without -mcpu + if (TargetTriple.getArch() == Triple::ppc64le) + CPUName = "ppc64le"; + else + CPUName = "generic"; + } + + // Initialize scheduling itinerary for the specified CPU. + InstrItins = getInstrItineraryForCPU(CPUName); + + // Parse features string. + ParseSubtargetFeatures(CPUName, FS); + + // If the user requested use of 64-bit regs, but the cpu selected doesn't + // support it, ignore. + if (IsPPC64 && has64BitSupport()) + Use64BitRegs = true; + + // Set up darwin-specific properties. + if (isDarwin()) + HasLazyResolverStubs = true; + + // QPX requires a 32-byte aligned stack. Note that we need to do this if + // we're compiling for a BG/Q system regardless of whether or not QPX + // is enabled because external functions will assume this alignment. + IsQPXStackUnaligned = QPXStackUnaligned; + StackAlignment = getPlatformStackAlignment(); + + // Determine endianness. + // FIXME: Part of the TargetMachine. + IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le); +} + +/// hasLazyResolverStub - Return true if accesses to the specified global have +/// to go through a dyld lazy resolution stub. This means that an extra load +/// is required to get the address of the global. +bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const { + // We never have stubs if HasLazyResolverStubs=false or if in static mode. + if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static) + return false; + bool isDecl = GV->isDeclaration(); + if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage()) + return false; + return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() || + GV->hasCommonLinkage() || isDecl; +} + +// Embedded cores need aggressive scheduling (and some others also benefit). +static bool needsAggressiveScheduling(unsigned Directive) { + switch (Directive) { + default: return false; + case PPC::DIR_440: + case PPC::DIR_A2: + case PPC::DIR_E500mc: + case PPC::DIR_E5500: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: + return true; + } +} + +bool PPCSubtarget::enableMachineScheduler() const { + // Enable MI scheduling for the embedded cores. + // FIXME: Enable this for all cores (some additional modeling + // may be necessary). + return needsAggressiveScheduling(DarwinDirective); +} + +// This overrides the PostRAScheduler bit in the SchedModel for each CPU. +bool PPCSubtarget::enablePostRAScheduler() const { return true; } + +PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const { + return TargetSubtargetInfo::ANTIDEP_ALL; +} + +void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(isPPC64() ? + &PPC::G8RCRegClass : &PPC::GPRCRegClass); +} + +void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (needsAggressiveScheduling(DarwinDirective)) { + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } + + // Spilling is generally expensive on all PPC cores, so always enable + // register-pressure tracking. + Policy.ShouldTrackPressure = true; +} + +bool PPCSubtarget::useAA() const { + // Use AA during code generation for the embedded cores. 
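  // That is, per needsAggressiveScheduling above: the 440, A2, e500mc,
  // e5500, PWR7 and PWR8 directives.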
+ return needsAggressiveScheduling(DarwinDirective); +} + +bool PPCSubtarget::enableSubRegLiveness() const { + return UseSubRegLiveness; +} + +unsigned char PPCSubtarget::classifyGlobalReference( + const GlobalValue *GV) const { + // Note that currently we don't generate non-pic references. + // If a caller wants that, this will have to be updated. + + // Large code model always uses the TOC even for local symbols. + if (TM.getCodeModel() == CodeModel::Large) + return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; + + unsigned char flags = PPCII::MO_PIC_FLAG; + + // Only if the relocation mode is PIC do we have to worry about + // interposition. In all other cases we can use a slightly looser standard to + // decide how to access the symbol. + if (TM.getRelocationModel() == Reloc::PIC_) { + // If it's local, or it's non-default, it can't be interposed. + if (!GV->hasLocalLinkage() && + GV->hasDefaultVisibility()) { + flags |= PPCII::MO_NLP_FLAG; + } + return flags; + } + + if (GV->isStrongDefinitionForLinker()) + return flags; + return flags | PPCII::MO_NLP_FLAG; +} + +bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } +bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h new file mode 100644 index 0000000..4f5c95c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -0,0 +1,302 @@ +//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H +#define LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H + +#include "PPCFrameLowering.h" +#include "PPCISelLowering.h" +#include "PPCInstrInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "PPCGenSubtargetInfo.inc" + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +namespace llvm { +class StringRef; + +namespace PPC { + // -m directive values. + enum { + DIR_NONE, + DIR_32, + DIR_440, + DIR_601, + DIR_602, + DIR_603, + DIR_7400, + DIR_750, + DIR_970, + DIR_A2, + DIR_E500mc, + DIR_E5500, + DIR_PWR3, + DIR_PWR4, + DIR_PWR5, + DIR_PWR5X, + DIR_PWR6, + DIR_PWR6X, + DIR_PWR7, + DIR_PWR8, + DIR_64 + }; +} + +class GlobalValue; +class TargetMachine; + +class PPCSubtarget : public PPCGenSubtargetInfo { +protected: + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned StackAlignment; + + /// Selected instruction itineraries (one entry per itinerary class.) + InstrItineraryData InstrItins; + + /// Which cpu directive was used. 
+ unsigned DarwinDirective; + + /// Used by the ISel to turn on optimizations for POWER4-derived architectures + bool HasMFOCRF; + bool Has64BitSupport; + bool Use64BitRegs; + bool UseCRBits; + bool UseSoftFloat; + bool IsPPC64; + bool HasAltivec; + bool HasSPE; + bool HasQPX; + bool HasVSX; + bool HasP8Vector; + bool HasP8Altivec; + bool HasP8Crypto; + bool HasFCPSGN; + bool HasFSQRT; + bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES; + bool HasRecipPrec; + bool HasSTFIWX; + bool HasLFIWAX; + bool HasFPRND; + bool HasFPCVT; + bool HasISEL; + bool HasPOPCNTD; + bool HasBPERMD; + bool HasExtDiv; + bool HasCMPB; + bool HasLDBRX; + bool IsBookE; + bool HasOnlyMSYNC; + bool IsE500; + bool IsPPC4xx; + bool IsPPC6xx; + bool FeatureMFTB; + bool DeprecatedDST; + bool HasLazyResolverStubs; + bool IsLittleEndian; + bool HasICBT; + bool HasInvariantFunctionDescriptors; + bool HasPartwordAtomics; + bool HasDirectMove; + bool HasHTM; + bool HasFusion; + bool HasFloat128; + + /// When targeting QPX running a stock PPC64 Linux kernel where the stack + /// alignment has not been changed, we need to keep the 16-byte alignment + /// of the stack. + bool IsQPXStackUnaligned; + + const PPCTargetMachine &TM; + PPCFrameLowering FrameLowering; + PPCInstrInfo InstrInfo; + PPCTargetLowering TLInfo; + TargetSelectionDAGInfo TSInfo; + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + PPCSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const PPCTargetMachine &TM); + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return StackAlignment; } + + /// getDarwinDirective - Returns the -m directive specified for the cpu. + /// + unsigned getDarwinDirective() const { return DarwinDirective; } + + /// getInstrItins - Return the instruction itineraries based on subtarget + /// selection. + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + const PPCFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; } + const PPCTargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const PPCRegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } + const PPCTargetMachine &getTargetMachine() const { return TM; } + + /// initializeSubtargetDependencies - Initializes using a CPU and feature string + /// so that we can use initializer lists for subtarget initialization. + PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + +private: + void initializeEnvironment(); + void initSubtargetFeatures(StringRef CPU, StringRef FS); + +public: + /// isPPC64 - Return true if we are generating code for 64-bit pointer mode. + /// + bool isPPC64() const; + + /// has64BitSupport - Return true if the selected CPU supports 64-bit + /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+ bool has64BitSupport() const { return Has64BitSupport; } + // useSoftFloat - Return true if soft-float option is turned on. + bool useSoftFloat() const { return UseSoftFloat; } + + /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit + /// registers in 32-bit mode when possible. This can only be true if + /// has64BitSupport() returns true. + bool use64BitRegs() const { return Use64BitRegs; } + + /// useCRBits - Return true if we should store and manipulate i1 values in + /// the individual condition register bits. + bool useCRBits() const { return UseCRBits; } + + /// hasLazyResolverStub - Return true if accesses to the specified global have + /// to go through a dyld lazy resolution stub. This means that an extra load + /// is required to get the address of the global. + bool hasLazyResolverStub(const GlobalValue *GV) const; + + // isLittleEndian - True if generating little-endian code + bool isLittleEndian() const { return IsLittleEndian; } + + // Specific obvious features. + bool hasFCPSGN() const { return HasFCPSGN; } + bool hasFSQRT() const { return HasFSQRT; } + bool hasFRE() const { return HasFRE; } + bool hasFRES() const { return HasFRES; } + bool hasFRSQRTE() const { return HasFRSQRTE; } + bool hasFRSQRTES() const { return HasFRSQRTES; } + bool hasRecipPrec() const { return HasRecipPrec; } + bool hasSTFIWX() const { return HasSTFIWX; } + bool hasLFIWAX() const { return HasLFIWAX; } + bool hasFPRND() const { return HasFPRND; } + bool hasFPCVT() const { return HasFPCVT; } + bool hasAltivec() const { return HasAltivec; } + bool hasSPE() const { return HasSPE; } + bool hasQPX() const { return HasQPX; } + bool hasVSX() const { return HasVSX; } + bool hasP8Vector() const { return HasP8Vector; } + bool hasP8Altivec() const { return HasP8Altivec; } + bool hasP8Crypto() const { return HasP8Crypto; } + bool hasMFOCRF() const { return HasMFOCRF; } + bool hasISEL() const { return HasISEL; } + bool hasPOPCNTD() const { return HasPOPCNTD; } + bool hasBPERMD() const { return HasBPERMD; } + bool hasExtDiv() const { return HasExtDiv; } + bool hasCMPB() const { return HasCMPB; } + bool hasLDBRX() const { return HasLDBRX; } + bool isBookE() const { return IsBookE; } + bool hasOnlyMSYNC() const { return HasOnlyMSYNC; } + bool isPPC4xx() const { return IsPPC4xx; } + bool isPPC6xx() const { return IsPPC6xx; } + bool isE500() const { return IsE500; } + bool isFeatureMFTB() const { return FeatureMFTB; } + bool isDeprecatedDST() const { return DeprecatedDST; } + bool hasICBT() const { return HasICBT; } + bool hasInvariantFunctionDescriptors() const { + return HasInvariantFunctionDescriptors; + } + bool hasPartwordAtomics() const { return HasPartwordAtomics; } + bool hasDirectMove() const { return HasDirectMove; } + + bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } + unsigned getPlatformStackAlignment() const { + if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) + return 32; + + return 16; + } + bool hasHTM() const { return HasHTM; } + bool hasFusion() const { return HasFusion; } + bool hasFloat128() const { return HasFloat128; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + /// isDarwin - True if this is any darwin platform. + bool isDarwin() const { return TargetTriple.isMacOSX(); } + /// isBGQ - True if this is a BG/Q platform.
+  bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
+  bool isSVR4ABI() const { return !isDarwinABI(); }
+  bool isELFv2ABI() const;
+
+  bool enableEarlyIfConversion() const override { return hasISEL(); }
+
+  // Scheduling customization.
+  bool enableMachineScheduler() const override;
+  // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostRAScheduler() const override;
+  AntiDepBreakMode getAntiDepBreakMode() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           MachineInstr *begin,
+                           MachineInstr *end,
+                           unsigned NumRegionInstrs) const override;
+  bool useAA() const override;
+
+  bool enableSubRegLiveness() const override;
+
+  /// classifyGlobalReference - Classify a global variable reference for the
+  /// current subtarget according to how we should reference it.
+  unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
new file mode 100644
index 0000000..2dc0d82
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -0,0 +1,170 @@
+//===---------- PPCTLSDynamicCall.cpp - TLS Dynamic Call Fixup ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands ADDItls{ld,gd}LADDR[32] machine instructions into
+// separate ADDItls{ld,gd}L[32] and GETtlsADDR[32] instructions, both of
+// which define GPR3.  A copy is added from GPR3 to the target virtual
+// register of the original instruction.  The GETtlsADDR[32] is really
+// a call instruction, so its target register is constrained to be GPR3.
+// This is not true of ADDItls{ld,gd}L[32], but there is a legacy linker
+// optimization bug that requires the target register of the addi of
+// a local- or general-dynamic TLS access sequence to be GPR3.
+//
+// This is done in a late pass so that TLS variable accesses can be
+// fully commoned by MachineCSE.
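+//
+// For example (with illustrative virtual register numbers), on a 64-bit
+// target the general-dynamic pseudo
+//   %vreg4<def> = ADDItlsgdLADDR %vreg3, <ga:@x>, <ga:@x>
+// is expanded by this pass into:
+//   %X3<def> = ADDItlsgdL %vreg3, <ga:@x>
+//   %X3<def> = GETtlsADDR %X3, <ga:@x>
+//   %vreg4<def> = COPY %X3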
+// +//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-tls-dynamic-call" + +namespace llvm { + void initializePPCTLSDynamicCallPass(PassRegistry&); +} + +namespace { + struct PPCTLSDynamicCall : public MachineFunctionPass { + static char ID; + PPCTLSDynamicCall() : MachineFunctionPass(ID) { + initializePPCTLSDynamicCallPass(*PassRegistry::getPassRegistry()); + } + + const PPCInstrInfo *TII; + LiveIntervals *LIS; + +protected: + bool processBlock(MachineBasicBlock &MBB) { + bool Changed = false; + bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64(); + + for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); + I != IE;) { + MachineInstr *MI = I; + + if (MI->getOpcode() != PPC::ADDItlsgdLADDR && + MI->getOpcode() != PPC::ADDItlsldLADDR && + MI->getOpcode() != PPC::ADDItlsgdLADDR32 && + MI->getOpcode() != PPC::ADDItlsldLADDR32) { + ++I; + continue; + } + + DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << *MI;); + + unsigned OutReg = MI->getOperand(0).getReg(); + unsigned InReg = MI->getOperand(1).getReg(); + DebugLoc DL = MI->getDebugLoc(); + unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3; + unsigned Opc1, Opc2; + SmallVector<unsigned, 4> OrigRegs; + OrigRegs.push_back(OutReg); + OrigRegs.push_back(InReg); + OrigRegs.push_back(GPR3); + + switch (MI->getOpcode()) { + default: + llvm_unreachable("Opcode inconsistency error"); + case PPC::ADDItlsgdLADDR: + Opc1 = PPC::ADDItlsgdL; + Opc2 = PPC::GETtlsADDR; + break; + case PPC::ADDItlsldLADDR: + Opc1 = PPC::ADDItlsldL; + Opc2 = PPC::GETtlsldADDR; + break; + case PPC::ADDItlsgdLADDR32: + Opc1 = PPC::ADDItlsgdL32; + Opc2 = PPC::GETtlsADDR32; + break; + case PPC::ADDItlsldLADDR32: + Opc1 = PPC::ADDItlsldL32; + Opc2 = PPC::GETtlsldADDR32; + break; + } + + // Expand into two ops built prior to the existing instruction. + MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) + .addReg(InReg); + Addi->addOperand(MI->getOperand(2)); + + // The ADDItls* instruction is the first instruction in the + // repair range. + MachineBasicBlock::iterator First = I; + --First; + + MachineInstr *Call = (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3) + .addReg(GPR3)); + Call->addOperand(MI->getOperand(3)); + + BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) + .addReg(GPR3); + + // The COPY is the last instruction in the repair range. + MachineBasicBlock::iterator Last = I; + --Last; + + // Move past the original instruction and remove it. + ++I; + MI->removeFromParent(); + + // Repair the live intervals. 
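+        // (First and Last bracket the ADDItls*/GETtls*/COPY sequence built
+        // above; OrigRegs lists OutReg, InReg and GPR3, the registers whose
+        // live intervals must be recomputed over that range.)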
+ LIS->repairIntervalsInRange(&MBB, First, Last, OrigRegs); + Changed = true; + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo(); + LIS = &getAnalysis<LiveIntervals>(); + + bool Changed = false; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addRequired<SlotIndexes>(); + AU.addPreserved<SlotIndexes>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS_BEGIN(PPCTLSDynamicCall, DEBUG_TYPE, + "PowerPC TLS Dynamic Call Fixup", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(PPCTLSDynamicCall, DEBUG_TYPE, + "PowerPC TLS Dynamic Call Fixup", false, false) + +char PPCTLSDynamicCall::ID = 0; +FunctionPass* +llvm::createPPCTLSDynamicCallPass() { return new PPCTLSDynamicCall(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp new file mode 100644 index 0000000..bf165c9 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp @@ -0,0 +1,156 @@ +//===-- PPCTOCRegDeps.cpp - Add Extra TOC Register Dependencies -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// When resolving an address using the ELF ABI TOC pointer, two relocations are +// generally required: one for the high part and one for the low part. Only +// the high part generally explicitly depends on r2 (the TOC pointer). And, so, +// we might produce code like this: +// +// .Ltmp526: +// addis 3, 2, .LC12@toc@ha +// .Ltmp1628: +// std 2, 40(1) +// ld 5, 0(27) +// ld 2, 8(27) +// ld 11, 16(27) +// ld 3, .LC12@toc@l(3) +// rldicl 4, 4, 0, 32 +// mtctr 5 +// bctrl +// ld 2, 40(1) +// +// And there is nothing wrong with this code, as such, but there is a linker bug +// in binutils (https://sourceware.org/bugzilla/show_bug.cgi?id=18414) that will +// misoptimize this code sequence to this: +// nop +// std r2,40(r1) +// ld r5,0(r27) +// ld r2,8(r27) +// ld r11,16(r27) +// ld r3,-32472(r2) +// clrldi r4,r4,32 +// mtctr r5 +// bctrl +// ld r2,40(r1) +// because the linker does not know (and does not check) that the value in r2 +// changed in between the instruction using the .LC12@toc@ha (TOC-relative) +// relocation and the instruction using the .LC12@toc@l(3) relocation. +// Because it finds these instructions using the relocations (and not by +// scanning the instructions), it has been asserted that there is no good way +// to detect the change of r2 in between. As a result, this bug may never be +// fixed (i.e. it may become part of the definition of the ABI). GCC was +// updated to add extra dependencies on r2 to instructions using the @toc@l +// relocations to avoid this problem, and we'll do the same here. +// +// This is done as a separate pass because: +// 1. 
These extra r2 dependencies are not really properties of the
+//    instructions, but rather due to a linker bug, and maybe one day we'll be
+//    able to get rid of them when targeting linkers without this bug (and,
+//    thus, keeping the logic centralized here will make that
+//    straightforward).
+// 2. There are ISel-level peephole optimizations that propagate the @toc@l
+//    relocations to some user instructions, and so the extra dependencies do
+//    not apply only to a fixed set of instructions (without undesirable
+//    definition replication).
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-toc-reg-deps"
+
+namespace llvm {
+  void initializePPCTOCRegDepsPass(PassRegistry&);
+}
+
+namespace {
+  // PPCTOCRegDeps pass - Add an extra (implicit) use of X2, the TOC pointer,
+  // to instructions carrying @toc@l relocations, so that r2 is visibly live
+  // across the whole TOC-based access sequence.
+  struct PPCTOCRegDeps : public MachineFunctionPass {
+    static char ID;
+    PPCTOCRegDeps() : MachineFunctionPass(ID) {
+      initializePPCTOCRegDepsPass(*PassRegistry::getPassRegistry());
+    }
+
+protected:
+    bool hasTOCLoReloc(const MachineInstr &MI) {
+      if (MI.getOpcode() == PPC::LDtocL ||
+          MI.getOpcode() == PPC::ADDItocL)
+        return true;
+
+      for (const MachineOperand &MO : MI.operands()) {
+        if ((MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) == PPCII::MO_TOC_LO)
+          return true;
+      }
+
+      return false;
+    }
+
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      for (auto &MI : MBB) {
+        if (!hasTOCLoReloc(MI))
+          continue;
+
+        MI.addOperand(MachineOperand::CreateReg(PPC::X2,
+                                                false /*IsDef*/,
+                                                true  /*IsImp*/));
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCTOCRegDeps, DEBUG_TYPE,
+                "PowerPC TOC Register Dependencies", false, false)
+
+char PPCTOCRegDeps::ID = 0;
+FunctionPass*
+llvm::createPPCTOCRegDepsPass() { return new PPCTOCRegDeps(); }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 0000000..d24b590
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,407 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPC.h"
+#include "PPCTargetObjectFile.h"
+#include "PPCTargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::
+opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
+                          cl::desc("Disable CTR loops for PPC"));
+
+static cl::
+opt<bool> DisablePreIncPrep("disable-ppc-preinc-prep", cl::Hidden,
+                            cl::desc("Disable PPC loop preinc prep"));
+
+static cl::opt<bool>
+VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
+  cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
+
+static cl::
+opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
+                                cl::desc("Disable VSX Swap Removal for PPC"));
+
+static cl::
+opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
+                            cl::desc("Disable machine peepholes for PPC"));
+
+static cl::opt<bool>
+EnableGEPOpt("ppc-gep-opt", cl::Hidden,
+             cl::desc("Enable optimizations on complex GEPs"),
+             cl::init(true));
+
+static cl::opt<bool>
+EnablePrefetch("enable-ppc-prefetching",
+               cl::desc("enable software prefetching on PPC"),
+               cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps",
+                      cl::desc("Add extra TOC register dependencies"),
+                      cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableMachineCombinerPass("ppc-machine-combiner",
+                          cl::desc("Enable the machine combiner pass"),
+                          cl::init(true), cl::Hidden);
+
+extern "C" void LLVMInitializePowerPCTarget() {
+  // Register the targets
+  RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
+  RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
+  RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
+
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializePPCBoolRetToIntPass(PR);
+}
+
+/// Return the datalayout string for the given target triple.
+static std::string getDataLayoutString(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
+  std::string Ret;
+
+  // Most PPC* platforms are big endian, PPC64LE is little endian.
+  if (T.getArch() == Triple::ppc64le)
+    Ret = "e";
+  else
+    Ret = "E";
+
+  Ret += DataLayout::getManglingComponent(T);
+
+  // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
+  // pointers.
+  if (!is64Bit || T.getOS() == Triple::Lv2)
+    Ret += "-p:32:32";
+
+  // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+  // documentation are wrong; these are correct (i.e. "what gcc does").
+  if (is64Bit || !T.isOSDarwin())
+    Ret += "-i64:64";
+  else
+    Ret += "-f64:32:64";
+
+  // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
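+  // For illustration (assuming an ELF target), the assembled string comes out
+  // as "e-m:e-i64:64-n32:64" for powerpc64le and "E-m:e-p:32:32-i64:64-n32"
+  // for 32-bit targets.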
+  if (is64Bit)
+    Ret += "-n32:64";
+  else
+    Ret += "-n32";
+
+  return Ret;
+}
+
+static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
+                                      const Triple &TT) {
+  std::string FullFS = FS;
+
+  // Make sure 64-bit features are available when the CPU name is generic
+  if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
+    if (!FullFS.empty())
+      FullFS = "+64bit," + FullFS;
+    else
+      FullFS = "+64bit";
+  }
+
+  if (OL >= CodeGenOpt::Default) {
+    if (!FullFS.empty())
+      FullFS = "+crbits," + FullFS;
+    else
+      FullFS = "+crbits";
+  }
+
+  if (OL != CodeGenOpt::None) {
+    if (!FullFS.empty())
+      FullFS = "+invariant-function-descriptors," + FullFS;
+    else
+      FullFS = "+invariant-function-descriptors";
+  }
+
+  return FullFS;
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  // If it isn't a Mach-O file then it's going to be a Linux ELF
+  // object file.
+  if (TT.isOSDarwin())
+    return make_unique<TargetLoweringObjectFileMachO>();
+
+  return make_unique<PPC64LinuxTargetObjectFile>();
+}
+
+static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
+                                                 const TargetOptions &Options) {
+  if (Options.MCOptions.getABIName().startswith("elfv1"))
+    return PPCTargetMachine::PPC_ABI_ELFv1;
+  else if (Options.MCOptions.getABIName().startswith("elfv2"))
+    return PPCTargetMachine::PPC_ABI_ELFv2;
+
+  assert(Options.MCOptions.getABIName().empty() &&
+         "Unknown target-abi option!");
+
+  if (!TT.isMacOSX()) {
+    switch (TT.getArch()) {
+    case Triple::ppc64le:
+      return PPCTargetMachine::PPC_ABI_ELFv2;
+    case Triple::ppc64:
+      return PPCTargetMachine::PPC_ABI_ELFv1;
+    default:
+      // Fallthrough.
+      ;
+    }
+  }
+  return PPCTargetMachine::PPC_ABI_UNKNOWN;
+}
+
+// The FeatureString here is a little subtle. We are modifying the feature
+// string with what are (currently) non-function specific overrides as it goes
+// into the LLVMTargetMachine constructor and then using the stored value in the
+// Subtarget constructor below it.
+PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
+                        computeFSAdditions(FS, OL, TT), Options, RM, CM, OL),
+      TLOF(createTLOF(getTargetTriple())),
+      TargetABI(computeTargetABI(TT, Options)),
+      Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
+
+  // For the estimates, convergence is quadratic, so we essentially double the
+  // number of correct digits after every iteration. For both FRE and FRSQRTE,
+  // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
+  // this is 2^-14. IEEE float has 23 mantissa bits and double has 52.
+  unsigned RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3,
+           RefinementSteps64 = RefinementSteps + 1;
+
+  this->Options.Reciprocals.setDefaults("sqrtf", true, RefinementSteps);
+  this->Options.Reciprocals.setDefaults("vec-sqrtf", true, RefinementSteps);
+  this->Options.Reciprocals.setDefaults("divf", true, RefinementSteps);
+  this->Options.Reciprocals.setDefaults("vec-divf", true, RefinementSteps);
+
+  this->Options.Reciprocals.setDefaults("sqrtd", true, RefinementSteps64);
+  this->Options.Reciprocals.setDefaults("vec-sqrtd", true, RefinementSteps64);
+  this->Options.Reciprocals.setDefaults("divd", true, RefinementSteps64);
+  this->Options.Reciprocals.setDefaults("vec-divd", true, RefinementSteps64);
+
+  initAsmInfo();
+}
+
+PPCTargetMachine::~PPCTargetMachine() {}
+
+void PPC32TargetMachine::anchor() { }
+
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+void PPC64TargetMachine::anchor() { }
+
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const PPCSubtarget *
+PPCTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  // FIXME: This is related to the code below to reset the target options;
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  bool SoftFloat =
+      F.hasFnAttribute("use-soft-float") &&
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+  // If the soft float attribute is set on the function, turn on the soft float
+  // subtarget feature.
+  if (SoftFloat)
+    FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<PPCSubtarget>(
+        TargetTriple, CPU,
+        // FIXME: It would be good if the subtarget additions here were not
+        // necessary. Anything that turns them on/off (overrides) ends up
+        // being put at the end of the feature string, but the defaults
+        // shouldn't require adding them. Fixing this means pulling
+        // Feature64Bit out of most of the target cpus in the .td file and
+        // making it set only as part of initialization via the TargetTriple.
+        computeFSAdditions(FS, getOptLevel(), getTargetTriple()), *this);
+  }
+  return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// PPC Code Generator Pass Configuration Options.
+class PPCPassConfig : public TargetPassConfig { +public: + PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + PPCTargetMachine &getPPCTargetMachine() const { + return getTM<PPCTargetMachine>(); + } + + void addIRPasses() override; + bool addPreISel() override; + bool addILPOpts() override; + bool addInstSelector() override; + void addMachineSSAOptimization() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; +} // namespace + +TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { + return new PPCPassConfig(this, PM); +} + +void PPCPassConfig::addIRPasses() { + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createPPCBoolRetToIntPass()); + addPass(createAtomicExpandPass(&getPPCTargetMachine())); + + // For the BG/Q (or if explicitly requested), add explicit data prefetch + // intrinsics. + bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && + getOptLevel() != CodeGenOpt::None; + if (EnablePrefetch.getNumOccurrences() > 0) + UsePrefetching = EnablePrefetch; + if (UsePrefetching) + addPass(createPPCLoopDataPrefetchPass()); + + if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { + // Call SeparateConstOffsetFromGEP pass to extract constants within indices + // and lower a GEP with multiple indices to either arithmetic operations or + // multiple GEPs with single index. + addPass(createSeparateConstOffsetFromGEPPass(TM, true)); + // Call EarlyCSE pass to find and remove subexpressions in the lowered + // result. + addPass(createEarlyCSEPass()); + // Do loop invariant code motion in case part of the lowered result is + // invariant. + addPass(createLICMPass()); + } + + TargetPassConfig::addIRPasses(); +} + +bool PPCPassConfig::addPreISel() { + if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None) + addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); + + if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) + addPass(createPPCCTRLoops(getPPCTargetMachine())); + + return false; +} + +bool PPCPassConfig::addILPOpts() { + addPass(&EarlyIfConverterID); + + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); + + return true; +} + +bool PPCPassConfig::addInstSelector() { + // Install an instruction selector. + addPass(createPPCISelDag(getPPCTargetMachine())); + +#ifndef NDEBUG + if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) + addPass(createPPCCTRLoopsVerify()); +#endif + + addPass(createPPCVSXCopyPass()); + return false; +} + +void PPCPassConfig::addMachineSSAOptimization() { + TargetPassConfig::addMachineSSAOptimization(); + // For little endian, remove where possible the vector swap instructions + // introduced at code generation to normalize vector element order. + if (TM->getTargetTriple().getArch() == Triple::ppc64le && + !DisableVSXSwapRemoval) + addPass(createPPCVSXSwapRemovalPass()); + // Target-specific peephole cleanups performed after instruction + // selection. + if (!DisableMIPeephole) { + addPass(createPPCMIPeepholePass()); + addPass(&DeadMachineInstructionElimID); + } +} + +void PPCPassConfig::addPreRegAlloc() { + initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); + insertPass(VSXFMAMutateEarly ? 
&RegisterCoalescerID : &MachineSchedulerID, + &PPCVSXFMAMutateID); + if (getPPCTargetMachine().getRelocationModel() == Reloc::PIC_) + addPass(createPPCTLSDynamicCallPass()); + if (EnableExtraTOCRegDeps) + addPass(createPPCTOCRegDepsPass()); +} + +void PPCPassConfig::addPreSched2() { + if (getOptLevel() != CodeGenOpt::None) + addPass(&IfConverterID); +} + +void PPCPassConfig::addPreEmitPass() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createPPCEarlyReturnPass(), false); + // Must run branch selection immediately preceding the asm printer. + addPass(createPPCBranchSelectionPass(), false); +} + +TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(PPCTTIImpl(this, F)); + }); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h new file mode 100644 index 0000000..6496339 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h @@ -0,0 +1,84 @@ +//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H +#define LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H + +#include "PPCInstrInfo.h" +#include "PPCSubtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets. +/// +class PPCTargetMachine : public LLVMTargetMachine { +public: + enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 }; +private: + std::unique_ptr<TargetLoweringObjectFile> TLOF; + PPCABI TargetABI; + PPCSubtarget Subtarget; + + mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap; + +public: + PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + ~PPCTargetMachine() override; + + const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + + // Pass Pipeline Configuration + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + + TargetIRAnalysis getTargetIRAnalysis() override; + + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF.get(); + } + bool isELFv2ABI() const { return TargetABI == PPC_ABI_ELFv2; } + bool isPPC64() const { + const Triple &TT = getTargetTriple(); + return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); + }; +}; + +/// PPC32TargetMachine - PowerPC 32-bit target machine. +/// +class PPC32TargetMachine : public PPCTargetMachine { + virtual void anchor(); +public: + PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +/// PPC64TargetMachine - PowerPC 64-bit target machine. 
+/// +class PPC64TargetMachine : public PPCTargetMachine { + virtual void anchor(); +public: + PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp new file mode 100644 index 0000000..798bb9d --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp @@ -0,0 +1,61 @@ +//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "PPCTargetObjectFile.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" + +using namespace llvm; + +void +PPC64LinuxTargetObjectFile:: +Initialize(MCContext &Ctx, const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const { + // Here override ReadOnlySection to DataRelROSection for PPC64 SVR4 ABI + // when we have a constant that contains global relocations. This is + // necessary because of this ABI's handling of pointers to functions in + // a shared library. The address of a function is actually the address + // of a function descriptor, which resides in the .opd section. Generated + // code uses the descriptor directly rather than going via the GOT as some + // other ABIs do, which means that initialized function pointers must + // reference the descriptor. The linker must convert copy relocs of + // pointers to functions in shared libraries into dynamic relocations, + // because of an ordering problem with initialization of copy relocs and + // PLT entries. The dynamic relocation will be initialized by the dynamic + // linker, so we must use DataRelROSection instead of ReadOnlySection. + // For more information, see the description of ELIMINATE_COPY_RELOCS in + // GNU ld. + if (Kind.isReadOnly()) { + const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + + if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation()) + Kind = SectionKind::getReadOnlyWithRel(); + } + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, + Mang, TM); +} + +const MCExpr *PPC64LinuxTargetObjectFile:: +getDebugThreadLocalSymbol(const MCSymbol *Sym) const { + const MCExpr *Expr = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PPC_DTPREL, getContext()); + return MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(0x8000, getContext()), + getContext()); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h new file mode 100644 index 0000000..d248791 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h @@ -0,0 +1,35 @@ +//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  /// PPC64LinuxTargetObjectFile - This implementation is used for
+  /// 64-bit PowerPC Linux.
+  class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                                      Mangler &Mang,
+                                      const TargetMachine &TM) const override;
+
+    /// \brief Describe a TLS variable address within debug info.
+    const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+  };
+
+}  // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
new file mode 100644
index 0000000..dbe7617
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- PPCTargetStreamer.h - PPC Target Streamer ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class PPCTargetStreamer : public MCTargetStreamer {
+public:
+  PPCTargetStreamer(MCStreamer &S);
+  ~PPCTargetStreamer() override;
+  virtual void emitTCEntry(const MCSymbol &S) = 0;
+  virtual void emitMachine(StringRef CPU) = 0;
+  virtual void emitAbiVersion(int AbiVersion) = 0;
+  virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
new file mode 100644
index 0000000..cd86dab
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -0,0 +1,413 @@
+//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppctti"
+
+static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
+cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+//
+// PPC cost model.
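+//
+// (In the immediate-cost helpers below, values are multiples of
+// TTI::TCC_Basic, and TTI::TCC_Free means the constant can be folded into
+// the using instruction for free.)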
+// +//===----------------------------------------------------------------------===// + +TargetTransformInfo::PopcntSupportKind +PPCTTIImpl::getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + if (ST->hasPOPCNTD() && TyWidth <= 64) + return TTI::PSK_FastHardware; + return TTI::PSK_Software; +} + +int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { + if (DisablePPCConstHoist) + return BaseT::getIntImmCost(Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + if (Imm == 0) + return TTI::TCC_Free; + + if (Imm.getBitWidth() <= 64) { + if (isInt<16>(Imm.getSExtValue())) + return TTI::TCC_Basic; + + if (isInt<32>(Imm.getSExtValue())) { + // A constant that can be materialized using lis. + if ((Imm.getZExtValue() & 0xFFFF) == 0) + return TTI::TCC_Basic; + + return 2 * TTI::TCC_Basic; + } + } + + return 4 * TTI::TCC_Basic; +} + +int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { + if (DisablePPCConstHoist) + return BaseT::getIntImmCost(IID, Idx, Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + switch (IID) { + default: + return TTI::TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue())) + return TTI::TCC_Free; + break; + case Intrinsic::experimental_stackmap: + if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + case Intrinsic::experimental_patchpoint_void: + case Intrinsic::experimental_patchpoint_i64: + if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + } + return PPCTTIImpl::getIntImmCost(Imm, Ty); +} + +int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { + if (DisablePPCConstHoist) + return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + unsigned ImmIdx = ~0U; + bool ShiftedFree = false, RunFree = false, UnsignedFree = false, + ZeroFree = false; + switch (Opcode) { + default: + return TTI::TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. This prevents the + // creation of new constants for every base constant that gets constant + // folded with the offset. + if (Idx == 0) + return 2 * TTI::TCC_Basic; + return TTI::TCC_Free; + case Instruction::And: + RunFree = true; // (for the rotate-and-mask instructions) + // Fallthrough... + case Instruction::Add: + case Instruction::Or: + case Instruction::Xor: + ShiftedFree = true; + // Fallthrough... + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + ImmIdx = 1; + break; + case Instruction::ICmp: + UnsignedFree = true; + ImmIdx = 1; + // Fallthrough... 
(zero comparisons can use record-form instructions)
+  case Instruction::Select:
+    ZeroFree = true;
+    break;
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Ret:
+  case Instruction::Load:
+  case Instruction::Store:
+    break;
+  }
+
+  if (ZeroFree && Imm == 0)
+    return TTI::TCC_Free;
+
+  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
+    if (isInt<16>(Imm.getSExtValue()))
+      return TTI::TCC_Free;
+
+    if (RunFree) {
+      if (Imm.getBitWidth() <= 32 &&
+          (isShiftedMask_32(Imm.getZExtValue()) ||
+           isShiftedMask_32(~Imm.getZExtValue())))
+        return TTI::TCC_Free;
+
+      if (ST->isPPC64() &&
+          (isShiftedMask_64(Imm.getZExtValue()) ||
+           isShiftedMask_64(~Imm.getZExtValue())))
+        return TTI::TCC_Free;
+    }
+
+    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
+      return TTI::TCC_Free;
+
+    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
+      return TTI::TCC_Free;
+  }
+
+  return PPCTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+void PPCTTIImpl::getUnrollingPreferences(Loop *L,
+                                         TTI::UnrollingPreferences &UP) {
+  if (ST->getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 is in-order with a deep pipeline, and concatenation unrolling
+    // helps expose latency-hiding opportunities to the instruction scheduler.
+    UP.Partial = UP.Runtime = true;
+
+    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
+    // often outweigh the cost of a division to compute the trip count.
+    UP.AllowExpensiveTripCount = true;
+  }
+
+  BaseT::getUnrollingPreferences(L, UP);
+}
+
+bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
+  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
+  // on combining the loads generated for consecutive accesses, and failure to
+  // do so is particularly expensive. Aggressive unrolling makes that combining
+  // much more likely (compared to only using concatenation unrolling).
+  if (ST->getDarwinDirective() == PPC::DIR_A2)
+    return true;
+
+  return LoopHasReductions;
+}
+
+bool PPCTTIImpl::enableInterleavedAccessVectorization() {
+  return true;
+}
+
+unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
+  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
+    return 0;
+  return ST->hasVSX() ? 64 : 32;
+}
+
+unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
+  if (Vector) {
+    if (ST->hasQPX()) return 256;
+    if (ST->hasAltivec()) return 128;
+    return 0;
+  }
+
+  if (ST->isPPC64())
+    return 64;
+  return 32;
+}
+
+unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  unsigned Directive = ST->getDarwinDirective();
+  // The 440 has no SIMD support, but floating-point instructions
+  // have a 5-cycle latency, so unroll by 5x for latency hiding.
+  if (Directive == PPC::DIR_440)
+    return 5;
+
+  // The A2 has no SIMD support, but floating-point instructions
+  // have a 6-cycle latency, so unroll by 6x for latency hiding.
+  if (Directive == PPC::DIR_A2)
+    return 6;
+
+  // FIXME: For lack of any better information, do no harm...
+  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
+    return 1;
+
+  // For P7 and P8, floating-point instructions have a 6-cycle latency and
+  // there are two execution units, so unroll by 12x for latency hiding.
+  if (Directive == PPC::DIR_PWR7 ||
+      Directive == PPC::DIR_PWR8)
+    return 12;
+
+  // For most things, modern systems have two execution units (and
+  // out-of-order execution).
+  return 2;
+}
+
+int PPCTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  // Fallback to the default implementation.
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                       Opd1PropInfo, Opd2PropInfo);
+}
+
+int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                               Type *SubTp) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). We need one such shuffle instruction for each actual
+  // register (this is not true for arbitrary shuffles, but is true for the
+  // structured types of shuffles covered by TTI::ShuffleKind).
+  return LT.first;
+}
+
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
+    // Double-precision scalars are already located in index #0.
+    if (Index == 0)
+      return 0;
+
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
+    // Floating point scalars are already located in index #0.
+    if (Index == 0)
+      return 0;
+
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  }
+
+  // Estimated cost of a load-hit-store delay.  This was obtained
+  // experimentally as a minimum needed to prevent unprofitable
+  // vectorization for the paq8p benchmark.  It may need to be
+  // raised further if other unprofitable cases remain.
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
+
+  // Vector element insert/extract with Altivec is very expensive,
+  // because they require store and reload with the attendant
+  // processor stall for load-hit-store.  Until VSX is available,
+  // these need to be estimated as very costly.
+  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
+      ISD == ISD::INSERT_VECTOR_ELT)
+    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                unsigned AddressSpace) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+  // Aligned loads and stores are easy.
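+  // (Illustrative case: a legalized v4i32 access has a store size of 16
+  // bytes, so any alignment of 16 or more simply returns the base cost
+  // computed above.)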
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+    return Cost;
+
+  bool IsAltivecType = ST->hasAltivec() &&
+                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
+                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
+  bool IsVSXType = ST->hasVSX() &&
+                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
+  bool IsQPXType = ST->hasQPX() &&
+                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
+
+  // If we can use the permutation-based load sequence, then this is also
+  // relatively cheap (not counting loop-invariant instructions): one load plus
+  // one permute (the last load in a series has extra cost, but we're
+  // neglecting that here). Note that on the P7, we should do unaligned loads
+  // for Altivec types using the VSX instructions, but that's more expensive
+  // than using the permutation-based load sequence. On the P8, that's no
+  // longer true.
+  if (Opcode == Instruction::Load &&
+      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+      Alignment >= LT.second.getScalarType().getStoreSize())
+    return Cost + LT.first; // Add the cost of the permutations.
+
+  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
+  // P7, unaligned vector loads are more expensive than the permutation-based
+  // load sequence, so that might be used instead, but regardless, the net cost
+  // is about the same (not counting loop-invariant instructions).
+  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
+    return Cost;
+
+  // PPC in general does not support unaligned loads and stores. They'll need
+  // to be decomposed based on the alignment factor.
+
+  // Add the cost of each scalar load or store.
+  Cost += LT.first*(SrcBytes/Alignment-1);
+
+  // For a vector type, there is also scalarization overhead (only for
+  // stores, loads are expanded using the vector-load + permutation sequence,
+  // which is much less expensive).
+  if (Src->isVectorTy() && Opcode == Instruction::Store)
+    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+
+  return Cost;
+}
+
+int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+
+  // First, the cost of the load/store operation.
+  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). For each result vector, we need one shuffle per incoming
+  // vector (except that the first shuffle can take two incoming vectors
+  // because it does not need to take itself).
+  Cost += Factor*(LT.first-1);
+
+  return Cost;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
new file mode 100644
index 0000000..04c1b02
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -0,0 +1,97 @@
+//===-- PPCTargetTransformInfo.h - PPC specific TTI -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific
+/// to the PPC target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
+  typedef BasicTTIImplBase<PPCTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const PPCSubtarget *ST;
+  const PPCTargetLowering *TLI;
+
+  const PPCSubtarget *getST() const { return ST; }
+  const PPCTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  PPCTTIImpl(const PPCTTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  PPCTTIImpl(PPCTTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  using BaseT::getIntImmCost;
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                    Type *Ty);
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool enableInterleavedAccessVectorization();
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getMaxInterleaveFactor(unsigned VF);
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor,
+                                 ArrayRef<unsigned> Indices,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
new file mode 100644
index 0000000..782583c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -0,0 +1,188 @@
+//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization
----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A pass which deals with the complexity of generating legal VSX register +// copies to/from register classes which partially overlap with the VSX +// register file. +// +//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" +#include "PPCHazardRecognizers.h" +#include "PPCInstrBuilder.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-vsx-copy" + +namespace llvm { + void initializePPCVSXCopyPass(PassRegistry&); +} + +namespace { + // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers + // (Altivec and scalar floating-point registers), we need to transform the + // copies into subregister copies with other restrictions. + struct PPCVSXCopy : public MachineFunctionPass { + static char ID; + PPCVSXCopy() : MachineFunctionPass(ID) { + initializePPCVSXCopyPass(*PassRegistry::getPassRegistry()); + } + + const TargetInstrInfo *TII; + + bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC, + MachineRegisterInfo &MRI) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->hasSubClassEq(MRI.getRegClass(Reg)); + } else if (RC->contains(Reg)) { + return true; + } + + return false; + } + + bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI); + } + + bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI); + } + + bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI); + } + + bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI); + } + + bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI); + } + +protected: + bool processBlock(MachineBasicBlock &MBB) { + bool Changed = false; + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); + I != IE; ++I) { + MachineInstr *MI = I; + if (!MI->isFullCopy()) + continue; + + MachineOperand &DstMO = MI->getOperand(0); + MachineOperand &SrcMO = MI->getOperand(1); + + if ( IsVSReg(DstMO.getReg(), MRI) && + !IsVSReg(SrcMO.getReg(), MRI)) { + // This is a copy *to* a VSX register from a non-VSX register. + Changed = true; + + const TargetRegisterClass *SrcRC = + IsVRReg(SrcMO.getReg(), MRI) ? 
&PPC::VSHRCRegClass : + &PPC::VSLRCRegClass; + assert((IsF8Reg(SrcMO.getReg(), MRI) || + IsVRReg(SrcMO.getReg(), MRI) || + IsVSSReg(SrcMO.getReg(), MRI) || + IsVSFReg(SrcMO.getReg(), MRI)) && + "Unknown source for a VSX copy"); + + unsigned NewVReg = MRI.createVirtualRegister(SrcRC); + BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg) + .addImm(1) // add 1, not 0, because there is no implicit clearing + // of the high bits. + .addOperand(SrcMO) + .addImm(IsVRReg(SrcMO.getReg(), MRI) ? PPC::sub_128 : + PPC::sub_64); + + // The source of the original copy is now the new virtual register. + SrcMO.setReg(NewVReg); + } else if (!IsVSReg(DstMO.getReg(), MRI) && + IsVSReg(SrcMO.getReg(), MRI)) { + // This is a copy *from* a VSX register to a non-VSX register. + Changed = true; + + const TargetRegisterClass *DstRC = + IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass : + &PPC::VSLRCRegClass; + assert((IsF8Reg(DstMO.getReg(), MRI) || + IsVSFReg(DstMO.getReg(), MRI) || + IsVSSReg(DstMO.getReg(), MRI) || + IsVRReg(DstMO.getReg(), MRI)) && + "Unknown destination for a VSX copy"); + + // Copy the VSX value into a new VSX register of the correct subclass. + unsigned NewVReg = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVReg) + .addOperand(SrcMO); + + // Transform the original copy into a subregister extraction copy. + SrcMO.setReg(NewVReg); + SrcMO.setSubReg(IsVRReg(DstMO.getReg(), MRI) ? PPC::sub_128 : + PPC::sub_64); + } + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + // If we don't have VSX on the subtarget, don't do anything. + const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>(); + if (!STI.hasVSX()) + return false; + TII = STI.getInstrInfo(); + + bool Changed = false; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE, + "PowerPC VSX Copy Legalization", false, false) + +char PPCVSXCopy::ID = 0; +FunctionPass* +llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); } + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp new file mode 100644 index 0000000..6b19a2f --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -0,0 +1,372 @@ +//===--------------- PPCVSXFMAMutate.cpp - VSX FMA Mutation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass mutates the form of VSX FMA instructions to avoid unnecessary +// copies. 
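+//
+// For example (illustrative operands), the A-form
+//   xsmaddadp XT, XA, XB    ; XT <- XA*XB + XT  (the addend lives in XT)
+// clobbers its addend, while the M-form
+//   xsmaddmdp XT, XA, XB    ; XT <- XA*XT + XB  (a multiplicand lives in XT)
+// clobbers a multiplicand instead; when one of the product operands is
+// killed by the FMA anyway, using the M-form avoids first copying the
+// addend into the destination register.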
+// +//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation", +cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden); + +#define DEBUG_TYPE "ppc-vsx-fma-mutate" + +namespace llvm { namespace PPC { + int getAltVSXFMAOpcode(uint16_t Opcode); +} } + +namespace { + // PPCVSXFMAMutate pass - Mutates A-type VSX FMA instructions (which + // overwrite their addend) into M-type instructions (which overwrite a + // product operand) when doing so renders a copy of the addend + // unnecessary. + struct PPCVSXFMAMutate : public MachineFunctionPass { + static char ID; + PPCVSXFMAMutate() : MachineFunctionPass(ID) { + initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); + } + + LiveIntervals *LIS; + const PPCInstrInfo *TII; + +protected: + bool processBlock(MachineBasicBlock &MBB) { + bool Changed = false; + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); + for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); + I != IE; ++I) { + MachineInstr *MI = I; + + // The default (A-type) VSX FMA form kills the addend (it is taken from + // the target register, which is then updated to reflect the result of + // the FMA). If the instruction, however, kills one of the registers + // used for the product, then we can use the M-form instruction (which + // will take that value from the to-be-defined register). + + int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode()); + if (AltOpc == -1) + continue; + + // This pass is run after register coalescing, and so we're looking for + // a situation like this: + // ... + // %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9 + // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16, + // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16 + // ... + // %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19, + // %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19 + // ... + // Where we can eliminate the copy by changing from the A-type to the + // M-type instruction.
Specifically, for this example, this means: + // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16, + // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16 + // is replaced by: + // %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg17, %vreg9, + // %RM<imp-use>; VSLRC:%vreg16,%vreg17,%vreg9 + // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9 + + SlotIndex FMAIdx = LIS->getInstructionIndex(MI); + + VNInfo *AddendValNo = + LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn(); + + // This can be null if the register is undef. + if (!AddendValNo) + continue; + + MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def); + + // The addend and this instruction must be in the same block. + + if (!AddendMI || AddendMI->getParent() != MI->getParent()) + continue; + + // The addend must be a full copy within the same register class. + + if (!AddendMI->isFullCopy()) + continue; + + unsigned AddendSrcReg = AddendMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) { + if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) != + MRI.getRegClass(AddendSrcReg)) + continue; + } else { + // If AddendSrcReg is a physical register, make sure the destination + // register class contains it. + if (!MRI.getRegClass(AddendMI->getOperand(0).getReg()) + ->contains(AddendSrcReg)) + continue; + } + + // In theory, there could be other uses of the addend copy before this + // fma. We could deal with this, but that would require additional + // logic below and I suspect it will not occur in any relevant + // situations. Additionally, check whether the copy source is killed + // prior to the fma. In order to replace the addend here with the + // source of the copy, it must still be live here. We can't use + // interval testing for a physical register, so as long as we're + // walking the MIs we may as well test liveness here. + // + // FIXME: There is a case that occurs in practice, like this: + // %vreg9<def> = COPY %F1; VSSRC:%vreg9 + // ... + // %vreg6<def> = COPY %vreg9; VSSRC:%vreg6,%vreg9 + // %vreg7<def> = COPY %vreg9; VSSRC:%vreg7,%vreg9 + // %vreg9<def,tied1> = XSMADDASP %vreg9<tied0>, %vreg1, %vreg4; VSSRC: + // %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg1, %vreg2; VSSRC: + // %vreg7<def,tied1> = XSMADDASP %vreg7<tied0>, %vreg1, %vreg3; VSSRC: + // which prevents an otherwise-profitable transformation. + bool OtherUsers = false, KillsAddendSrc = false; + for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI); + J != JE; --J) { + if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) { + OtherUsers = true; + break; + } + if (J->modifiesRegister(AddendSrcReg, TRI) || + J->killsRegister(AddendSrcReg, TRI)) { + KillsAddendSrc = true; + break; + } + } + + if (OtherUsers || KillsAddendSrc) + continue; + + // Find one of the product operands that is killed by this instruction. + + unsigned KilledProdOp = 0, OtherProdOp = 0; + if (LIS->getInterval(MI->getOperand(2).getReg()) + .Query(FMAIdx).isKill()) { + KilledProdOp = 2; + OtherProdOp = 3; + } else if (LIS->getInterval(MI->getOperand(3).getReg()) + .Query(FMAIdx).isKill()) { + KilledProdOp = 3; + OtherProdOp = 2; + } + + // If there are no killed product operands, then this transformation is + // likely not profitable. + if (!KilledProdOp) + continue; + + // If the addend copy is used only by this MI, then the addend source + // register is likely not live here.
This could be fixed (based on the + // legality checks above, the live range for the addend source register + // could be extended), but it seems likely that such a trivial copy can + // be coalesced away later, and thus is not worth the effort. + if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && + !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) + continue; + + // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3. + + unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg(); + unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg(); + + unsigned AddSubReg = AddendMI->getOperand(1).getSubReg(); + unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg(); + unsigned OtherProdSubReg = MI->getOperand(OtherProdOp).getSubReg(); + + bool AddRegKill = AddendMI->getOperand(1).isKill(); + bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill(); + bool OtherProdRegKill = MI->getOperand(OtherProdOp).isKill(); + + bool AddRegUndef = AddendMI->getOperand(1).isUndef(); + bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef(); + bool OtherProdRegUndef = MI->getOperand(OtherProdOp).isUndef(); + + unsigned OldFMAReg = MI->getOperand(0).getReg(); + + // The transformation doesn't work well with things like: + // %vreg5 = A-form-op %vreg5, %vreg11, %vreg5; + // so leave such things alone. + if (OldFMAReg == KilledProdReg) + continue; + + // If there isn't a class that fits, we can't perform the transform. + // This is needed for correctness with a mixture of VSX and Altivec + // instructions to make sure that a low VSX register is not assigned to + // the Altivec instruction. + if (!MRI.constrainRegClass(KilledProdReg, + MRI.getRegClass(OldFMAReg))) + continue; + + assert(OldFMAReg == AddendMI->getOperand(0).getReg() && + "Addend copy not tied to old FMA output!"); + + DEBUG(dbgs() << "VSX FMA Mutation:\n " << *MI;); + + MI->getOperand(0).setReg(KilledProdReg); + MI->getOperand(1).setReg(KilledProdReg); + MI->getOperand(3).setReg(AddendSrcReg); + MI->getOperand(2).setReg(OtherProdReg); + + MI->getOperand(0).setSubReg(KilledProdSubReg); + MI->getOperand(1).setSubReg(KilledProdSubReg); + MI->getOperand(3).setSubReg(AddSubReg); + MI->getOperand(2).setSubReg(OtherProdSubReg); + + MI->getOperand(1).setIsKill(KilledProdRegKill); + MI->getOperand(3).setIsKill(AddRegKill); + MI->getOperand(2).setIsKill(OtherProdRegKill); + + MI->getOperand(1).setIsUndef(KilledProdRegUndef); + MI->getOperand(3).setIsUndef(AddRegUndef); + MI->getOperand(2).setIsUndef(OtherProdRegUndef); + + MI->setDesc(TII->get(AltOpc)); + + DEBUG(dbgs() << " -> " << *MI); + + // The killed product operand was killed here, so we can reuse it now + // for the result of the fma. + + LiveInterval &FMAInt = LIS->getInterval(OldFMAReg); + VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot()); + for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end(); + UI != UE;) { + MachineOperand &UseMO = *UI; + MachineInstr *UseMI = UseMO.getParent(); + ++UI; + + // Don't replace the result register of the copy we're about to erase. + if (UseMI == AddendMI) + continue; + + UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI); + } + + // Extend the live intervals of the killed product operand to hold the + // fma result. + + LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg); + for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end(); + AI != AE; ++AI) { + // Don't add the segment that corresponds to the original copy. 
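+ // (That segment holds the value defined by AddendMI itself; after the + // mutation its role is covered by the addend source register instead.)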
+ if (AI->valno == AddendValNo) + continue; + + VNInfo *NewFMAValNo = + NewFMAInt.getNextValue(AI->start, + LIS->getVNInfoAllocator()); + + NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end, + NewFMAValNo)); + } + DEBUG(dbgs() << " extended: " << NewFMAInt << '\n'); + + // Extend the live interval of the addend source (it might end at the + // copy to be removed, or somewhere in between there and here). This + // is necessary only if it is a physical register. + if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) + for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid(); + ++Units) { + unsigned Unit = *Units; + + LiveRange &AddendSrcRange = LIS->getRegUnit(Unit); + AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB), + FMAIdx.getRegSlot()); + DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n'); + } + + FMAInt.removeValNo(FMAValNo); + DEBUG(dbgs() << " trimmed: " << FMAInt << '\n'); + + // Remove the (now unused) copy. + + DEBUG(dbgs() << " removing: " << *AddendMI << '\n'); + LIS->RemoveMachineInstrFromMaps(AddendMI); + AddendMI->eraseFromParent(); + + Changed = true; + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + // If we don't have VSX then go ahead and return without doing + // anything. + const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>(); + if (!STI.hasVSX()) + return false; + + LIS = &getAnalysis<LiveIntervals>(); + + TII = STI.getInstrInfo(); + + bool Changed = false; + + if (DisableVSXFMAMutate) + return Changed; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addRequired<SlotIndexes>(); + AU.addPreserved<SlotIndexes>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE, + "PowerPC VSX FMA Mutation", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE, + "PowerPC VSX FMA Mutation", false, false) + +char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID; + +char PPCVSXFMAMutate::ID = 0; +FunctionPass *llvm::createPPCVSXFMAMutatePass() { + return new PPCVSXFMAMutate(); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp new file mode 100644 index 0000000..27c540f --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -0,0 +1,1003 @@ +//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass analyzes vector computations and removes unnecessary +// doubleword swaps (xxswapd instructions). This pass is performed +// only for little-endian VSX code generation. +// +// For this specific case, loads and stores of v4i32, v4f32, v2i64, +// and v2f64 vectors are inefficient. These are implemented using +// the lxvd2x and stxvd2x instructions, which invert the order of +// doublewords in a vector register. Thus code generation inserts +// an xxswapd after each such load, and prior to each such store. 
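+// +// To illustrate the pattern (an informal sketch; operand details +// elided), a little-endian v2f64 load is emitted roughly as: +// %vA = LXVD2X %base ; doublewords arrive in reversed order +// %vB = XXPERMDI %vA, %vA, 2 ; xxswapd restores element order +// and a store is the mirror image: an xxswapd feeding an stxvd2x.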
+// +// The extra xxswapd instructions reduce performance. The purpose +// of this pass is to reduce the number of xxswapd instructions +// required for correctness. +// +// The primary insight is that much code that operates on vectors +// does not care about the relative order of elements in a register, +// so long as the correct memory order is preserved. If we have a +// computation where all input values are provided by lxvd2x/xxswapd, +// all outputs are stored using xxswapd/stxvd2x, and all intermediate +// computations are lane-insensitive (independent of element order), +// then all the xxswapd instructions associated with the loads and +// stores may be removed without changing observable semantics. +// +// This pass uses standard equivalence class infrastructure to create +// maximal webs of computations fitting the above description. Each +// such web is then optimized by removing its unnecessary xxswapd +// instructions. +// +// There are some lane-sensitive operations for which we can still +// permit the optimization, provided we modify those operations +// accordingly. Such operations are identified as using "special +// handling" within this module. +// +//===---------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-vsx-swaps" + +namespace llvm { + void initializePPCVSXSwapRemovalPass(PassRegistry&); +} + +namespace { + +// A PPCVSXSwapEntry is created for each machine instruction that +// is relevant to a vector computation. +struct PPCVSXSwapEntry { + // Pointer to the instruction. + MachineInstr *VSEMI; + + // Unique ID (position in the swap vector). + int VSEId; + + // Attributes of this node. + unsigned int IsLoad : 1; + unsigned int IsStore : 1; + unsigned int IsSwap : 1; + unsigned int MentionsPhysVR : 1; + unsigned int IsSwappable : 1; + unsigned int MentionsPartialVR : 1; + unsigned int SpecialHandling : 3; + unsigned int WebRejected : 1; + unsigned int WillRemove : 1; +}; + +enum SHValues { + SH_NONE = 0, + SH_EXTRACT, + SH_INSERT, + SH_NOSWAP_LD, + SH_NOSWAP_ST, + SH_SPLAT, + SH_XXPERMDI, + SH_COPYWIDEN +}; + +struct PPCVSXSwapRemoval : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + // Swap entries are allocated in a vector for better performance. + std::vector<PPCVSXSwapEntry> SwapVector; + + // A mapping is maintained between machine instructions and + // their swap entries. The key is the address of the MI. + DenseMap<MachineInstr*, int> SwapMap; + + // Equivalence classes are used to gather webs of related computation. + // Swap entries are represented by their VSEId fields. + EquivalenceClasses<int> *EC; + + PPCVSXSwapRemoval() : MachineFunctionPass(ID) { + initializePPCVSXSwapRemovalPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize data structures. + void initialize(MachineFunction &MFParm); + + // Walk the machine instructions to gather vector usage information. + // Return true iff vector mentions are present.
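+ // (A "mention" is any operand in a vector register class, either a + // full vector class (VSRC, VRRC) or a scalar floating-point subset + // (VSFRC, VSSRC); see isAnyVecReg below.)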
+ bool gatherVectorInstructions(); + + // Add an entry to the swap vector and swap map. + int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry); + + // Hunt backwards through COPY and SUBREG_TO_REG chains for a + // source register. VecIdx indicates the swap vector entry to + // mark as mentioning a physical register if the search leads + // to one. + unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx); + + // Generate equivalence classes for related computations (webs). + void formWebs(); + + // Analyze webs and determine those that cannot be optimized. + void recordUnoptimizableWebs(); + + // Record which swap instructions can be safely removed. + void markSwapsForRemoval(); + + // Remove swaps and update other instructions requiring special + // handling. Return true iff any changes are made. + bool removeSwaps(); + + // Insert a swap instruction from SrcReg to DstReg at the given + // InsertPoint. + void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg); + + // Update instructions requiring special handling. + void handleSpecialSwappables(int EntryIdx); + + // Dump a description of the entries in the swap vector. + void dumpSwapVector(); + + // Return true iff the given register is in the given class. + bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return RC->hasSubClassEq(MRI->getRegClass(Reg)); + return RC->contains(Reg); + } + + // Return true iff the given register is a full vector register. + bool isVecReg(unsigned Reg) { + return (isRegInClass(Reg, &PPC::VSRCRegClass) || + isRegInClass(Reg, &PPC::VRRCRegClass)); + } + + // Return true iff the given register is a partial vector register. + bool isScalarVecReg(unsigned Reg) { + return (isRegInClass(Reg, &PPC::VSFRCRegClass) || + isRegInClass(Reg, &PPC::VSSRCRegClass)); + } + + // Return true iff the given register mentions all or part of a + // vector register. Also sets Partial to true if the mention + // is for just the floating-point register overlap of the register. + bool isAnyVecReg(unsigned Reg, bool &Partial) { + if (isScalarVecReg(Reg)) + Partial = true; + return isScalarVecReg(Reg) || isVecReg(Reg); + } + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + // If we don't have VSX on the subtarget, don't do anything. + const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>(); + if (!STI.hasVSX()) + return false; + + bool Changed = false; + initialize(MF); + + if (gatherVectorInstructions()) { + formWebs(); + recordUnoptimizableWebs(); + markSwapsForRemoval(); + Changed = removeSwaps(); + } + + // FIXME: See the allocation of EC in initialize(). + delete EC; + return Changed; + } +}; + +// Initialize data structures for this pass. In particular, clear the +// swap vector and allocate the equivalence class mapping before +// processing each function. +void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); + + // An initial vector size of 256 appears to work well in practice. + // Small/medium functions with vector content tend not to incur a + // reallocation at this size. Three of the vector tests in + // projects/test-suite reallocate, which seems like a reasonable rate. 
+ const int InitialVectorSize(256); + SwapVector.clear(); + SwapVector.reserve(InitialVectorSize); + + // FIXME: Currently we allocate EC each time because we don't have + // access to the set representation on which to call clear(). Should + // consider adding a clear() method to the EquivalenceClasses class. + EC = new EquivalenceClasses<int>; +} + +// Create an entry in the swap vector for each instruction that mentions +// a full vector register, recording various characteristics of the +// instructions there. +bool PPCVSXSwapRemoval::gatherVectorInstructions() { + bool RelevantFunction = false; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + + if (MI.isDebugValue()) + continue; + + bool RelevantInstr = false; + bool Partial = false; + + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (isAnyVecReg(Reg, Partial)) { + RelevantInstr = true; + break; + } + } + + if (!RelevantInstr) + continue; + + RelevantFunction = true; + + // Create a SwapEntry initialized to zeros, then fill in the + // instruction and ID fields before pushing it to the back + // of the swap vector. + PPCVSXSwapEntry SwapEntry{}; + int VecIdx = addSwapEntry(&MI, SwapEntry); + + switch(MI.getOpcode()) { + default: + // Unless noted otherwise, an instruction is considered + // safe for the optimization. There are a large number of + // such true-SIMD instructions (all vector math, logical, + // select, compare, etc.). However, if the instruction + // mentions a partial vector register and does not have + // special handling defined, it is not swappable. + if (Partial) + SwapVector[VecIdx].MentionsPartialVR = 1; + else + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::XXPERMDI: { + // This is a swap if it is of the form XXPERMDI t, s, s, 2. + // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we + // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2, + // for example. We have to look through chains of COPY and + // SUBREG_TO_REG to find the real source value for comparison. + // If the real source value is a physical register, then mark the + // XXPERMDI as mentioning a physical register. + int immed = MI.getOperand(3).getImm(); + if (immed == 2) { + unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(), + VecIdx); + unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(), + VecIdx); + if (trueReg1 == trueReg2) + SwapVector[VecIdx].IsSwap = 1; + else { + // We can still handle these if the two registers are not + // identical, by adjusting the form of the XXPERMDI. + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; + } + // This is a doubleword splat if it is of the form + // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we + // must look through chains of copy-likes to find the source + // register. We turn off the marking for mention of a physical + // register, because splatting it is safe; the optimization + // will not swap the value in the physical register. Whether + // or not the two input registers are identical, we can handle + // these by adjusting the form of the XXPERMDI. 
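+ // (Recall the XXPERMDI selector encoding for xxpermdi t, s, s, DM: + // DM=2 (0b10) yields dw1||dw0 of s, a doubleword swap; DM=0 yields + // dw0||dw0 and DM=3 yields dw1||dw1, the two doubleword splats.)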
+ } else if (immed == 0 || immed == 3) { + + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; + + unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(), + VecIdx); + unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(), + VecIdx); + if (trueReg1 == trueReg2) + SwapVector[VecIdx].MentionsPhysVR = 0; + + } else { + // We can still handle these by adjusting the form of the XXPERMDI. + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; + } + break; + } + case PPC::LVX: + // Non-permuting loads are currently unsafe. We can use special + // handling for this in the future. By not marking these as + // IsSwap, we ensure computations containing them will be rejected + // for now. + SwapVector[VecIdx].IsLoad = 1; + break; + case PPC::LXVD2X: + case PPC::LXVW4X: + // Permuting loads are marked as both load and swap, and are + // safe for optimization. + SwapVector[VecIdx].IsLoad = 1; + SwapVector[VecIdx].IsSwap = 1; + break; + case PPC::LXSDX: + case PPC::LXSSPX: + // A load of a floating-point value into the high-order half of + // a vector register is safe, provided that we introduce a swap + // following the load, which will be done by the SUBREG_TO_REG + // support. So just mark these as safe. + SwapVector[VecIdx].IsLoad = 1; + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::STVX: + // Non-permuting stores are currently unsafe. We can use special + // handling for this in the future. By not marking these as + // IsSwap, we ensure computations containing them will be rejected + // for now. + SwapVector[VecIdx].IsStore = 1; + break; + case PPC::STXVD2X: + case PPC::STXVW4X: + // Permuting stores are marked as both store and swap, and are + // safe for optimization. + SwapVector[VecIdx].IsStore = 1; + SwapVector[VecIdx].IsSwap = 1; + break; + case PPC::COPY: + // These are fine provided they are moving between full vector + // register classes. + if (isVecReg(MI.getOperand(0).getReg()) && + isVecReg(MI.getOperand(1).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + // If we have a copy from one scalar floating-point register + // to another, we can accept this even if it is a physical + // register. The only way this gets involved is if it feeds + // a SUBREG_TO_REG, which is handled by introducing a swap. + else if (isScalarVecReg(MI.getOperand(0).getReg()) && + isScalarVecReg(MI.getOperand(1).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::SUBREG_TO_REG: { + // These are fine provided they are moving between full vector + // register classes. If they are moving from a scalar + // floating-point class to a vector class, we can handle those + // as well, provided we introduce a swap. It is generally the + // case that we will introduce fewer swaps than we remove, but + // (FIXME) a cost model could be used. However, introduced + // swaps could potentially be CSEd, so this is not trivial. + if (isVecReg(MI.getOperand(0).getReg()) && + isVecReg(MI.getOperand(2).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + else if (isVecReg(MI.getOperand(0).getReg()) && + isScalarVecReg(MI.getOperand(2).getReg())) { + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN; + } + break; + } + case PPC::VSPLTB: + case PPC::VSPLTH: + case PPC::VSPLTW: + // Splats are lane-sensitive, but we can use special handling + // to adjust the source lane for the splat.
+ SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT; + break; + // The presence of the following lane-sensitive operations in a + // web will kill the optimization, at least for now. For these + // we do nothing, causing the optimization to fail. + // FIXME: Some of these could be permitted with special handling, + // and will be phased in as time permits. + // FIXME: There is no simple and maintainable way to express a set + // of opcodes having a common attribute in TableGen. Should this + // change, this is a prime candidate to use such a mechanism. + case PPC::INLINEASM: + case PPC::EXTRACT_SUBREG: + case PPC::INSERT_SUBREG: + case PPC::COPY_TO_REGCLASS: + case PPC::LVEBX: + case PPC::LVEHX: + case PPC::LVEWX: + case PPC::LVSL: + case PPC::LVSR: + case PPC::LVXL: + case PPC::STVEBX: + case PPC::STVEHX: + case PPC::STVEWX: + case PPC::STVXL: + // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX, + // by adding special handling for narrowing copies as well as + // widening ones. However, I've experimented with this, and in + // practice we currently do not appear to use STXSDX fed by + // a narrowing copy from a full vector register. Since I can't + // generate any useful test cases, I've left this alone for now. + case PPC::STXSDX: + case PPC::STXSSPX: + case PPC::VCIPHER: + case PPC::VCIPHERLAST: + case PPC::VMRGHB: + case PPC::VMRGHH: + case PPC::VMRGHW: + case PPC::VMRGLB: + case PPC::VMRGLH: + case PPC::VMRGLW: + case PPC::VMULESB: + case PPC::VMULESH: + case PPC::VMULESW: + case PPC::VMULEUB: + case PPC::VMULEUH: + case PPC::VMULEUW: + case PPC::VMULOSB: + case PPC::VMULOSH: + case PPC::VMULOSW: + case PPC::VMULOUB: + case PPC::VMULOUH: + case PPC::VMULOUW: + case PPC::VNCIPHER: + case PPC::VNCIPHERLAST: + case PPC::VPERM: + case PPC::VPERMXOR: + case PPC::VPKPX: + case PPC::VPKSHSS: + case PPC::VPKSHUS: + case PPC::VPKSDSS: + case PPC::VPKSDUS: + case PPC::VPKSWSS: + case PPC::VPKSWUS: + case PPC::VPKUDUM: + case PPC::VPKUDUS: + case PPC::VPKUHUM: + case PPC::VPKUHUS: + case PPC::VPKUWUM: + case PPC::VPKUWUS: + case PPC::VPMSUMB: + case PPC::VPMSUMD: + case PPC::VPMSUMH: + case PPC::VPMSUMW: + case PPC::VRLB: + case PPC::VRLD: + case PPC::VRLH: + case PPC::VRLW: + case PPC::VSBOX: + case PPC::VSHASIGMAD: + case PPC::VSHASIGMAW: + case PPC::VSL: + case PPC::VSLDOI: + case PPC::VSLO: + case PPC::VSR: + case PPC::VSRO: + case PPC::VSUM2SWS: + case PPC::VSUM4SBS: + case PPC::VSUM4SHS: + case PPC::VSUM4UBS: + case PPC::VSUMSWS: + case PPC::VUPKHPX: + case PPC::VUPKHSB: + case PPC::VUPKHSH: + case PPC::VUPKHSW: + case PPC::VUPKLPX: + case PPC::VUPKLSB: + case PPC::VUPKLSH: + case PPC::VUPKLSW: + case PPC::XXMRGHW: + case PPC::XXMRGLW: + // XXSLDWI could be replaced by a general permute with one of three + // permute control vectors (for shift values 1, 2, 3). However, + // VPERM has a more restrictive register class. + case PPC::XXSLDWI: + case PPC::XXSPLTW: + break; + } + } + } + + if (RelevantFunction) { + DEBUG(dbgs() << "Swap vector when first built\n\n"); + dumpSwapVector(); + } + + return RelevantFunction; +} + +// Add an entry to the swap vector and swap map, and make a +// singleton equivalence class for the entry.
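+// (Entries are never removed from SwapVector, so VSEId doubles as a +// stable key for the webs built over EquivalenceClasses<int> in +// formWebs().)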
+int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI, + PPCVSXSwapEntry& SwapEntry) { + SwapEntry.VSEMI = MI; + SwapEntry.VSEId = SwapVector.size(); + SwapVector.push_back(SwapEntry); + EC->insert(SwapEntry.VSEId); + SwapMap[MI] = SwapEntry.VSEId; + return SwapEntry.VSEId; +} + +// This is used to find the "true" source register for an +// XXPERMDI instruction, since MachineCSE does not handle the +// "copy-like" operations (Copy and SubregToReg). Returns +// the original SrcReg unless it is the target of a copy-like +// operation, in which case we chain backwards through all +// such operations to the ultimate source register. If a +// physical register is encountered, we stop the search and +// flag the swap entry indicated by VecIdx (the original +// XXPERMDI) as mentioning a physical register. +unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, + unsigned VecIdx) { + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { + if (!isScalarVecReg(CopySrcReg)) + SwapVector[VecIdx].MentionsPhysVR = 1; + return CopySrcReg; + } + + return lookThruCopyLike(CopySrcReg, VecIdx); +} + +// Generate equivalence classes for related computations (webs) by +// def-use relationships of virtual registers. Mention of a physical +// register terminates the generation of equivalence classes as this +// indicates a use of a parameter, definition of a return value, use +// of a value returned from a call, or definition of a parameter to a +// call. Computations with physical register mentions are flagged +// as such so their containing webs will not be optimized. +void PPCVSXSwapRemoval::formWebs() { + + DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n"); + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " "); + DEBUG(MI->dump()); + + // It's sufficient to walk vector uses and join them to their unique + // definitions. In addition, check full vector register operands + // for physical regs. We exclude partial-vector register operands + // because we can handle them if copied to a full vector. + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + if (!isVecReg(Reg) && !isScalarVecReg(Reg)) + continue; + + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!(MI->isCopy() && isScalarVecReg(Reg))) + SwapVector[EntryIdx].MentionsPhysVR = 1; + continue; + } + + if (!MO.isUse()) + continue; + + MachineInstr* DefMI = MRI->getVRegDef(Reg); + assert(SwapMap.find(DefMI) != SwapMap.end() && + "Inconsistency: def of vector reg not found in swap map!"); + int DefIdx = SwapMap[DefMI]; + (void)EC->unionSets(SwapVector[DefIdx].VSEId, + SwapVector[EntryIdx].VSEId); + + DEBUG(dbgs() << format("Unioning %d with %d\n", SwapVector[DefIdx].VSEId, + SwapVector[EntryIdx].VSEId)); + DEBUG(dbgs() << " Def: "); + DEBUG(DefMI->dump()); + } + } +} + +// Walk the swap vector entries looking for conditions that prevent their +// containing computations from being optimized. When such conditions are +// found, mark the representative of the computation's equivalence class +// as rejected. 
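+// (For example, if an LXVD2X result reaches an arithmetic instruction +// directly rather than through its XXPERMDI, that swap cannot be +// removed safely, and the entire web is rejected below.)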
+void PPCVSXSwapRemoval::recordUnoptimizableWebs() { + + DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n"); + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); + + // If representative is already rejected, don't waste further time. + if (SwapVector[Repr].WebRejected) + continue; + + // Reject webs containing mentions of physical or partial registers, or + // containing operations that we don't know how to handle in a lane- + // permuted region. + if (SwapVector[EntryIdx].MentionsPhysVR || + SwapVector[EntryIdx].MentionsPartialVR || + !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) { + + SwapVector[Repr].WebRejected = 1; + + DEBUG(dbgs() << + format("Web %d rejected for physreg, partial reg, or not " + "swap[pable]\n", Repr)); + DEBUG(dbgs() << " in " << EntryIdx << ": "); + DEBUG(SwapVector[EntryIdx].VSEMI->dump()); + DEBUG(dbgs() << "\n"); + } + + // Reject webs that contain swapping loads that feed something other + // than a swap instruction. + else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned DefReg = MI->getOperand(0).getReg(); + + // We skip debug instructions in the analysis. (Note that debug + // location information is still maintained by this optimization + // because it remains on the LXVD2X and STXVD2X instructions after + // the XXPERMDIs are removed.) + for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) { + int UseIdx = SwapMap[&UseMI]; + + if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad || + SwapVector[UseIdx].IsStore) { + + SwapVector[Repr].WebRejected = 1; + + DEBUG(dbgs() << + format("Web %d rejected for load not feeding swap\n", Repr)); + DEBUG(dbgs() << " def " << EntryIdx << ": "); + DEBUG(MI->dump()); + DEBUG(dbgs() << " use " << UseIdx << ": "); + DEBUG(UseMI.dump()); + DEBUG(dbgs() << "\n"); + } + } + + // Reject webs that contain swapping stores that are fed by something + // other than a swap instruction. + } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned UseReg = MI->getOperand(0).getReg(); + MachineInstr *DefMI = MRI->getVRegDef(UseReg); + int DefIdx = SwapMap[DefMI]; + + if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad || + SwapVector[DefIdx].IsStore) { + + SwapVector[Repr].WebRejected = 1; + + DEBUG(dbgs() << + format("Web %d rejected for store not fed by swap\n", Repr)); + DEBUG(dbgs() << " def " << DefIdx << ": "); + DEBUG(DefMI->dump()); + DEBUG(dbgs() << " use " << EntryIdx << ": "); + DEBUG(MI->dump()); + DEBUG(dbgs() << "\n"); + } + } + } + + DEBUG(dbgs() << "Swap vector after web analysis:\n\n"); + dumpSwapVector(); +} + +// Walk the swap vector entries looking for swaps fed by permuting loads +// and swaps that feed permuting stores. If the containing computation +// has not been marked rejected, mark each such swap for removal. +// (Removal is delayed in case optimization has disturbed the pattern, +// such that multiple loads feed the same swap, etc.)
+void PPCVSXSwapRemoval::markSwapsForRemoval() { + + DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n"); + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + + if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) { + int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); + + if (!SwapVector[Repr].WebRejected) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned DefReg = MI->getOperand(0).getReg(); + + for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) { + int UseIdx = SwapMap[&UseMI]; + SwapVector[UseIdx].WillRemove = 1; + + DEBUG(dbgs() << "Marking swap fed by load for removal: "); + DEBUG(UseMI.dump()); + } + } + + } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { + int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); + + if (!SwapVector[Repr].WebRejected) { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned UseReg = MI->getOperand(0).getReg(); + MachineInstr *DefMI = MRI->getVRegDef(UseReg); + int DefIdx = SwapMap[DefMI]; + SwapVector[DefIdx].WillRemove = 1; + + DEBUG(dbgs() << "Marking swap feeding store for removal: "); + DEBUG(DefMI->dump()); + } + + } else if (SwapVector[EntryIdx].IsSwappable && + SwapVector[EntryIdx].SpecialHandling != 0) { + int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); + + if (!SwapVector[Repr].WebRejected) + handleSpecialSwappables(EntryIdx); + } + } +} + +// Create an xxswapd instruction and insert it prior to the given point. +// MI is used to determine basic block and debug loc information. +// FIXME: When inserting a swap, we should check whether SrcReg is +// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so, +// then instead we should generate a copy from Reg to DstReg. +void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI, + MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg) { + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), DstReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addImm(2); +} + +// The identified swap entry requires special handling to allow its +// containing computation to be optimized. Perform that handling +// here. +// FIXME: Additional opportunities will be phased in with subsequent +// patches. +void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { + switch (SwapVector[EntryIdx].SpecialHandling) { + + default: + llvm_unreachable("Unexpected special handling type"); + + // For splats based on an index into a vector, add N/2 modulo N + // to the index, where N is the number of vector elements. + case SHValues::SH_SPLAT: { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + unsigned NElts; + + DEBUG(dbgs() << "Changing splat: "); + DEBUG(MI->dump()); + + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected splat opcode"); + case PPC::VSPLTB: NElts = 16; break; + case PPC::VSPLTH: NElts = 8; break; + case PPC::VSPLTW: NElts = 4; break; + } + + unsigned EltNo = MI->getOperand(1).getImm(); + EltNo = (EltNo + NElts / 2) % NElts; + MI->getOperand(1).setImm(EltNo); + + DEBUG(dbgs() << " Into: "); + DEBUG(MI->dump()); + break; + } + + // For an XXPERMDI that isn't handled otherwise, we need to + // reverse the order of the operands. If the selector operand + // has a value of 0 or 3, we need to change it to 3 or 0, + // respectively. Otherwise we should leave it alone. (This + // is equivalent to reversing the two bits of the selector + // operand and complementing the result.) 
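+ // (Concretely: selector 0 -> 3, 1 -> 1, 2 -> 2, 3 -> 0, and the two + // source operands are exchanged in every case.)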
+ case SHValues::SH_XXPERMDI: { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "Changing XXPERMDI: "); + DEBUG(MI->dump()); + + unsigned Selector = MI->getOperand(3).getImm(); + if (Selector == 0 || Selector == 3) + Selector = 3 - Selector; + MI->getOperand(3).setImm(Selector); + + unsigned Reg1 = MI->getOperand(1).getReg(); + unsigned Reg2 = MI->getOperand(2).getReg(); + MI->getOperand(1).setReg(Reg2); + MI->getOperand(2).setReg(Reg1); + + DEBUG(dbgs() << " Into: "); + DEBUG(MI->dump()); + break; + } + + // For a copy from a scalar floating-point register to a vector + // register, removing swaps will leave the copied value in the + // wrong lane. Insert a swap following the copy to fix this. + case SHValues::SH_COPYWIDEN: { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); + DEBUG(MI->dump()); + + unsigned DstReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + unsigned NewVReg = MRI->createVirtualRegister(DstRC); + + MI->getOperand(0).setReg(NewVReg); + DEBUG(dbgs() << " Into: "); + DEBUG(MI->dump()); + + auto InsertPoint = ++MachineBasicBlock::iterator(MI); + + // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG + // is copying to a VRRC, we need to be careful to avoid a register + // assignment problem. In this case we must copy from VRRC to VSRC + // prior to the swap, and from VSRC to VRRC following the swap. + // Coalescing will usually remove all this mess. + if (DstRC == &PPC::VRRCRegClass) { + unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::COPY), VSRCTmp1) + .addReg(NewVReg); + DEBUG(std::prev(InsertPoint)->dump()); + + insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1); + DEBUG(std::prev(InsertPoint)->dump()); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::COPY), DstReg) + .addReg(VSRCTmp2); + DEBUG(std::prev(InsertPoint)->dump()); + + } else { + insertSwap(MI, InsertPoint, DstReg, NewVReg); + DEBUG(std::prev(InsertPoint)->dump()); + } + break; + } + } +} + +// Walk the swap vector and replace each entry marked for removal with +// a copy operation. +bool PPCVSXSwapRemoval::removeSwaps() { + + DEBUG(dbgs() << "\n*** Removing swaps ***\n\n"); + + bool Changed = false; + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + if (SwapVector[EntryIdx].WillRemove) { + Changed = true; + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + MachineBasicBlock *MBB = MI->getParent(); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addOperand(MI->getOperand(1)); + + DEBUG(dbgs() << format("Replaced %d with copy: ", + SwapVector[EntryIdx].VSEId)); + DEBUG(MI->dump()); + + MI->eraseFromParent(); + } + } + + return Changed; +} + +// For debug purposes, dump the contents of the swap vector. 
+void PPCVSXSwapRemoval::dumpSwapVector() { + + for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { + + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + int ID = SwapVector[EntryIdx].VSEId; + + DEBUG(dbgs() << format("%6d", ID)); + DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID))); + DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber())); + DEBUG(dbgs() << format(" %14s ", TII->getName(MI->getOpcode()))); + + if (SwapVector[EntryIdx].IsLoad) + DEBUG(dbgs() << "load "); + if (SwapVector[EntryIdx].IsStore) + DEBUG(dbgs() << "store "); + if (SwapVector[EntryIdx].IsSwap) + DEBUG(dbgs() << "swap "); + if (SwapVector[EntryIdx].MentionsPhysVR) + DEBUG(dbgs() << "physreg "); + if (SwapVector[EntryIdx].MentionsPartialVR) + DEBUG(dbgs() << "partialreg "); + + if (SwapVector[EntryIdx].IsSwappable) { + DEBUG(dbgs() << "swappable "); + switch(SwapVector[EntryIdx].SpecialHandling) { + default: + DEBUG(dbgs() << "special:**unknown**"); + break; + case SH_NONE: + break; + case SH_EXTRACT: + DEBUG(dbgs() << "special:extract "); + break; + case SH_INSERT: + DEBUG(dbgs() << "special:insert "); + break; + case SH_NOSWAP_LD: + DEBUG(dbgs() << "special:load "); + break; + case SH_NOSWAP_ST: + DEBUG(dbgs() << "special:store "); + break; + case SH_SPLAT: + DEBUG(dbgs() << "special:splat "); + break; + case SH_XXPERMDI: + DEBUG(dbgs() << "special:xxpermdi "); + break; + case SH_COPYWIDEN: + DEBUG(dbgs() << "special:copywiden "); + break; + } + } + + if (SwapVector[EntryIdx].WebRejected) + DEBUG(dbgs() << "rejected "); + if (SwapVector[EntryIdx].WillRemove) + DEBUG(dbgs() << "remove "); + + DEBUG(dbgs() << "\n"); + + // For no-asserts builds. + (void)MI; + (void)ID; + } + + DEBUG(dbgs() << "\n"); +} + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE, + "PowerPC VSX Swap Removal", false, false) +INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE, + "PowerPC VSX Swap Removal", false, false) + +char PPCVSXSwapRemoval::ID = 0; +FunctionPass* +llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); } diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp new file mode 100644 index 0000000..5b2fe19 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp @@ -0,0 +1,26 @@ +//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::ThePPC32Target, llvm::ThePPC64Target, llvm::ThePPC64LETarget; + +extern "C" void LLVMInitializePowerPCTargetInfo() { + RegisterTarget<Triple::ppc, /*HasJIT=*/true> + X(ThePPC32Target, "ppc32", "PowerPC 32"); + + RegisterTarget<Triple::ppc64, /*HasJIT=*/true> + Y(ThePPC64Target, "ppc64", "PowerPC 64"); + + RegisterTarget<Triple::ppc64le, /*HasJIT=*/true> + Z(ThePPC64LETarget, "ppc64le", "PowerPC 64 LE"); +}