Diffstat (limited to 'contrib/llvm/lib/Target')
55 files changed, 1592 insertions, 989 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9b6a100..6458d56 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2031,18 +2031,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   unsigned CurArgIdx = 0;
   for (unsigned i = 0; i != NumArgs; ++i) {
     MVT ValVT = Ins[i].VT;
-    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[i].OrigArgIndex;
-
-    // Get type of the original argument.
-    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
-    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
-    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
-    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
-      ValVT = MVT::i8;
-    else if (ActualMVT == MVT::i16)
-      ValVT = MVT::i16;
+    if (Ins[i].isOrigArg()) {
+      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+    }
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
     bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e4bea5f..a3a76e6 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3092,8 +3092,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
+    if (Ins[VA.getValNo()].isOrigArg()) {
+      std::advance(CurOrigArg,
+                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
+    }
     // Arguments stored in registers.
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
@@ -3173,7 +3176,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       assert(VA.isMemLoc());
       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
-      int index = ArgLocs[i].getValNo();
+      int index = VA.getValNo();
 
       // Some Ins[] entries become multiple ArgLoc[] entries.
       // Process them only once.
@@ -3186,6 +3189,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         // Since they could be overwritten by lowering of arguments in case of
         // a tail call.
         if (Flags.isByVal()) {
+          assert(Ins[index].isOrigArg() &&
+                 "Byval arguments cannot be implicit");
           unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
           ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset,
                                                 Flags.getByValAlign());
diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 7db5b34..699dfae 100644
--- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -3667,43 +3667,44 @@ bool MipsAsmParser::parseDirectiveModule() {
     return false;
   }
 
-  if (Lexer.is(AsmToken::Identifier)) {
-    StringRef Option = Parser.getTok().getString();
-    Parser.Lex();
-
-    if (Option == "oddspreg") {
-      getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
-      clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+  StringRef Option;
+  if (Parser.parseIdentifier(Option)) {
+    reportParseError("expected .module option identifier");
+    return false;
+  }
 
-      if (getLexer().isNot(AsmToken::EndOfStatement)) {
-        reportParseError("unexpected token, expected end of statement");
-        return false;
-      }
+  if (Option == "oddspreg") {
+    getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
+    clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
       return false;
-    } else if (Option == "nooddspreg") {
-      if (!isABI_O32()) {
-        Error(L, "'.module nooddspreg' requires the O32 ABI");
-        return false;
-      }
+    }
 
-      getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
-      setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+    return false; // parseDirectiveModule has finished successfully.
+  } else if (Option == "nooddspreg") {
+    if (!isABI_O32()) {
+      Error(L, "'.module nooddspreg' requires the O32 ABI");
+      return false;
+    }
 
-      if (getLexer().isNot(AsmToken::EndOfStatement)) {
-        reportParseError("unexpected token, expected end of statement");
-        return false;
-      }
+    getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
+    setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
       return false;
-    } else if (Option == "fp") {
-      return parseDirectiveModuleFP();
     }
+    return false; // parseDirectiveModule has finished successfully.
+  } else if (Option == "fp") {
+    return parseDirectiveModuleFP();
+  } else {
     return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
   }
-
-  return false;
 }
 
 /// parseDirectiveModuleFP
diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index da33f3b..d42b948 100644
--- a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -259,6 +259,11 @@ static DecodeStatus DecodeCacheOp(MCInst &Inst,
                                   uint64_t Address,
                                   const void *Decoder);
 
+static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
 static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
                                     unsigned Insn,
                                     uint64_t Address,
@@ -304,6 +309,10 @@ static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn,
                                 uint64_t Address,
                                 const void *Decoder);
 
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
 static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
                                        unsigned Insn,
                                        uint64_t Address,
@@ -1118,6 +1127,23 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = fieldFromInstruction(Insn, 7, 9);
+  unsigned Hint = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+  Inst.addOperand(MCOperand::CreateImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeSyncI(MCInst &Inst,
                                 unsigned Insn,
                                 uint64_t Address,
@@ -1354,6 +1380,23 @@ static DecodeStatus DecodeFMem3(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<11>(Insn & 0x07ff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+  Reg = getReg(Decoder, Mips::COP2RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
                                        unsigned Insn,
                                        uint64_t Address,
diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td
index 3e1d047..5ad5683 100644
--- a/contrib/llvm/lib/Target/Mips/Mips.td
+++ b/contrib/llvm/lib/Target/Mips/Mips.td
@@ -58,15 +58,15 @@ def MipsInstrInfo : InstrInfo;
 //===----------------------------------------------------------------------===//
 
 def FeatureNoABICalls : SubtargetFeature<"noabicalls", "NoABICalls", "true",
-                                "Disable SVR4-style position-independent code.">;
+                                "Disable SVR4-style position-independent code">;
 def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true",
-                                "General Purpose Registers are 64-bit wide.">;
+                                "General Purpose Registers are 64-bit wide">;
 def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true",
-                                "Support 64-bit FP registers.">;
+                                "Support 64-bit FP registers">;
 def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true",
-                                "Support for FPXX.">;
+                                "Support for FPXX">;
 def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
SubtargetFeature<"nan2008", "IsNaN2008bit", "true", - "IEEE 754-2008 NaN encoding.">; + "IEEE 754-2008 NaN encoding">; def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat", "true", "Only supports single precision float">; def FeatureO32 : SubtargetFeature<"o32", "ABI", "MipsABIInfo::O32()", @@ -81,7 +81,7 @@ def FeatureNoOddSPReg : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false", "Disable odd numbered single-precision " "registers">; def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU", - "true", "Enable vector FPU instructions.">; + "true", "Enable vector FPU instructions">; def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1", "Mips I ISA Support [highly experimental]">; def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2", diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index 976becc..7e1fbfd 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -293,6 +293,9 @@ void Mips16InstrInfo::adjustStackPtrBigUnrestricted( void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + if (Amount == 0) + return; + if (isInt<16>(Amount)) // need to change to addiu sp, ....and isInt<16> BuildAddiuSpImm(MBB, I, Amount); else diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index 185d12e..49c6322 100644 --- a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -379,7 +379,6 @@ class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd, list<dag> Pattern = []; bit isTerminator = 1; bit hasDelaySlot = 0; - string DecoderMethod = "DecodeSimm16"; } class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16, @@ -550,6 +549,7 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd, dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); list<dag> Pattern = []; + string DecoderMethod = "DecodeCacheOpR6"; } class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>; @@ -561,6 +561,7 @@ class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> { string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); list<dag> Pattern = []; bit mayLoad = 1; + string DecoderMethod = "DecodeFMemCop2R6"; } class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd>; @@ -572,6 +573,7 @@ class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> { string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); list<dag> Pattern = []; bit mayStore = 1; + string DecoderMethod = "DecodeFMemCop2R6"; } class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>; diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp index e18cc8b..b808129 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp @@ -132,8 +132,8 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128( continue; } - assert(Ins[i].OrigArgIndex < MF.getFunction()->arg_size()); - std::advance(FuncArg, Ins[i].OrigArgIndex); + assert(Ins[i].getOrigArgIndex() < MF.getFunction()->arg_size()); + std::advance(FuncArg, Ins[i].getOrigArgIndex()); OriginalArgWasF128.push_back( originalTypeIsF128(FuncArg->getType(), nullptr)); diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td 
index 90bf9f3..dcd88f2 100644
--- a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -123,7 +123,7 @@ def CC_MipsN_SoftFloat : CallingConv<[
 ]>;
 
 def CC_MipsN : CallingConv<[
-  CCIfType<[i8, i16, i32],
+  CCIfType<[i8, i16, i32, i64],
            CCIfSubtargetNot<"isLittle()",
                             CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
 
@@ -159,6 +159,10 @@ def CC_MipsN : CallingConv<[
 // N32/64 variable arguments.
 // All arguments are passed in integer registers.
 def CC_MipsN_VarArg : CallingConv<[
+  CCIfType<[i8, i16, i32, i64],
+           CCIfSubtargetNot<"isLittle()",
+                            CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
   // All integers are promoted to 64-bit.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
 
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
index c6045fe..37fc784 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -619,6 +619,33 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue ValueIfTrue = N->getOperand(0), ValueIfFalse = N->getOperand(2);
+
+  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(ValueIfFalse);
+  if (!FalseC || FalseC->getZExtValue())
+    return SDValue();
+
+  // Since RHS (False) is 0, we swap the order of the True/False operands
+  // (obviously also inverting the condition) so that we can
+  // take advantage of conditional moves using the $0 register.
+  // Example:
+  //   return (a != 0) ? x : 0;
+  //     load $reg, x
+  //     movz $reg, $0, a
+  unsigned Opc = (N->getOpcode() == MipsISD::CMovFP_T) ? MipsISD::CMovFP_F :
+                                                         MipsISD::CMovFP_T;
+
+  SDValue FCC = N->getOperand(1), Glue = N->getOperand(3);
+  return DAG.getNode(Opc, SDLoc(N), ValueIfFalse.getValueType(),
+                     ValueIfFalse, FCC, ValueIfTrue, Glue);
+}
+
 static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const MipsSubtarget &Subtarget) {
@@ -752,6 +779,9 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
     return performDivRemCombine(N, DAG, DCI, Subtarget);
   case ISD::SELECT:
     return performSELECTCombine(N, DAG, DCI, Subtarget);
+  case MipsISD::CMovFP_F:
+  case MipsISD::CMovFP_T:
+    return performCMovFPCombine(N, DAG, DCI, Subtarget);
   case ISD::AND:
     return performANDCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:
@@ -2039,7 +2069,7 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
   SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
   SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
   SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
-                             DAG.getConstant(0x20, MVT::i32));
+                             DAG.getConstant(VT.getSizeInBits(), MVT::i32));
   Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, DAG.getConstant(0, VT),
                    ShiftLeftLo);
   Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftLeftLo, Or);
@@ -2078,11 +2108,12 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
   SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shamt);
   SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
-                             DAG.getConstant(0x20, MVT::i32));
-  SDValue Shift31 = DAG.getNode(ISD::SRA, DL, VT, Hi, DAG.getConstant(31, VT));
+                             DAG.getConstant(VT.getSizeInBits(), MVT::i32));
+  SDValue Ext = DAG.getNode(ISD::SRA, DL, VT, Hi,
+                            DAG.getConstant(VT.getSizeInBits() - 1, VT));
   Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftRightHi, Or);
   Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond,
-                   IsSRA ? Shift31 : DAG.getConstant(0, VT), ShiftRightHi);
+                   IsSRA ? Ext : DAG.getConstant(0, VT), ShiftRightHi);
 
   SDValue Ops[2] = {Lo, Hi};
   return DAG.getMergeValues(Ops, DL);
@@ -2902,13 +2933,16 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    std::advance(FuncArg, Ins[i].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[i].OrigArgIndex;
+    if (Ins[i].isOrigArg()) {
+      std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
+    }
     EVT ValVT = VA.getValVT();
     ISD::ArgFlagsTy Flags = Ins[i].Flags;
     bool IsRegLoc = VA.isRegLoc();
 
     if (Flags.isByVal()) {
+      assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
       unsigned FirstByValReg, LastByValReg;
       unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
       CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
@@ -3029,6 +3063,15 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   return CCInfo.CheckReturn(Outs, RetCC_Mips);
 }
 
+bool
+MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
+  if (Subtarget.hasMips3() && Subtarget.abiUsesSoftFloat()) {
+    if (Type == MVT::i32)
+      return true;
+  }
+  return IsSigned;
+}
+
 SDValue
 MipsTargetLowering::LowerReturn(SDValue Chain,
                                 CallingConv::ID CallConv, bool IsVarArg,
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
index 4132de6..4da337a 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -472,6 +472,8 @@ namespace llvm {
                         const SmallVectorImpl<SDValue> &OutVals,
                         SDLoc dl, SelectionDAG &DAG) const override;
 
+    bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
+
     // Inline asm support
     ConstraintType getConstraintType(const std::string &Constraint) const override;
 
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
index 2aa8328..ed97cb4 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -458,42 +458,42 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
 defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
 
 def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
-             MADDS_FM<4, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+             MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
 def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
-             MADDS_FM<5, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+             MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
 
 let AdditionalPredicates = [NoNaNsFPMath] in {
   def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
-                MADDS_FM<6, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+                MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
   def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
-                MADDS_FM<7, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
+                MADDS_FM<7, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
 }
 
 def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
-               MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+               MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
 def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
-               MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+               MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
 
 let AdditionalPredicates = [NoNaNsFPMath] in {
   def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
-                  MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+                  MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
   def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
-                  MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
+                  MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
 }
 
-let isCodeGenOnly=1 in {
+let DecoderNamespace = "Mips64" in {
   def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
-                 MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                 MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
   def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
-                 MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                 MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
 }
 
 let AdditionalPredicates = [NoNaNsFPMath],
-    isCodeGenOnly=1 in {
+    DecoderNamespace = "Mips64" in {
   def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
-                  MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                  MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
   def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
-                  MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+                  MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
index 2b3b6c1..1eb8d9a 100644
--- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
@@ -388,6 +388,8 @@ def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
                            (sequence "W%u", 0, 31)>;
 def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
                            (sequence "W%u", 0, 31)>;
+def MSA128WEvens: RegisterClass<"Mips", [v4i32, v4f32], 128,
+                                (decimate (sequence "W%u", 0, 31), 2)>;
 
 def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
   MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index cb7ac40..8c2620c 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -258,8 +258,12 @@ SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
                        CurDAG->getTargetConstant(Mips::sub_32, VT));
   }
 
-  SDNode *AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT,
-                                            SDValue(Carry, 0), RHS);
+  // Generate a second addition only if we know that RHS is not a
+  // constant-zero node.
+  SDNode *AddCarry = Carry;
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+  if (!C || C->getZExtValue())
+    AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
 
   return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
                               SDValue(AddCarry, 0));
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 3f865af..2c033ce 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -2883,10 +2883,21 @@ emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
   unsigned Ws = MI->getOperand(1).getReg();
   unsigned Lane = MI->getOperand(2).getImm();
 
-  if (Lane == 0)
-    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
-  else {
-    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  if (Lane == 0) {
+    unsigned Wt = Ws;
+    if (!Subtarget.useOddSPReg()) {
+      // We must copy to an even-numbered MSA register so that the
+      // single-precision sub-register is also guaranteed to be even-numbered.
+      Wt = RegInfo.createVirtualRegister(&Mips::MSA128WEvensRegClass);
+
+      BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Wt).addReg(Ws);
+    }
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  } else {
+    unsigned Wt = RegInfo.createVirtualRegister(
+        Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+                                  &Mips::MSA128WEvensRegClass);
 
     BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
     BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
@@ -2948,7 +2959,9 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
   unsigned Wd_in = MI->getOperand(1).getReg();
   unsigned Lane = MI->getOperand(2).getImm();
   unsigned Fs = MI->getOperand(3).getReg();
-  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned Wt = RegInfo.createVirtualRegister(
+      Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+                                &Mips::MSA128WEvensRegClass);
 
   BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
       .addImm(0)
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index 74f291f..180b043 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -364,6 +364,9 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
   unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
   unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
 
+  if (Amount == 0)
+    return;
+
   if (isInt<16>(Amount))// addi sp, sp, amount
     BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
   else { // Expand immediate that doesn't fit in 16-bit.
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5c52bb1..e580c81 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2688,9 +2688,10 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     unsigned ObjSize = ObjectVT.getStoreSize();
     unsigned ArgSize = ObjSize;
     ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
-    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[ArgNo].OrigArgIndex;
-
+    if (Ins[ArgNo].isOrigArg()) {
+      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+    }
     /* Respect alignment of argument on the stack.  */
     unsigned Align =
       CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
@@ -2704,6 +2705,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     // FIXME the codegen can be much improved in some cases.
     // We do not have to keep everything in memory.
     if (Flags.isByVal()) {
+      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
       // ObjSize is the true size, ArgSize rounded up to multiple of registers.
       ObjSize = Flags.getByValSize();
       ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
@@ -3064,9 +3067,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
     unsigned ArgSize = ObjSize;
     ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
-    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[ArgNo].OrigArgIndex;
-
+    if (Ins[ArgNo].isOrigArg()) {
+      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+    }
     unsigned CurArgOffset = ArgOffset;
 
     // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
@@ -3087,6 +3091,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
     // FIXME the codegen can be much improved in some cases.
     // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
+      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
       // ObjSize is the true size, ArgSize rounded up to multiple of registers.
       ObjSize = Flags.getByValSize();
       ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
diff --git a/contrib/llvm/lib/Target/R600/AMDGPU.td b/contrib/llvm/lib/Target/R600/AMDGPU.td
index 1df4448..03f2bbe 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPU.td
+++ b/contrib/llvm/lib/Target/R600/AMDGPU.td
@@ -97,6 +97,11 @@ def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
         "true",
         "Enable spilling of VGPRs to scratch memory">;
 
+def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
+        "SGPRInitBug",
+        "true",
+        "VI SGPR initilization bug requiring a fixed SGPR allocation size">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
                           "TexVTXClauseSize",
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp
index b545b45..0b426bc 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp
@@ -40,7 +40,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
   std::vector<Function*> FuncsToClone;
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     Function &F = *I;
-    if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty())
+    if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
+        !F.hasFnAttribute(Attribute::NoInline))
       FuncsToClone.push_back(&F);
   }
 
@@ -54,7 +55,7 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     Function &F = *I;
-    if (F.hasLocalLinkage()) {
+    if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
       F.addFnAttr(Attribute::AlwaysInline);
     }
   }
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 6185e36..1fae26e 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -343,6 +343,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.NumVGPR = MaxVGPR + 1;
   ProgInfo.NumSGPR = MaxSGPR + 1;
 
+  if (STM.hasSGPRInitBug()) {
+    if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG)
+      llvm_unreachable("Too many SGPRs used with the SGPR init bug");
+
+    ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+  }
+
   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 15112c7..68d557a 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -439,6 +439,31 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     break;
   }
 
+  case ISD::STORE: {
+    // Handle i64 stores here for the same reason mentioned above for loads.
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    SDValue Value = ST->getValue();
+    if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+      break;
+
+    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                       MVT::v2i32, Value);
+    SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
+                                        ST->getBasePtr(), ST->getMemOperand());
+
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+
+    if (NewValue.getOpcode() == ISD::BITCAST) {
+      Select(NewStore.getNode());
+      return SelectCode(NewValue.getNode());
+    }
+
+    // getNode() may fold the bitcast if its input was another bitcast. If that
+    // happens we should only select the new store.
+    N = NewStore.getNode();
+    break;
+  }
+
   case AMDGPUISD::REGISTER_LOAD: {
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
@@ -761,6 +786,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
   return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
 }
 
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
 SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   SDLoc SL(N);
   EVT VT = N->getValueType(0);
@@ -770,19 +797,12 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
-  const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
-  const SDValue False = CurDAG->getTargetConstant(0, MVT::i1);
-  SDValue Ops[] = {
-    Zero,             // src0_modifiers
-    N->getOperand(0), // src0
-    Zero,             // src1_modifiers
-    N->getOperand(1), // src1
-    Zero,             // src2_modifiers
-    N->getOperand(2), // src2
-    False,            // clamp
-    Zero              // omod
-  };
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+  SDValue Ops[8];
+  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
 
   return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
 }
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
index 2adcdf1..b137053 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -141,9 +141,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
 
-  setOperationAction(ISD::STORE, MVT::i64, Promote);
-  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
-
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
@@ -162,9 +159,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   // Custom lowering of vector stores is required for local address space
   // stores.
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
-  // XXX: Native v2i32 local address space stores are possible, but not
-  // currently implemented.
-  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
@@ -832,11 +826,9 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> Args;
-  SDValue A = Op.getOperand(0);
-  SDValue B = Op.getOperand(1);
-  DAG.ExtractVectorElements(A, Args);
-  DAG.ExtractVectorElements(B, Args);
+  for (const SDUse &U : Op->ops())
+    DAG.ExtractVectorElements(U.get(), Args);
 
   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
 }
@@ -881,9 +873,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return LowerIntrinsicIABS(Op, DAG);
     case AMDGPUIntrinsic::AMDGPU_lrp:
       return LowerIntrinsicLRP(Op, DAG);
-    case AMDGPUIntrinsic::AMDGPU_fract:
-    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
-      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
 
     case AMDGPUIntrinsic::AMDGPU_clamp:
     case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
@@ -913,10 +902,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     }
 
     case Intrinsic::AMDGPU_div_fmas:
-      // FIXME: Dropping bool parameter. Work is needed to support the implicit
-      // read from VCC.
       return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                         Op.getOperand(4));
 
     case Intrinsic::AMDGPU_div_fixup:
       return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h
index e28ce0f..202183c 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h
@@ -140,6 +140,12 @@ public:
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;
 
+  /// \brief Return the descriptor of the target-specific machine instruction
+  /// that corresponds to the specified pseudo or native opcode.
+  const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+    return get(pseudoToMCOpcode(Opcode));
+  }
+
   //===---------------------------------------------------------------------===//
   // Pure virtual funtions to be implemented by sub-classes.
   //===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td
index 0e34392b..d657ad0 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td
@@ -35,6 +35,11 @@ def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
   [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
 >;
 
+// float, float, float, vcc
+def AMDGPUFmasOp : SDTypeProfile<1, 4,
+  [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
+>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -153,7 +158,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
 
 // Special case divide FMA with scale and flags (src0 = Quotient,
 // src1 = Denominator, src2 = Numerator).
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
 
 // Single or double precision division fixup.
 // Special case divide fixup and flags(src0 = Quotient, src1 =
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td
index 4e536c3..34b1fc8 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td
@@ -164,10 +164,6 @@ class PrivateStore <SDPatternOperator op> : PrivateMemOp <
   (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
 >;
 
-def extloadi8_private : PrivateLoad <extloadi8>;
-def sextloadi8_private : PrivateLoad <sextloadi8>;
-def extloadi16_private : PrivateLoad <extloadi16>;
-def sextloadi16_private : PrivateLoad <sextloadi16>;
 def load_private : PrivateLoad <load>;
 
 def truncstorei8_private : PrivateStore <truncstorei8>;
@@ -231,6 +227,9 @@ def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
     return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def extloadi8_private : PrivateLoad <az_extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+
 def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
 }]>;
@@ -267,6 +266,9 @@ def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
     return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def extloadi16_private : PrivateLoad <az_extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+
 def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
 }]>;
@@ -649,17 +651,10 @@ class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
   (RcpInst $src)
 >;
 
-multiclass RsqPat<Instruction RsqInst, ValueType vt> {
-  def : Pat <
-    (fdiv FP_ONE, (fsqrt vt:$src)),
-    (RsqInst $src)
-  >;
-
-  def : Pat <
-    (AMDGPUrcp (fsqrt vt:$src)),
-    (RsqInst $src)
-  >;
-}
+class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
+  (AMDGPUrcp (fsqrt vt:$src)),
+  (RsqInst $src)
+>;
 
 include "R600Instructions.td"
 include "R700Instructions.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
index eee9c29..ab489cd 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
@@ -68,6 +68,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
   def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
 }
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp
index b1c7498..87cdb5f 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -80,7 +80,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
       FlatAddressSpace(false), EnableIRStructurizer(true),
       EnablePromoteAlloca(false), EnableIfCvt(true),
      EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false),
      LocalMemorySize(0),
-      EnableVGPRSpilling(false),
+      EnableVGPRSpilling(false),SGPRInitBug(false),
       DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
       FrameLowering(TargetFrameLowering::StackGrowsUp,
                     64 * 16, // Maximum stack alignment (long16)
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
index 566b45c..eeb41d3 100644
--- a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
+++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
@@ -45,6 +45,10 @@ public:
     VOLCANIC_ISLANDS,
   };
 
+  enum {
+    FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
+  };
+
 private:
   std::string DevName;
   bool Is64bit;
@@ -66,6 +70,7 @@ private:
   bool CFALUBug;
   int LocalMemorySize;
   bool EnableVGPRSpilling;
+  bool SGPRInitBug;
 
   const DataLayout DL;
   AMDGPUFrameLowering FrameLowering;
@@ -203,6 +208,10 @@ public:
     return LocalMemorySize;
   }
 
+  bool hasSGPRInitBug() const {
+    return SGPRInitBug;
+  }
+
   unsigned getAmdKernelCodeChipID() const;
 
   bool enableMachineScheduler() const override {
diff --git a/contrib/llvm/lib/Target/R600/CaymanInstructions.td b/contrib/llvm/lib/Target/R600/CaymanInstructions.td
index 58b5ce2..433c3fc 100644
--- a/contrib/llvm/lib/Target/R600/CaymanInstructions.td
+++ b/contrib/llvm/lib/Target/R600/CaymanInstructions.td
@@ -46,7 +46,7 @@ def SIN_cm : SIN_Common<0x8D>;
 def COS_cm : COS_Common<0x8E>;
 } // End isVector = 1
 
-defm : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
 
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
 
diff --git a/contrib/llvm/lib/Target/R600/EvergreenInstructions.td b/contrib/llvm/lib/Target/R600/EvergreenInstructions.td
index f24f76b..299d1fa 100644
--- a/contrib/llvm/lib/Target/R600/EvergreenInstructions.td
+++ b/contrib/llvm/lib/Target/R600/EvergreenInstructions.td
@@ -69,7 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
 def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
 def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
 def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-defm : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
 def SIN_eg : SIN_Common<0x8D>;
 def COS_eg : COS_Common<0x8E>;
 
diff --git a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 8271c6f..b66ed10 100644
--- a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -291,6 +291,8 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       printImmediate64(Op.getImm(), O);
     else
       llvm_unreachable("Invalid register class size");
+  } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
+    printImmediate32(Op.getImm(), O);
   } else {
     // We hit this for the immediate instruction bits that don't yet have a
     // custom printer.
diff --git a/contrib/llvm/lib/Target/R600/Processors.td b/contrib/llvm/lib/Target/R600/Processors.td
index cff97cd..e5fef0c 100644
--- a/contrib/llvm/lib/Target/R600/Processors.td
+++ b/contrib/llvm/lib/Target/R600/Processors.td
@@ -113,8 +113,12 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>;
 // Volcanic Islands
 //===----------------------------------------------------------------------===//
 
-def : ProcessorModel<"tonga", SIFullSpeedModel, [FeatureVolcanicIslands]>;
+def : ProcessorModel<"tonga", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureSGPRInitBug]
+>;
 
-def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureSGPRInitBug]
+>;
 
 def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
diff --git a/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp
index 595f698..2e1b094 100644
--- a/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp
@@ -838,6 +838,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
     case Intrinsic::AMDGPU_rsq:
       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+
+    case AMDGPUIntrinsic::AMDGPU_fract:
+    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
+      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
     }
     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
     break;
@@ -1694,7 +1698,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
     // XXX - I think PartOffset should give you this, but it seems to give the
     // size of the register which isn't useful.
-    unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
+    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
     unsigned Offset = 36 + VA.getLocMemOffset();
 
diff --git a/contrib/llvm/lib/Target/R600/R600Instructions.td b/contrib/llvm/lib/Target/R600/R600Instructions.td
index b1d3ce2..05957d2 100644
--- a/contrib/llvm/lib/Target/R600/R600Instructions.td
+++ b/contrib/llvm/lib/Target/R600/R600Instructions.td
@@ -1193,7 +1193,7 @@ let Predicates = [isR600] in {
   def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
 
   def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
-  defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+  def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
 
   def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
 
diff --git a/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp
index c99219d..b8165fb 100644
--- a/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -83,7 +83,7 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   void insertElse(BranchInst *Term);
 
-  Value *handleLoopCondition(Value *Cond, PHINode *Broken);
+  Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L);
 
   void handleLoop(BranchInst *Term);
 
@@ -207,8 +207,17 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
 }
 
 /// \brief Recursively handle the condition leading to a loop
-Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) {
-  if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
+Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
+                                                  llvm::Loop *L) {
+
+  // Only search through PHI nodes which are inside the loop. If we try this
+  // with PHI nodes that are outside of the loop, we end up inserting new PHI
+  // nodes outside of the loop which depend on values defined inside the loop.
+  // This will break the module with
+  // 'Instruction does not dominate all users!' errors.
+  PHINode *Phi = nullptr;
+  if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
+
     BasicBlock *Parent = Phi->getParent();
     PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
     Value *Ret = NewPhi;
@@ -223,7 +232,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken)
       }
       Phi->setIncomingValue(i, BoolFalse);
-      Value *PhiArg = handleLoopCondition(Incoming, Broken);
+      Value *PhiArg = handleLoopCondition(Incoming, Broken, L);
       NewPhi->addIncoming(PhiArg, From);
     }
 
@@ -253,7 +262,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken)
 
   } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
     BasicBlock *Parent = Inst->getParent();
-    TerminatorInst *Insert = Parent->getTerminator();
+    Instruction *Insert;
+    if (L->contains(Inst)) {
+      Insert = Parent->getTerminator();
+    } else {
+      Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
+    }
     Value *Args[] = { Cond, Broken };
     return CallInst::Create(IfBreak, Args, "", Insert);
 
@@ -265,14 +279,15 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken)
 
 /// \brief Handle a back edge (loop)
 void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+  BasicBlock *BB = Term->getParent();
+  llvm::Loop *L = LI->getLoopFor(BB);
   BasicBlock *Target = Term->getSuccessor(1);
   PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
 
   Value *Cond = Term->getCondition();
   Term->setCondition(BoolTrue);
-  Value *Arg = handleLoopCondition(Cond, Broken);
+  Value *Arg = handleLoopCondition(Cond, Broken, L);
 
-  BasicBlock *BB = Term->getParent();
   for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
        PI != PE; ++PI) {
 
diff --git a/contrib/llvm/lib/Target/R600/SIDefines.h b/contrib/llvm/lib/Target/R600/SIDefines.h
index 7601794..b540140 100644
--- a/contrib/llvm/lib/Target/R600/SIDefines.h
+++ b/contrib/llvm/lib/Target/R600/SIDefines.h
@@ -35,7 +35,8 @@ enum {
   SMRD = 1 << 16,
   DS = 1 << 17,
   MIMG = 1 << 18,
-  FLAT = 1 << 19
+  FLAT = 1 << 19,
+  WQM = 1 << 20
 };
 }
 
diff --git a/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp b/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp
index d8ffa4f..cb24bba 100644
--- a/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp
+++ b/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp
@@ -209,7 +209,12 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       APInt Imm;
 
       if (FoldingImm) {
-        const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
+        unsigned UseReg = UseOp.getReg();
+        const TargetRegisterClass *UseRC
+          = TargetRegisterInfo::isVirtualRegister(UseReg) ?
+          MRI.getRegClass(UseReg) :
+          TRI.getRegClass(UseReg);
+
         Imm = APInt(64, OpToFold.getImm());
 
         // Split 64-bit constants into 32-bits for folding.
@@ -228,8 +233,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       // In order to fold immediates into copies, we need to change the
       // copy to a MOV.
       if (UseMI->getOpcode() == AMDGPU::COPY) {
-        unsigned MovOp = TII->getMovOpcode(
-          MRI.getRegClass(UseMI->getOperand(0).getReg()));
+        unsigned DestReg = UseMI->getOperand(0).getReg();
+        const TargetRegisterClass *DestRC
+          = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+          MRI.getRegClass(DestReg) :
+          TRI.getRegClass(DestReg);
+
+        unsigned MovOp = TII->getMovOpcode(DestRC);
         if (MovOp == AMDGPU::COPY)
           continue;
 
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
index 6b2ea06..32ae605 100644
--- a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -89,8 +89,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
 
   setOperationAction(ISD::STORE, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -158,8 +156,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   for (MVT VT : MVT::fp_valuetypes())
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 
-  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
-  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
@@ -214,6 +210,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   }
 
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
+  setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
@@ -314,9 +311,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (!VT.isSimple() || VT == MVT::Other)
     return false;
 
-  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
-  // see what for specifically. The wording everywhere else seems to be the
-  // same.
+  // TODO - CI+ supports unaligned memory accesses, but this requires driver
+  // support.
 
   // XXX - The only mention I see of this in the ISA manual is for LDS direct
   // reads the "byte address and must be dword aligned". Is it also true for the
@@ -328,12 +324,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     return Align % 4 == 0;
   }
 
+  // Smaller than dword value must be aligned.
+  // FIXME: This should be allowed on CI+
+  if (VT.bitsLT(MVT::i32))
+    return false;
+
   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
   // byte-address are ignored, thus forcing Dword alignment.
   // This applies to private, global, and constant memory.
   if (IsFast)
     *IsFast = true;
-  return VT.bitsGT(MVT::i32);
+
+  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
 }
 
 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
@@ -448,7 +450,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
       // three or five element vertex only needs three or five registers,
      // NOT four or eigth.
-      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
       unsigned NumElements = ParamType->getVectorNumElements();
 
       for (unsigned j = 0; j != NumElements; ++j) {
@@ -531,7 +533,7 @@ SDValue SITargetLowering::LowerFormalArguments(
                                    Offset, Ins[i].Flags.isSExt());
 
       const PointerType *ParamTy =
-        dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
+        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
         // On SI local pointers are just offsets into LDS, so they are always
@@ -566,7 +568,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     if (Arg.VT.isVector()) {
 
       // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
       unsigned NumElements = ParamType->getVectorNumElements();
 
       SmallVector<SDValue, 4> Regs;
@@ -919,6 +921,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                        Op.getOperand(1),
                        Op.getOperand(2),
                        Op.getOperand(3));
+
+  case AMDGPUIntrinsic::AMDGPU_fract:
+  case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
+    return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
+                       DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
+
   default:
     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
   }
@@ -1104,7 +1112,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
-  return SDValue();
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return LowerFastFDIV(Op, DAG);
+
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
+
+  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
+
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
+
+  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
+
+  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
+
+  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
+
+  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
+
+  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
+
+  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
+
+  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
+                             NegDivScale0, Mul, DivScale1);
+
+  SDValue Scale;
+
+  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    // Workaround a hardware bug on SI where the condition output from div_scale
+    // is not usable.
+
+    const SDValue Hi = DAG.getConstant(1, MVT::i32);
+
+    // Figure out if the scale to use for div_fmas.
+    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
+    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
+    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
+
+    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
+    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
+
+    SDValue Scale0Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
+    SDValue Scale1Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
+
+    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
+    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
+    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
+  } else {
+    Scale = DivScale1.getValue(1);
+  }
+
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
+                             Fma4, Fma3, Mul, Scale);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
 }
 
 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
@@ -1125,11 +1196,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Store->getMemoryVT();
 
   // These stores are legal.
-  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
-      VT.isVector() && VT.getVectorNumElements() == 2 &&
-      VT.getVectorElementType() == MVT::i32)
-    return SDValue();
-
   if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
     if (VT.isVector() && VT.getVectorNumElements() > 4)
       return ScalarizeVectorStore(Op, DAG);
@@ -1524,6 +1590,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::UMAX:
   case AMDGPUISD::UMIN: {
     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+        N->getValueType(0) != MVT::f64 &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMin3Max3Combine(N, DCI);
     break;
diff --git a/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp
index 181b116..50f20ac 100644
--- a/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp
+++ b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp
@@ -82,6 +82,8 @@ private:
   /// \brief Type of the last opcode.
   InstType LastOpcodeType;
 
+  bool LastInstWritesM0;
+
   /// \brief Get increment/decrement amount for this instruction.
   Counters getHwCounts(MachineInstr &MI);
 
@@ -106,6 +108,9 @@ private:
   /// \brief Resolve all operand dependencies to counter requirements
   Counters handleOperands(MachineInstr &MI);
 
+  /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
+  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
 public:
   SIInsertWaits(TargetMachine &tm) :
     MachineFunctionPass(ID),
@@ -269,6 +274,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
       // Insert a NOP to break the clause.
       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
           .addImm(0);
+      LastInstWritesM0 = false;
     }
 
     if (TII->isSMRD(I->getOpcode()))
@@ -362,6 +368,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
             ((Counts.Named.LGKM & 0x7) << 8));
 
   LastOpcodeType = OTHER;
+  LastInstWritesM0 = false;
   return true;
 }
 
@@ -403,6 +410,30 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
   return Result;
 }
 
+void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) {
+  if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return;
+
+  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
+ if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + LastInstWritesM0 = false; + return; + } + + // Set whether this instruction sets M0 + LastInstWritesM0 = false; + + unsigned NumOperands = I->getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + const MachineOperand &Op = I->getOperand(i); + + if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) + LastInstWritesM0 = true; + } +} + // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { @@ -417,6 +448,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { WaitedOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; + LastInstWritesM0 = false; memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); @@ -433,7 +465,9 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { Changes |= insertWait(MBB, I, LastIssued); else Changes |= insertWait(MBB, I, handleOperands(*I)); + pushInstruction(MBB, I); + handleSendMsg(MBB, I); } // Wait for everything at the end of the MBB diff --git a/contrib/llvm/lib/Target/R600/SIInstrFormats.td b/contrib/llvm/lib/Target/R600/SIInstrFormats.td index 09c0cbe..b825208 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/R600/SIInstrFormats.td @@ -38,6 +38,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> DS = 0; field bits<1> MIMG = 0; field bits<1> FLAT = 0; + field bits<1> WQM = 0; // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; @@ -64,6 +65,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{17} = DS; let TSFlags{18} = MIMG; let TSFlags{19} = FLAT; + let TSFlags{20} = WQM; // Most instructions require adjustments after selection to satisfy // operand requirements. 
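
The SIInsertWaits change above adds the LastInstWritesM0 flag and the handleSendMsg hook: on VOLCANIC_ISLANDS and newer parts, an "s_nop 0" has to separate any instruction that writes m0 from a following s_sendmsg, so the pass tracks m0 definitions and drops a nop in front of s_sendmsg when needed. A rough, self-contained model of that bookkeeping (plain C++; the Instr type and register names are illustrative only, not the LLVM MachineInstr API):

  #include <string>
  #include <vector>

  // Toy stand-in for a machine instruction: an opcode plus the registers it defines.
  struct Instr {
    std::string opcode;
    std::vector<std::string> defs;
  };

  // Walk a block and insert "s_nop" whenever the previous instruction wrote m0
  // and the current one is s_sendmsg; this is the same state machine that
  // handleSendMsg keeps with its LastInstWritesM0 flag.
  std::vector<Instr> insertSendMsgNops(const std::vector<Instr> &Block) {
    std::vector<Instr> Out;
    bool LastWroteM0 = false;
    for (const Instr &I : Block) {
      if (LastWroteM0 && I.opcode == "s_sendmsg")
        Out.push_back({"s_nop", {}});            // break the m0 -> sendmsg hazard
      Out.push_back(I);
      LastWroteM0 = false;
      for (const std::string &Def : I.defs)      // did this instruction define m0?
        if (Def == "m0")
          LastWroteM0 = true;
    }
    return Out;
  }
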
@@ -295,18 +297,32 @@ class VOP1e <bits<8> op> : Enc32 { } class VOP2e <bits<6> op> : Enc32 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; - bits<8> VDST; - bits<9> SRC0; - bits<8> VSRC1; - - let Inst{8-0} = SRC0; - let Inst{16-9} = VSRC1; - let Inst{24-17} = VDST; + let Inst{8-0} = src0; + let Inst{16-9} = src1; + let Inst{24-17} = vdst; let Inst{30-25} = op; let Inst{31} = 0x0; //encoding } +class VOP2_MADKe <bits<6> op> : Enc64 { + + bits<8> vdst; + bits<9> src0; + bits<8> vsrc1; + bits<32> src2; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63-32} = src2; +} + class VOP3e <bits<9> op> : Enc64 { bits<8> dst; @@ -554,9 +570,6 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : VOP2Common <outs, ins, asm, pattern>, VOP2e<op>; -class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP3Common <outs, ins, asm, pattern>, VOP3be<op>; - class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : VOPCCommon <ins, asm, pattern>, VOPCe <op>; @@ -585,9 +598,6 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> : let SchedRW = [WriteLDS]; } -class DS_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - DS <outs, ins, asm, pattern>, DSe<op>; - class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp index 80b560e..5ab33b4 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp @@ -121,12 +121,20 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (Load0->getOperand(0) != Load1->getOperand(0)) return false; + const ConstantSDNode *Load0Offset = + dyn_cast<ConstantSDNode>(Load0->getOperand(1)); + const ConstantSDNode *Load1Offset = + dyn_cast<ConstantSDNode>(Load1->getOperand(1)); + + if (!Load0Offset || !Load1Offset) + return false; + // Check chain. if (findChainOperand(Load0) != findChainOperand(Load1)) return false; - Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue(); - Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue(); + Offset0 = Load0Offset->getZExtValue(); + Offset1 = Load1Offset->getZExtValue(); return true; } @@ -333,6 +341,21 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (DestReg == AMDGPU::VCC) { + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -408,11 +431,15 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const { int NewOpc; // Try to map original to commuted opcode - if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1) + NewOpc = AMDGPU::getCommuteRev(Opcode); + // Check if the commuted (REV) opcode exists on the target. 
+ if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) return NewOpc; // Try to map commuted to original opcode - if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1) + NewOpc = AMDGPU::getCommuteOrig(Opcode); + // Check if the original (non-REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) return NewOpc; return Opcode; @@ -1121,6 +1148,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, return false; } + int RegClass = Desc.OpInfo[i].RegClass; + switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: if (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) { @@ -1131,7 +1160,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM32: break; case AMDGPU::OPERAND_REG_INLINE_C: - if (MI->getOperand(i).isImm() && !isInlineConstant(MI->getOperand(i))) { + if (isLiteralConstant(MI->getOperand(i))) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -1152,7 +1181,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, if (!MI->getOperand(i).isReg()) continue; - int RegClass = Desc.OpInfo[i].RegClass; if (RegClass != -1) { unsigned Reg = MI->getOperand(i).getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) @@ -1197,31 +1225,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } - // Verify SRC1 for VOP2 and VOPC - if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) { - const MachineOperand &Src1 = MI->getOperand(Src1Idx); - if (Src1.isImm()) { - ErrInfo = "VOP[2C] src1 cannot be an immediate."; - return false; - } - } - - // Verify VOP3 - if (isVOP3(Opcode)) { - if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) { - ErrInfo = "VOP3 src0 cannot be a literal constant."; - return false; - } - if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) { - ErrInfo = "VOP3 src1 cannot be a literal constant."; - return false; - } - if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) { - ErrInfo = "VOP3 src2 cannot be a literal constant."; - return false; - } - } - // Verify misc. restrictions on specific instructions. 
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { @@ -1292,6 +1295,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; + case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; } } @@ -2043,6 +2047,24 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { swapOperands(Inst); } break; + case AMDGPU::S_LSHL_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B64; + swapOperands(Inst); + } + break; case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.h b/contrib/llvm/lib/Target/R600/SIInstrInfo.h index 28cd27d..1298030 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.h @@ -204,6 +204,10 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + bool isWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::WQM; + } + bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO) const; bool isLiteralConstant(const MachineOperand &MO) const; @@ -243,7 +247,27 @@ public: /// the register class of its machine operand. /// to infer the correct register class base on the other operands. const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, - unsigned OpNo) const;\ + unsigned OpNo) const; + + /// \brief Return the size in bytes of the operand OpNo on the given + // instruction opcode. + unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { + const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; + + if (OpInfo.RegClass == -1) { + // If this is an immediate operand, this must be a 32-bit literal. + assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE); + return 4; + } + + return RI.getRegClass(OpInfo.RegClass)->getSize(); + } + + /// \brief This form should usually be preferred since it handles operands + /// with unknown register classes. + unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { + return getOpRegClass(MI, OpNo)->getSize(); + } /// \returns true if it is legal for the operand at index \p OpNo /// to read a VGPR. 
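
The getOpSize helpers added to SIInstrInfo.h above read an operand's byte size off its register class, and treat an operand with no register class as a 32-bit literal immediate. A stripped-down sketch of that rule (plain C++; OperandDesc and the size table are invented for the example and are not LLVM types):

  #include <cassert>
  #include <map>

  // Toy operand descriptor: a register-class id, or -1 for an immediate operand.
  struct OperandDesc {
    int RegClass;      // -1 means "no register class"
    bool IsImmediate;  // mirrors MCOI::OPERAND_IMMEDIATE
  };

  // Byte sizes for a few made-up register-class ids.
  const std::map<int, unsigned> RegClassSize = {
      {0, 4},   // a 32-bit SGPR/VGPR-like class
      {1, 8},   // a 64-bit class
      {2, 16},  // a 128-bit class
  };

  unsigned getOpSize(const OperandDesc &Op) {
    if (Op.RegClass == -1) {
      // No register class: the operand must be a 32-bit literal constant.
      assert(Op.IsImmediate && "operand without a register class must be an immediate");
      return 4;
    }
    return RegClassSize.at(Op.RegClass);
  }
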
diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.td b/contrib/llvm/lib/Target/R600/SIInstrInfo.td index 175e11d..a749e7f 100644 --- a/contrib/llvm/lib/Target/R600/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.td @@ -383,15 +383,13 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : let isPseudo = 1; } -class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> : - SOP1 <outs, ins, asm, pattern>, +class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, SOP1e <op.SI>, SIMCInstr<opName, SISubtarget.SI>; -class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> : - SOP1 <outs, ins, asm, pattern>, +class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, SOP1e <op.VI>, SIMCInstr<opName, SISubtarget.VI>; @@ -400,10 +398,10 @@ multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> { pattern>; def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; } multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> { @@ -411,10 +409,10 @@ multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> { pattern>; def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; } // no input, 64-bit output. @@ -422,12 +420,12 @@ multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>; def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins), - opName#" $dst", pattern> { + opName#" $dst"> { let SSRC0 = 0; } def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins), - opName#" $dst", pattern> { + opName#" $dst"> { let SSRC0 = 0; } } @@ -438,10 +436,10 @@ multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> { pattern>; def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; } class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : @@ -451,15 +449,13 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : let Size = 4; } -class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> : - SOP2<outs, ins, asm, pattern>, +class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, SOP2e<op.SI>, SIMCInstr<opName, SISubtarget.SI>; -class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> : - SOP2<outs, ins, asm, pattern>, +class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, SOP2e<op.VI>, SIMCInstr<opName, SISubtarget.VI>; @@ -469,11 +465,11 @@ multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, 
$src1 [$scc]", pattern>; + opName#" $dst, $src0, $src1 [$scc]">; def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]", pattern>; + opName#" $dst, $src0, $src1 [$scc]">; } multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> { @@ -481,10 +477,10 @@ multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> { (ins SSrc_32:$src0, SSrc_32:$src1), pattern>; def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>; + (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>; + (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; } multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> { @@ -492,10 +488,10 @@ multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> { (ins SSrc_64:$src0, SSrc_64:$src1), pattern>; def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1", pattern>; + (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1">; def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1", pattern>; + (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1">; } multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> { @@ -503,10 +499,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> { (ins SSrc_64:$src0, SSrc_32:$src1), pattern>; def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>; + (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst), - (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>; + (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1">; } @@ -527,15 +523,13 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : let isPseudo = 1; } -class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> : - SOPK <outs, ins, asm, pattern>, +class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, SOPKe <op.SI>, SIMCInstr<opName, SISubtarget.SI>; -class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm, - list<dag> pattern> : - SOPK <outs, ins, asm, pattern>, +class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, SOPKe <op.VI>, SIMCInstr<opName, SISubtarget.VI>; @@ -544,10 +538,10 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { pattern>; def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), - opName#" $dst, $src0", pattern>; + opName#" $dst, $src0">; } multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { @@ -555,10 +549,10 @@ multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { (ins SReg_32:$src0, u16imm:$src1), pattern>; def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0", pattern>; + (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">; def 
_vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0", pattern>; + (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">; } //===----------------------------------------------------------------------===// @@ -792,6 +786,7 @@ def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { let Src0RC32 = VCSrc_32; @@ -808,9 +803,14 @@ def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { } def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); + field string Asm = " $dst, $src0, $vsrc1, $src2"; +} def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; @@ -847,6 +847,15 @@ multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, SIMCInstr <opName#"_e32", SISubtarget.VI>; } +multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName> { + def "" : VOP1_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP1<op.SI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; + // No VI instruction. This class is for SI only. 
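
The VOP_MADK profile defined a little above pairs with the new VOP2_MADKe format earlier in this patch (the madmk/madak style multiply-adds): the third source is a 32-bit constant that simply rides along as the second dword of a 64-bit VOP2 encoding. A hedged sketch of packing those fields, following the bit ranges spelled out in VOP2_MADKe (plain C++, not an LLVM MC API):

  #include <cstdint>

  // Pack the VOP2_MADKe fields into one 64-bit word, using the bit positions
  // from the TableGen class: src0 in [8:0], vsrc1 in [16:9], vdst in [24:17],
  // op in [30:25], bit 31 clear, and the 32-bit literal in [63:32].
  uint64_t encodeVOP2MADK(uint32_t op, uint32_t vdst, uint32_t src0,
                          uint32_t vsrc1, uint32_t imm32) {
    uint64_t Inst = 0;
    Inst |= uint64_t(src0  & 0x1FF);        // bits 8-0
    Inst |= uint64_t(vsrc1 & 0xFF)  << 9;   // bits 16-9
    Inst |= uint64_t(vdst  & 0xFF)  << 17;  // bits 24-17
    Inst |= uint64_t(op    & 0x3F)  << 25;  // bits 30-25; bit 31 stays 0
    Inst |= uint64_t(imm32)         << 32;  // bits 63-32: the literal constant
    return Inst;
  }
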
+} + class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP2Common <outs, ins, "", pattern>, VOP <opName>, @@ -855,25 +864,22 @@ class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : } multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOpSI> { + string opName, string revOp> { def "" : VOP2_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOpSI#"_e32", !eq(revOpSI, opName)>; + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, - VOP2_REV<revOpSI#"_e32_si", !eq(revOpSI, opName)>, SIMCInstr <opName#"_e32", SISubtarget.SI>; } multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOpSI, string revOpVI> { + string opName, string revOp> { def "" : VOP2_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOpSI#"_e32", !eq(revOpSI, opName)>; + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, - VOP2_REV<revOpSI#"_e32_si", !eq(revOpSI, opName)>, SIMCInstr <opName#"_e32", SISubtarget.SI>; def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>, - VOP2_REV<revOpVI#"_e32_vi", !eq(revOpVI, opName)>, SIMCInstr <opName#"_e32", SISubtarget.VI>; } @@ -905,6 +911,16 @@ class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : VOP3e_vi <op>, SIMCInstr <opName#"_e64", SISubtarget.VI>; +class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3be <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI>; + +class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3be_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI>; + multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, int NumSrcArgs, bit HasMods = 1> { @@ -946,24 +962,45 @@ multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, VOP3DisableFields<0, 0, HasMods>; } +multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; + // No VI instruction. This class is for SI only. +} + multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOpSI, string revOpVI, + list<dag> pattern, string opName, string revOp, bit HasMods = 1, bit UseFullOp = 0> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOpSI#"_e64", !eq(revOpSI, opName)>; + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - def _si : VOP3_Real_si <op.SI3, - outs, ins, asm, opName>, - VOP2_REV<revOpSI#"_e64_si", !eq(revOpSI, opName)>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods>; +} + +multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - def _vi : VOP3_Real_vi <op.VI3, - outs, ins, asm, opName>, - VOP2_REV<revOpVI#"_e64_vi", !eq(revOpVI, opName)>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods>; + + // No VI instruction. This class is for SI only. 
} +// XXX - Is v_div_scale_{f32|f64} only available in vop3b without +// option of implicit vcc use? multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, bit HasMods = 1, bit UseFullOp = 0> { @@ -974,19 +1011,27 @@ multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, // can write it into any SGPR. We currently don't use the carry out, // so for now hardcode it to VCC as well. let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b <op.SI3, outs, ins, asm, pattern>, - VOP3DisableFields<1, 0, HasMods>, - SIMCInstr<opName#"_e64", SISubtarget.SI>, - VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>; - - // TODO: Do we need this VI variant here? - /*def _vi : VOP3b_vi <op.VI3, outs, ins, asm, pattern>, - VOP3DisableFields<1, 0, HasMods>, - SIMCInstr<opName#"_e64", SISubtarget.VI>, - VOP2_REV<revOp#"_e64_vi", !eq(revOp, opName)>;*/ + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; } // End sdst = SIOperand.VCC, Defs = [VCC] } +multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 1, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 1, HasMods>; +} + multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods, bit defExec> { @@ -1046,33 +1091,30 @@ multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> { - def _e32 : VOP1 <op.SI, P.Outs, P.Ins32, opName#P.Asm32, []>, - VOP <opName>; + defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; - def _e64 : VOP3Common <P.Outs, P.Ins64, opName#P.Asm64, + defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))])>, - VOP <opName>, - VOP3e <op.SI3>, - VOP3DisableFields<0, 0, P.HasModifiers>; + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + opName, P.HasModifiers>; } multiclass VOP2_Helper <vop2 op, string opName, dag outs, dag ins32, string asm32, list<dag> pat32, dag ins64, string asm64, list<dag> pat64, - string revOpSI, string revOpVI, bit HasMods> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOpSI, revOpVI>; + string revOp, bit HasMods> { + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3_2_m <op, - outs, ins64, opName#"_e64"#asm64, pat64, opName, revOpSI, revOpVI, HasMods + outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods >; } multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, - string revOpSI = opName, string revOpVI = revOpSI> : VOP2_Helper < + string revOp = opName> : VOP2_Helper < op, opName, P.Outs, P.Ins32, P.Asm32, [], P.Ins64, P.Asm64, @@ -1082,15 +1124,30 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOpSI, revOpVI, P.HasModifiers + revOp, 
P.HasModifiers >; +multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> { + defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#"_e64"#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + opName, revOp, P.HasModifiers>; +} + multiclass VOP2b_Helper <vop2 op, string opName, dag outs, dag ins32, string asm32, list<dag> pat32, dag ins64, string asm64, list<dag> pat64, string revOp, bit HasMods> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp, revOp>; + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3b_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods @@ -1116,16 +1173,16 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, dag ins32, string asm32, list<dag> pat32, dag ins64, string asm64, list<dag> pat64, - string revOpSI, string revOpVI, bit HasMods> { - defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOpSI>; + string revOp, bit HasMods> { + defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, - revOpSI, revOpVI, HasMods>; + revOp, HasMods>; } multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, - string revOpSI = opName, string revOpVI = revOpSI> + string revOp = opName> : VOP2_VI3_Helper < op, opName, P.Outs, P.Ins32, P.Asm32, [], @@ -1136,9 +1193,26 @@ multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOpSI, revOpVI, P.HasModifiers + revOp, P.HasModifiers >; +multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { + + def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>; + +let isCodeGenOnly = 0 in { + def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>, + VOP2_MADKe <op.SI>; + + def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>, + VOP2_MADKe <op.VI>; +} // End isCodeGenOnly = 0 +} + class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, @@ -1274,9 +1348,31 @@ multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, P.NumSrcArgs, P.HasModifiers >; +// Special case for v_div_fmas_{f32|f64}, since it seems to be the +// only VOP instruction that implicitly reads VCC. 
+multiclass VOP3_VCC_Inst <vop3 op, string opName, + VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, + P.Outs, + (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, + InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, + InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, + ClampMod:$clamp, + omod:$omod), + " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), + (i1 VCC)))], + 3, 1 +>; + multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, string opName, list<dag> pattern> : - VOP3b_2_m < + VOP3b_3_m < op, (outs vrc:$vdst, SReg_64:$sdst), (ins InputModsNoDefault:$src0_modifiers, arc:$src0, InputModsNoDefault:$src1_modifiers, arc:$src1, @@ -1307,22 +1403,21 @@ class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< // Interpolation opcodes //===----------------------------------------------------------------------===// -class VINTRP_Pseudo <string opName, dag outs, dag ins, string asm, - list<dag> pattern> : - VINTRPCommon <outs, ins, asm, pattern>, +class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + VINTRPCommon <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; } class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, - string asm, list<dag> pattern> : - VINTRPCommon <outs, ins, asm, pattern>, + string asm> : + VINTRPCommon <outs, ins, asm, []>, VINTRPe <op>, SIMCInstr<opName, SISubtarget.SI>; class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, - string asm, list<dag> pattern> : - VINTRPCommon <outs, ins, asm, pattern>, + string asm> : + VINTRPCommon <outs, ins, asm, []>, VINTRPe_vi <op>, SIMCInstr<opName, SISubtarget.VI>; @@ -1331,11 +1426,11 @@ multiclass VINTRP_m <bits <2> op, string opName, dag outs, dag ins, string asm, list<dag> pattern = []> { let DisableEncoding = disableEncoding, Constraints = constraints in { - def "" : VINTRP_Pseudo <opName, outs, ins, asm, pattern>; + def "" : VINTRP_Pseudo <opName, outs, ins, pattern>; - def _si : VINTRP_Real_si <op, opName, outs, ins, asm, pattern>; + def _si : VINTRP_Real_si <op, opName, outs, ins, asm>; - def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm, pattern>; + def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm>; } } @@ -1467,70 +1562,92 @@ multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]", []>; -class DS_1A_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> : - DS_si <op, outs, ins, asm, pat> { - bits<16> offset; - - // Single load interpret the 2 i8imm operands as a single i16 offset. - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; - - let hasSideEffects = 0; +// 1 address, 1 data. +multiclass DS_1A1D_RET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1, + hasPostISelHook = 1 // Adjusted to no return version. + in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 1>; + + let data1 = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } } -// 1 address, 1 data. 
-class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A_si < - op, +multiclass DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = ""> : DS_1A1D_RET_m < + op, asm, (outs rc:$vdst), (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), - asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>, - AtomicNoRet<noRetOp, 1> { - - let data1 = 0; - let mayStore = 1; - let mayLoad = 1; + asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", [], noRetOp>; - let hasPostISelHook = 1; // Adjusted to no return version. +// 1 address, 2 data. +multiclass DS_1A2D_RET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1, + hasPostISelHook = 1 // Adjusted to no return version. + in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 1>; + + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } } -// 1 address, 2 data. -class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A_si < - op, +multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = ""> : DS_1A2D_RET_m < + op, asm, (outs rc:$vdst), (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]", - []>, - AtomicNoRet<noRetOp, 1> { - let mayStore = 1; - let mayLoad = 1; - let hasPostISelHook = 1; // Adjusted to no return version. -} + [], noRetOp>; // 1 address, 2 data. -class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A_si < - op, +multiclass DS_1A2D_NORET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 0>; + + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = asm> : DS_1A2D_NORET_m < + op, asm, (outs), (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), asm#" $addr, $data0, $data1"#"$offset"#" [M0]", - []>, - AtomicNoRet<noRetOp, 0> { - let mayStore = 1; - let mayLoad = 1; -} + [], noRetOp>; // 1 address, 1 data. 
-class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A_si < - op, +multiclass DS_1A1D_NORET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 0>; + + let data1 = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } +} + +multiclass DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = asm> : DS_1A1D_NORET_m < + op, asm, (outs), (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), asm#" $addr, $data0"#"$offset"#" [M0]", - []>, - AtomicNoRet<noRetOp, 0> { - - let data1 = 0; - let mayStore = 1; - let mayLoad = 1; -} + [], noRetOp>; //===----------------------------------------------------------------------===// // MTBUF classes @@ -1596,45 +1713,111 @@ multiclass MTBUF_Load_Helper <bits<3> op, string opName, // MUBUF classes //===----------------------------------------------------------------------===// -class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { - let lds = 0; -} - -class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> { - let lds = 0; +class mubuf <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; } class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { - bit IsAddr64 = is_addr64; string OpName = NAME # suffix; } -class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> - : MUBUF_si <op, outs, ins, asm, pattern> { +class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MUBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + + // dummy fields, so that we can use let statements around multiclasses + bits<1> offen; + bits<1> idxen; + bits<8> vaddr; + bits<1> glc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let lds = 0; +} - let offen = 0; - let idxen = 0; - let addr64 = 1; - let tfe = 0; +class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe_vi <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { let lds = 0; - let soffset = 128; } -class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> - : MUBUF_si <op, outs, ins, asm, pattern> { +multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0>; - let offen = 0; - let idxen = 0; - let addr64 = 0; - let tfe = 0; + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; +} + +multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs, + dag ins, string asm, list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1>; + + let addr64 = 1 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. 
+} + +class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { let lds = 0; - let vaddr = 0; } -multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc, +multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_OFFSET", is_return>; + + let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_ADDR64", is_return>; + + let offen = 0, idxen = 0, addr64 = 1, tfe = 0, soffset = 128 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. +} + +multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, ValueType vt, SDPatternOperator atomic> { let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { @@ -1642,208 +1825,135 @@ multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc, // No return variants let glc = 0 in { - def _ADDR64 : MUBUFAtomicAddr64 < - op, (outs), + defm _ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_addr64", (outs), (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", [] - >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>; + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", [], 0 + >; - def _OFFSET : MUBUFAtomicOffset < - op, (outs), + defm _OFFSET : MUBUFAtomicOffset_m < + op, name#"_offset", (outs), (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, SCSrc_32:$soffset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [] - >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>; + name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 + >; } // glc = 0 // Variant that return values let glc = 1, Constraints = "$vdata = $vdata_in", DisableEncoding = "$vdata_in" in { - def _RTN_ADDR64 : MUBUFAtomicAddr64 < - op, (outs rc:$vdata), + defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_rtn_addr64", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset, - i1:$slc), vt:$vdata_in))] - >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>; + i1:$slc), vt:$vdata_in))], 1 + >; - def _RTN_OFFSET : MUBUFAtomicOffset < - op, (outs rc:$vdata), + defm _RTN_OFFSET : MUBUFAtomicOffset_m < + op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset, SCSrc_32:$soffset, slc:$slc), name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, - i1:$slc), vt:$vdata_in))] - >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>; + i1:$slc), vt:$vdata_in))], 1 + >; } // glc = 1 } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } -multiclass MUBUF_Load_Helper 
<bits<7> op, string asm, RegisterClass regClass, +multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { let mayLoad = 1, mayStore = 0 in { - - let addr64 = 0 in { - - let offen = 0, idxen = 0, vaddr = 0 in { - def _OFFSET : MUBUF_si <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, - mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, - slc:$slc, tfe:$tfe), - asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", - [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, - i32:$soffset, i16:$offset, - i1:$glc, i1:$slc, i1:$tfe)))]>, - MUBUFAddr64Table<0>; - } - - let offen = 1, idxen = 0 in { - def _OFFEN : MUBUF_si <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VGPR_32:$vaddr, - SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, - tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; - } - - let offen = 0, idxen = 1 in { - def _IDXEN : MUBUF_si <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VGPR_32:$vaddr, - mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, - slc:$slc, tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; - } - - let offen = 1, idxen = 1 in { - def _BOTHEN : MUBUF_si <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; - } - } - - let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { - def _ADDR64 : MUBUF_si <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), - asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", - [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, - i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>; - } - } -} - -multiclass MUBUF_Load_Helper_vi <bits<7> op, string asm, RegisterClass regClass, - ValueType load_vt = i32, - SDPatternOperator ld = null_frag> { - - let lds = 0, mayLoad = 1 in { let offen = 0, idxen = 0, vaddr = 0 in { - def _OFFSET : MUBUF_vi <op, (outs regClass:$vdata), + defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata), (ins SReg_128:$srsrc, mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, - i1:$glc, i1:$slc, i1:$tfe)))]>, - MUBUFAddr64Table<0>; + i1:$glc, i1:$slc, i1:$tfe)))]>; } let offen = 1, idxen = 0 in { - def _OFFEN : MUBUF_vi <op, (outs regClass:$vdata), + defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata), (ins SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } let offen = 0, idxen = 1 in { - def _IDXEN : MUBUF_vi <op, (outs regClass:$vdata), + defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata), (ins SReg_128:$srsrc, VGPR_32:$vaddr, mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } let offen = 1, idxen = 1 in { - def _BOTHEN : 
MUBUF_vi <op, (outs regClass:$vdata), + defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_64:$vaddr, SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; + name#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { + defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata), + (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", + [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, + i64:$vaddr, i16:$offset)))]>; } } } -multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, +multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, ValueType store_vt, SDPatternOperator st> { - let mayLoad = 0, mayStore = 1 in { - let addr64 = 0 in { - - def "" : MUBUF_si < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, - mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, - tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# - "$glc"#"$slc"#"$tfe", - [] - >; + defm : MUBUF_m <op, name, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, + mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; let offen = 0, idxen = 0, vaddr = 0 in { - def _OFFSET : MUBUF_si < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, - SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", - [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, - i1:$tfe))] - >, MUBUFAddr64Table<0>; + defm _OFFSET : MUBUF_m <op, name#"_offset",(outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, + SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>; } // offen = 0, idxen = 0, vaddr = 0 let offen = 1, idxen = 0 in { - def _OFFEN : MUBUF_si < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# - "$glc"#"$slc"#"$tfe", - [] - >; + defm _OFFEN : MUBUF_m <op, name#"_offen", (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; } // end offen = 1, idxen = 0 - } // End addr64 = 0 - - def _ADDR64 : MUBUF_si < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", - [(st store_vt:$vdata, - (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1> - { - - let mayLoad = 0; - let mayStore = 1; - - // Encoding - let offen = 0; - let idxen = 0; - let glc = 0; - let addr64 = 1; - let slc = 0; - let tfe = 0; - let soffset = 128; // ZERO - } - } // End mayLoad = 0, mayStore = 1 + let offen = 0, idxen = 0, 
glc = 0, slc = 0, tfe = 0, + soffset = 128 /* ZERO */ in { + defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", + [(st store_vt:$vdata, + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>; + } + } // End mayLoad = 0, mayStore = 1 } class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : @@ -1912,7 +2022,7 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> { class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < + RegisterClass src_rc, int wqm> : MIMG < op, (outs dst_rc:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, @@ -1924,33 +2034,41 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, let mayLoad = 1; let mayStore = 0; let hasPostISelHook = 1; + let WQM = wqm; } multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32>, + int channels, int wqm> { + def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>, MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>, + def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>, + def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>, + def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>, + def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, MIMG_Mask<asm#"_V16", channels>; } multiclass MIMG_Sampler <bits<7> op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1>; - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>; - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>; - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>; + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>; +} + +multiclass MIMG_Sampler_WQM <bits<7> op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>; } class MIMG_Gather_Helper <bits<7> op, string asm, RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < + RegisterClass src_rc, int wqm> : MIMG < op, (outs dst_rc:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, @@ -1971,28 +2089,36 @@ class MIMG_Gather_Helper <bits<7> op, string asm, // Therefore, disable all code which updates DMASK by setting these two: let MIMG = 0; let hasPostISelHook = 0; + let WQM = wqm; } multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32>, + int channels, int wqm> { + def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>, MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>, + def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>, 
MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>, + def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>, MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>, + def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>, MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>, + def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>, MIMG_Mask<asm#"_V16", channels>; } multiclass MIMG_Gather <bits<7> op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1>; - defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>; - defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>; - defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>; + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>; +} + +multiclass MIMG_Gather_WQM <bits<7> op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/R600/SIInstructions.td b/contrib/llvm/lib/Target/R600/SIInstructions.td index 4b1a846..bbedef2 100644 --- a/contrib/llvm/lib/Target/R600/SIInstructions.td +++ b/contrib/llvm/lib/Target/R600/SIInstructions.td @@ -152,9 +152,11 @@ defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", [(set i32:$dst, (ctlz_zero_undef i32:$src0))] >; -//defm S_FLBIT_I32_B64 : SOP1_32 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; -defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>; -//defm S_FLBIT_I32_I64 : SOP1_32 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>; +defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; +defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", + [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] +>; +defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>; defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8", [(set i32:$dst, (sext_inreg i32:$src0, i8))] >; @@ -764,88 +766,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">; //===----------------------------------------------------------------------===// -def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; -def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; -def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; -def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; -def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; -def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; -def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; -def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; -def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; -def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; -def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; -def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; -def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>; -def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; -def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; -def 
DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>; -def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>; - -def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; -def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; -def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; -def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; -def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; -def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; -def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; -def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; -def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; -def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; -def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; -def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; -def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; -def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; +defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; +defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; +defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; +defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; +defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; +defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; +defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; +defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; +defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; +defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; +defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; +defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; +defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; +defm DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; //def 
DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">; //def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">; -def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; -def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; -def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +defm DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; let SubtargetPredicate = isCI in { -def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; } // End isCI -def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; -def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; -def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; -def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; -def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; -def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; -def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; -def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; -def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; -def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; -def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; -def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; -def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>; -def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; -def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; -def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; -def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; - -def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; -def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; -def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; -def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; -def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; -def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; -def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; -def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; -def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; -def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; -def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; -def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; -def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; -def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; +defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; +defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; +defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", 
VReg_64>; +defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; +defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; +defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; +defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; +defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; +defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; +defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; +defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; +defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; +defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; +defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; +defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; + +defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; +defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; +defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; +defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; +defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; +defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; +defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; +defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; //def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">; //def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">; -def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; -def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; -def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_f64", VReg_64, "ds_min_f64">; -def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">; +defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; +defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; //let SubtargetPredicate = isCI in { // DS_CONDXCHG32_RTN_B64 @@ -874,123 +876,120 @@ defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>; defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>; -defm DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>; +defm DS_READ2_B64 : DS_Load2_Helper <0x00000077, "ds_read2_b64", 
VReg_128>; +defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000078, "ds_read2st64_b64", VReg_128>; //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isSICI in { - -//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>; -//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>; -//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>; -//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>; -//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>; -//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>; -//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>; +//def BUFFER_LOAD_FORMAT_X : MUBUF_ <mubuf<0x00>, "buffer_load_format_x", []>; +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <mubuf<0x01>, "buffer_load_format_xy", []>; +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <mubuf<0x02>, "buffer_load_format_xyz", []>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <mubuf<0x03>, "buffer_load_format_xyzw", VReg_128>; +//def BUFFER_STORE_FORMAT_X : MUBUF_ <mubuf<0x04>, "buffer_store_format_x", []>; +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <mubuf<0x05>, "buffer_store_format_xy", []>; +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <mubuf<0x06>, "buffer_store_format_xyz", []>; +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <mubuf<0x07>, "buffer_store_format_xyzw", []>; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < - 0x00000008, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global + mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global >; defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - 0x00000009, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global + mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global >; defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - 0x0000000a, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global + mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global >; defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - 0x0000000b, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global + mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - 0x0000000c, "buffer_load_dword", VGPR_32, i32, global_load + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - 0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - 0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < - 0x00000018, "buffer_store_byte", VGPR_32, i32, truncstorei8_global + mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global >; defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < - 0x0000001a, "buffer_store_short", VGPR_32, i32, truncstorei16_global + mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global >; defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < - 0x0000001c, "buffer_store_dword", VGPR_32, i32, global_store + mubuf<0x1c>, "buffer_store_dword", VGPR_32, 
i32, global_store >; defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - 0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store + mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store >; defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - 0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store + mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store >; -//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>; + defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < - 0x00000030, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global + mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global >; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>; defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < - 0x00000032, "buffer_atomic_add", VGPR_32, i32, atomic_add_global + mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global >; defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < - 0x00000033, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global + mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global >; -//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < - 0x00000035, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global + mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global >; defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < - 0x00000036, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global + mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global >; defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < - 0x00000037, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global + mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global >; defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < - 0x00000038, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global + mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global >; defm BUFFER_ATOMIC_AND : MUBUF_Atomic < - 0x00000039, "buffer_atomic_and", VGPR_32, i32, atomic_and_global + mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global >; defm BUFFER_ATOMIC_OR : MUBUF_Atomic < - 0x0000003a, "buffer_atomic_or", VGPR_32, i32, atomic_or_global + mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global >; defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < - 0x0000003b, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global ->; -//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>; -//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>; -//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>; -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>; -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>; -//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, 
"buffer_atomic_umin_x2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>; -//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>; -//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>; -//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>; -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>; -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>; - -} // End SubtargetPredicate = isSICI + mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global +>; +//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI +//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>; //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1037,63 +1036,63 @@ 
defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "image_sample">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "image_sample_cl">; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; -defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "image_sample_b">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; -defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "image_sample_c">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "image_sample_c_b">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; -defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "image_sample_o">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "image_sample_b_o">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "image_sample_c_o">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, 
"image_sample_c_b_o">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; -defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "image_gather4">; -defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "image_gather4_cl">; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; -defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "image_gather4_b">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "image_gather4_b_cl">; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; -defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "image_gather4_c">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "image_gather4_c_cl">; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; -defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "image_gather4_c_b">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; -defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "image_gather4_o">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "image_gather4_cl_o">; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; -defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "image_gather4_b_o">; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; -defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "image_gather4_c_o">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "image_get_lod">; +defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, 
"image_sample_cd_cl">; defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; @@ -1445,53 +1444,37 @@ defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32, fminnum>; defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32, fmaxnum>; -defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32, - AMDGPUsmin ->; -defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32, - AMDGPUsmax ->; -defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32, - AMDGPUumin ->; -defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32, - AMDGPUumax ->; +defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>; +defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>; +defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>; +defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>; -// No non-Rev Op on VI defm V_LSHRREV_B32 : VOP2Inst < vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshr_b32", "v_lshrrev_b32" + "v_lshr_b32" >; -// No non-Rev OP on VI defm V_ASHRREV_I32 : VOP2Inst < vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, - "v_ashr_i32", "v_ashrrev_i32" + "v_ashr_i32" >; -// No non-Rev OP on VI defm V_LSHLREV_B32 : VOP2Inst < vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshl_b32", "v_lshlrev_b32" + "v_lshl_b32" >; -defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", - VOP_I32_I32_I32, and>; -defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", - VOP_I32_I32_I32, or ->; -defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", - VOP_I32_I32_I32, xor ->; +defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>; +defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>; +defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>; } // End isCommutable = 1 -defm V_MADMK_F32 : VOP2Inst <vop2<0x20, 0x17>, "v_madmk_f32", VOP_F32_F32_F32>; +defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">; let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2Inst <vop2<0x21, 0x18>, "v_madak_f32", VOP_F32_F32_F32>; +defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; } // End isCommutable = 1 let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC @@ -1503,9 +1486,7 @@ let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", VOP_I32_I32_I32, add >; -defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", - VOP_I32_I32_I32, sub ->; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", VOP_I32_I32_I32, null_frag, "v_sub_i32" @@ -1513,10 +1494,10 @@ defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", let Uses = [VCC] in { // Carry-in comes from VCC defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", - VOP_I32_I32_I32_VCC, adde + VOP_I32_I32_I32_VCC >; defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", - VOP_I32_I32_I32_VCC, sube + VOP_I32_I32_I32_VCC >; defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" @@ -1529,47 +1510,41 @@ defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, "v_readlane_b32", (outs SReg_32:$vdst), - (ins VGPR_32:$src0, SSrc_32:$vsrc1), - "v_readlane_b32 
$vdst, $src0, $vsrc1" + (ins VGPR_32:$src0, SCSrc_32:$src1), + "v_readlane_b32 $vdst, $src0, $src1" >; defm V_WRITELANE_B32 : VOP2SI_3VI_m < vop3 <0x002, 0x28a>, "v_writelane_b32", (outs VGPR_32:$vdst), - (ins SReg_32:$src0, SSrc_32:$vsrc1), - "v_writelane_b32 $vdst, $src0, $vsrc1" + (ins SReg_32:$src0, SCSrc_32:$src1), + "v_writelane_b32 $vdst, $src0, $src1" >; // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { -let isCommutable = 1 in { -defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32", - VOP_F32_F32_F32 ->; -} // End isCommutable = 1 - -defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32", +defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy >; -defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32", +defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy >; let isCommutable = 1 in { -defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>; -defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32", - VOP_I32_I32_I32, sra ->; - -let hasPostISelHook = 1 in { -defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>; -} - +defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>; } // End isCommutable = 1 } // End let SubtargetPredicate = SICI +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32", + VOP_F32_F32_F32 +>; +} // End isCommutable = 1 + defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32, AMDGPUbfm >; @@ -1586,14 +1561,25 @@ defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp >; -////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>; -////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>; -////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>; + +defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32", + VOP_I32_F32_I32>; // TODO: set "Uses = dst" + +defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32", + VOP_I32_F32_F32 +>; defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32", - VOP_I32_F32_F32, int_SI_packf16 + VOP_I32_F32_F32, int_SI_packf16 +>; +defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32", + VOP_I32_I32_I32 +>; +defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32", + VOP_I32_I32_I32 >; -////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>; -////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>; //===----------------------------------------------------------------------===// // VOP3 Instructions @@ -1659,27 +1645,34 @@ defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32", VOP_I32_I32_I32_I32 >; -defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32", +defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32", VOP_F32_F32_F32_F32, AMDGPUfmin3>; -defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32", +defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32", VOP_I32_I32_I32_I32, AMDGPUsmin3 >; 
-defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32", +defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32", VOP_I32_I32_I32_I32, AMDGPUumin3 >; -defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32", +defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32", VOP_F32_F32_F32_F32, AMDGPUfmax3 >; -defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32", +defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32", VOP_I32_I32_I32_I32, AMDGPUsmax3 >; -defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32", +defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32", VOP_I32_I32_I32_I32, AMDGPUumax3 >; -//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>; -//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>; -//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>; +defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32", + VOP_F32_F32_F32_F32 +>; +defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32", + VOP_I32_I32_I32_I32 +>; +defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", + VOP_I32_I32_I32_I32 +>; + //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; @@ -1742,21 +1735,36 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", } // isCommutable = 1, SchedRW = [WriteQuarterRate32] +let SchedRW = [WriteFloatFMA, WriteSALU] in { defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +} -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; } // let SchedRW = [WriteDouble] -let isCommutable = 1 in { -defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", +let isCommutable = 1, Uses = [VCC] in { + +// v_div_fmas_f32: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^32 +// +defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; + let SchedRW = [WriteDouble] in { -defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", +// v_div_fmas_f64: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^64 +// +defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fmas >; + } // End SchedRW = [WriteDouble] } // End isCommutable = 1 @@ -1774,23 +1782,29 @@ defm V_TRIG_PREOP_F64 : VOP3Inst < // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { -defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", - VOP_I64_I64_I32, shl ->; - -defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", - VOP_I64_I64_I32, srl ->; - -defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", - VOP_I64_I64_I32, sra ->; +defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>; +defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>; +defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>; defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", VOP_F32_F32_F32_F32>; } // End SubtargetPredicate = isSICI +let SubtargetPredicate = isVI in { + +defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64", + VOP_I64_I32_I64 +>; +defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64", + VOP_I64_I32_I64 +>; +defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", + VOP_I64_I32_I64 +>; + +} // End SubtargetPredicate = isVI + 
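The comment introduced with V_DIV_FMAS_F32/F64 above spells out the instruction's semantics: a fused multiply-add whose result is conditionally scaled by 2^32 (or 2^64 for the f64 form) when VCC is set, which is why the new VOP3_VCC_Inst wrapper carries the implicit VCC use. A minimal scalar sketch of that semantics, assuming only what the comment states; the helper name is illustrative, not LLVM's:

#include <cassert>
#include <cmath>

// result = src0 * src1 + src2; if (vcc) result *= 2^32
float div_fmas_f32_model(float src0, float src1, float src2, bool vcc) {
  float result = std::fma(src0, src1, src2);
  if (vcc)
    result = std::ldexp(result, 32); // renormalize the pre-scaled operands
  return result;
}

int main() {
  assert(div_fmas_f32_model(1.5f, 2.0f, 0.25f, false) == 3.25f);
  assert(div_fmas_f32_model(1.5f, 2.0f, 0.25f, true) == std::ldexp(3.25f, 32));
  return 0;
}

The f64 variant is identical except that the conditional scale factor is 2^64.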
//===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// @@ -1809,8 +1823,8 @@ def SGPR_USE : InstSI <(outs),(ins), "", []>; // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. -let mayLoad = 1, mayStore = 1, hasSideEffects = 1, - Uses = [EXEC], Defs = [EXEC] in { +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { +let Uses = [EXEC], Defs = [EXEC] in { let isBranch = 1, isTerminator = 1 in { @@ -1867,15 +1881,18 @@ def SI_END_CF : InstSI < [(int_SI_end_cf i64:$saved)] >; +} // End Uses = [EXEC], Defs = [EXEC] + +let Uses = [EXEC], Defs = [EXEC,VCC] in { def SI_KILL : InstSI < (outs), (ins VSrc_32:$src), "si_kill $src", [(int_AMDGPU_kill f32:$src)] >; +} // End Uses = [EXEC], Defs = [EXEC,VCC] } // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 - // Uses = [EXEC], Defs = [EXEC] let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { @@ -2020,16 +2037,12 @@ def : Pat < (SI_KILL 0xbf800000) >; -let Predicates = [isSICI] in { - /* int_SI_vs_load_input */ def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) >; -} // End Predicates = [isSICI] - /* int_SI_export */ def : Pat < (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, @@ -2156,9 +2169,13 @@ def : Pat < //===----------------------------------------------------------------------===// let Predicates = [UnsafeFPMath] in { -def : RcpPat<V_RCP_F64_e32, f64>; -defm : RsqPat<V_RSQ_F64_e32, f64>; -defm : RsqPat<V_RSQ_F32_e32, f32>; + +//def : RcpPat<V_RCP_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F32_e32, f32>; + +def : RsqPat<V_RSQ_F32_e32, f32>; +def : RsqPat<V_RSQ_F64_e32, f64>; } //===----------------------------------------------------------------------===// @@ -2675,13 +2692,6 @@ def : Pat < (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) >; -def : Pat< - (fdiv f64:$src0, f64:$src1), - (V_MUL_F64 0 /* src0_modifiers */, $src0, - 0 /* src1_modifiers */, (V_RCP_F64_e32 $src1), - 0 /* clamp */, 0 /* omod */) ->; - def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, @@ -2716,16 +2726,12 @@ class Ext32Pat <SDNode ext> : Pat < def : Ext32Pat <zext>; def : Ext32Pat <anyext>; -let Predicates = [isSICI] in { - // Offset in an 32Bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) >; -} // End Predicates = [isSICI] - // The multiplication scales from [0,1] to the unsigned integer range def : Pat < (AMDGPUurecip i32:$src0), @@ -2907,7 +2913,6 @@ class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) >; -let Predicates = [isSICI] in { def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; @@ -2915,7 +2920,6 @@ def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; -} // End Predicates = [isSICI] // BUFFER_LOAD_DWORD*, 
addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen, @@ -2954,14 +2958,12 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; } -let Predicates = [isSICI] in { defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; -} // End Predicates = [isSICI] class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, @@ -2969,13 +2971,11 @@ class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < (Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) >; -let Predicates = [isSICI] in { def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; -} // End Predicates = [isSICI] /* class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < @@ -3246,6 +3246,12 @@ def : Pat < >; def : Pat < + (i1 (trunc i64:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (EXTRACT_SUBREG $a, sub0)), 1) +>; + +def : Pat < (i32 (bswap i32:$a)), (V_BFI_B32 (S_MOV_B32 0x00ff00ff), (V_ALIGNBIT_B32 $a, $a, 24), @@ -3257,6 +3263,28 @@ def : Pat < (V_CNDMASK_B32_e64 $src0, $src1, $src2) >; +//===----------------------------------------------------------------------===// +// Fract Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp index 068b22f..c319b32 100644 --- a/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp @@ -88,7 +88,8 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void LoadM0(MachineInstr &MI, MachineInstr *MovRel); + void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); + void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); void IndirectSrc(MachineInstr &MI); void IndirectDst(MachineInstr &MI); @@ -323,7 +324,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::LoadM0(MachineInstr &MI, 
MachineInstr *MovRel) { +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); @@ -333,8 +334,14 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { unsigned Idx = MI.getOperand(3).getReg(); if (AMDGPU::SReg_32RegClass.contains(Idx)) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx); + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(Idx) + .addImm(Offset); + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx); + } MBB.insert(I, MovRel); } else { @@ -363,6 +370,11 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) .addReg(AMDGPU::VCC); + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(AMDGPU::M0) + .addImm(Offset); + } // Do the actual move MBB.insert(I, MovRel); @@ -384,6 +396,33 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MI.eraseFromParent(); } +/// \param @VecReg The register which holds element zero of the vector +/// being addressed into. +/// \param[out] @Reg The base register to use in the indirect addressing instruction. +/// \param[in,out] @Offset As an input, this is the constant offset part of the +// indirect Index. e.g. v0 = v[VecReg + Offset] +// As an output, this is a constant value that needs +// to be added to the value stored in M0. +void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, + unsigned &Reg, + int &Offset) { + unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); + if (!SubReg) + SubReg = VecReg; + + const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); + int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; + + if (RegIdx < 0) { + Offset = RegIdx; + RegIdx = 0; + } else { + Offset = 0; + } + + Reg = RC->getRegister(RegIdx); +} + void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); @@ -391,18 +430,18 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { unsigned Dst = MI.getOperand(0).getReg(); unsigned Vec = MI.getOperand(2).getReg(); - unsigned Off = MI.getOperand(4).getImm(); - unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0); - if (!SubReg) - SubReg = Vec; + int Off = MI.getOperand(4).getImm(); + unsigned Reg; + + computeIndirectRegAndOffset(Vec, Reg, Off); MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SubReg + Off) + .addReg(Reg) .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Vec, RegState::Implicit); - LoadM0(MI, MovRel); + LoadM0(MI, MovRel, Off); } void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { @@ -411,20 +450,20 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - unsigned Off = MI.getOperand(4).getImm(); + int Off = MI.getOperand(4).getImm(); unsigned Val = MI.getOperand(5).getReg(); - unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0); - if (!SubReg) - SubReg = Dst; + unsigned Reg; + + computeIndirectRegAndOffset(Dst, Reg, Off); MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) - .addReg(SubReg + Off, RegState::Define) + .addReg(Reg, RegState::Define) .addReg(Val) .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Dst, RegState::Implicit); - 
LoadM0(MI, MovRel); + LoadM0(MI, MovRel, Off); } bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { @@ -447,7 +486,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isDS(MI.getOpcode())) + if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) NeedWQM = true; // Flat uses m0 in case it needs to access LDS. @@ -513,12 +552,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_INDIRECT_DST_V16: IndirectDst(MI); break; - - case AMDGPU::V_INTERP_P1_F32: - case AMDGPU::V_INTERP_P2_F32: - case AMDGPU::V_INTERP_MOV_F32: - NeedWQM = true; - break; } } } diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp index 58c2cd1..f502991 100644 --- a/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp @@ -14,7 +14,6 @@ #include "SIRegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -47,13 +46,31 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::VGPR255); Reserved.set(AMDGPU::VGPR254); + // Tonga and Iceland can only allocate a fixed number of SGPRs due + // to a hw bug. + if (ST.hasSGPRInitBug()) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). + // Assume XNACK_MASK is unused. + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + + for (unsigned i = Limit; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); + } + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { // FIXME: We should adjust the max number of waves based on LDS size. 
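The computeIndirectRegAndOffset helper added in the SILowerControlFlow changes above splits an indirect vector index into a base register index plus a residual constant that LoadM0 then folds into M0 with S_ADD_I32. A self-contained sketch of that split under the assumption stated in its doc comment (v0 = v[VecReg + Offset]); the function and variable names here are hypothetical:

#include <cassert>
#include <utility>

// Given the hardware index of the vector's first register and the constant
// part of the indirect index, return {register index to encode in the MOVREL
// instruction, offset that must still be added to M0}. A negative combined
// index cannot be encoded in the register operand, so it is pushed entirely
// into the M0 offset.
std::pair<int, int> split_indirect_index(int base_hw_reg, int const_offset) {
  int reg_idx = base_hw_reg + const_offset;
  int m0_offset = 0;
  if (reg_idx < 0) {
    m0_offset = reg_idx;
    reg_idx = 0;
  }
  return {reg_idx, m0_offset};
}

int main() {
  assert(split_indirect_index(8, 3) == std::make_pair(11, 0));  // folds into the register
  assert(split_indirect_index(2, -5) == std::make_pair(0, -3)); // underflow goes to M0
  return 0;
}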
- unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU()); + unsigned SGPRLimit = getNumSGPRsAllowed(ST.getGeneration(), + ST.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU()); for (regclass_iterator I = regclass_begin(), E = regclass_end(); @@ -204,7 +221,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Ctx.emitError("Ran out of VGPRs for spilling SGPR"); } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) .addReg(SubReg) .addImm(Spill.Lane); @@ -236,7 +255,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (isM0) SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) .addReg(Spill.VGPR) .addImm(Spill.Lane) .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); @@ -245,7 +266,22 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg); } } - TII->insertNOPs(MI, 3); + + // TODO: only do this when it is needed + switch (ST.getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI + TII->insertNOPs(MI, 3); + break; + case AMDGPUSubtarget::SEA_ISLANDS: + break; + default: // VOLCANIC_ISLANDS and later + // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI + // and later. This also applies to VALUs which write VCC, but we're + // unlikely to see VMEM use VCC. + TII->insertNOPs(MI, 4); + } + MI->eraseFromParent(); break; } @@ -490,14 +526,24 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { } } -unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const { - switch(WaveCount) { - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return 103; +unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const { + if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WaveCount) { + case 10: return 80; + case 9: return 80; + case 8: return 96; + default: return 102; + } + } else { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } } } diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.h b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h index d908ffd..1dfe530 100644 --- a/contrib/llvm/lib/Target/R600/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/Support/Debug.h" namespace llvm { @@ -111,7 +112,8 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount /// concurrent waves. 
- unsigned getNumSGPRsAllowed(unsigned WaveCount) const; + unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.td b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td index 1a1efb0..c63f305 100644 --- a/contrib/llvm/lib/Target/R600/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td @@ -209,7 +209,9 @@ def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>; +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 32; +} class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> { let OperandNamespace = "AMDGPU"; diff --git a/contrib/llvm/lib/Target/R600/VIInstrFormats.td b/contrib/llvm/lib/Target/R600/VIInstrFormats.td index 5285d18..c242235 100644 --- a/contrib/llvm/lib/Target/R600/VIInstrFormats.td +++ b/contrib/llvm/lib/Target/R600/VIInstrFormats.td @@ -136,6 +136,32 @@ class VOP3e_vi <bits<10> op> : Enc64 { let Inst{63} = src2_modifiers{0}; } +class VOP3be_vi <bits<10> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + class EXPe_vi : EXPe { let Inst{31-26} = 0x31; //encoding } diff --git a/contrib/llvm/lib/Target/R600/VIInstructions.td b/contrib/llvm/lib/Target/R600/VIInstructions.td index 24e66ce..4a6e933 100644 --- a/contrib/llvm/lib/Target/R600/VIInstructions.td +++ b/contrib/llvm/lib/Target/R600/VIInstructions.td @@ -9,18 +9,6 @@ // Instruction definitions for VI and newer. //===----------------------------------------------------------------------===// -let SubtargetPredicate = isVI in { - -defm BUFFER_LOAD_DWORD_VI : MUBUF_Load_Helper_vi < - 0x14, "buffer_load_dword", VGPR_32, i32, global_load ->; - -defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi < - 0x03, "buffer_load_format_xyzw", VReg_128 ->; - -} // End SubtargetPredicate = isVI - //===----------------------------------------------------------------------===// // SMEM Patterns @@ -28,37 +16,10 @@ defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi < let Predicates = [isVI] in { -// 1. Offset as 8bit DWORD immediate +// 1. 
Offset as 20bit DWORD immediate def : Pat < (SIload_constant v4i32:$sbase, IMM20bit:$offset), (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) >; -//===----------------------------------------------------------------------===// -// MUBUF Patterns -//===----------------------------------------------------------------------===// - -// Offset in an 32Bit VGPR -def : Pat < - (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) ->; - -// Offset in an 32Bit VGPR -def : Pat < - (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) ->; - -/* int_SI_vs_load_input */ -def : Pat< - (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_VI_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) ->; - -defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_VI_OFFSET, - BUFFER_LOAD_DWORD_VI_OFFEN, - BUFFER_LOAD_DWORD_VI_IDXEN, - BUFFER_LOAD_DWORD_VI_BOTHEN>; - } // End Predicates = [isVI] diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index f2eb6a8..66f54e9 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -392,12 +392,25 @@ static bool usesTheStack(const MachineFunction &MF) { return false; } -void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, - unsigned &CallOp, - const char *&Symbol) { - CallOp = STI.is64Bit() ? X86::W64ALLOCA : X86::CALLpcrel32; +void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + bool Is64Bit = STI.is64Bit(); + bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + + unsigned CallOp; + if (Is64Bit) + CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; + else + CallOp = X86::CALLpcrel32; - if (STI.is64Bit()) { + const char *Symbol; + if (Is64Bit) { if (STI.isTargetCygMing()) { Symbol = "___chkstk_ms"; } else { @@ -407,6 +420,37 @@ void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, Symbol = "_alloca"; else Symbol = "_chkstk"; + + MachineInstrBuilder CI; + + // All current stack probes take AX and SP as input, clobber flags, and + // preserve all registers. x86_64 probes leave RSP unmodified. + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // For the large code model, we have to call through a register. Use R11, + // as it is scratch in all supported calling conventions. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) + .addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); + } else { + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + } + + unsigned AX = Is64Bit ? X86::RAX : X86::EAX; + unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + CI.addReg(AX, RegState::Implicit) + .addReg(SP, RegState::Implicit) + .addReg(AX, RegState::Define | RegState::Implicit) + .addReg(SP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + if (Is64Bit) { + // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp + // themselves. 
It also does not clobber %rax so we can reuse it when + // adjusting %rsp. + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } } /// emitPrologue - Push callee-saved registers onto the stack, which @@ -739,11 +783,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. if (NumBytes >= StackProbeSize && UseStackProbe) { - const char *StackProbeSymbol; - unsigned CallOp; - - getStackProbeFunction(STI, CallOp, StackProbeSymbol); - // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); @@ -772,22 +811,17 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .setMIFlag(MachineInstr::FrameSetup); } - BuildMI(MBB, MBBI, DL, - TII.get(CallOp)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); + // Save a pointer to the MI where we set AX. + MachineBasicBlock::iterator SetRAX = MBBI; + --SetRAX; + + // Call __chkstk, __chkstk_ms, or __alloca. + emitStackProbeCall(MF, MBB, MBBI, DL); + + // Apply the frame setup flag to all inserted instrs. + for (; SetRAX != MBBI; ++SetRAX) + SetRAX->setFlag(MachineInstr::FrameSetup); - if (Is64Bit) { - // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp - // themself. It also does not clobber %rax so we can reuse it when - // adjusting %rsp. - BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) - .addReg(StackPtr) - .addReg(X86::RAX) - .setMIFlag(MachineInstr::FrameSetup); - } if (isEAXAlive) { // Restore EAX MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 9cb887a..448a365 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -27,9 +27,11 @@ public: explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} - static void getStackProbeFunction(const X86Subtarget &STI, - unsigned &CallOp, - const char *&Symbol); + /// Emit a call to the target's stack probe function. This is required for all + /// large stack allocations on Windows. The caller is required to materialize + /// the number of bytes to probe in RAX/EAX. + static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 177299b..85978d8 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15,6 +15,7 @@ #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" +#include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" @@ -10094,12 +10095,12 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, VT.getVectorNumElements() / 2); // Check for patterns which can be matched with a single insert of a 128-bit // subvector. 
- if (isShuffleEquivalent(Mask, 0, 1, 0, 1) || - isShuffleEquivalent(Mask, 0, 1, 4, 5)) { + bool OnlyUsesV1 = isShuffleEquivalent(Mask, 0, 1, 0, 1); + if (OnlyUsesV1 || isShuffleEquivalent(Mask, 0, 1, 4, 5)) { SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); + OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { @@ -10112,7 +10113,15 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, // Otherwise form a 128-bit permutation. // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. - unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; + int MaskLO = Mask[0]; + if (MaskLO == SM_SentinelUndef) + MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; + + int MaskHI = Mask[2]; + if (MaskHI == SM_SentinelUndef) + MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; + + unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, MVT::i8)); } @@ -17172,6 +17181,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/instruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); + case Intrinsic::x86_avx512_mask_valign_q_512: case Intrinsic::x86_avx512_mask_valign_d_512: // Vector source operands are swapped. @@ -21076,47 +21092,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, assert(!Subtarget->isTargetMachO()); - // The lowering is pretty easy: we're just emitting the call to _alloca. The - // non-trivial part is impdef of ESP. - - if (Subtarget->isTargetWin64()) { - if (Subtarget->isTargetCygMing()) { - // ___chkstk(Mingw64): - // Clobbers R10, R11, RAX and EFLAGS. - // Updates RSP. - BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) - .addExternalSymbol("___chkstk") - .addReg(X86::RAX, RegState::Implicit) - .addReg(X86::RSP, RegState::Implicit) - .addReg(X86::RAX, RegState::Define | RegState::Implicit) - .addReg(X86::RSP, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - } else { - // __chkstk(MSVCRT): does not update stack pointer. - // Clobbers R10, R11 and EFLAGS. - BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) - .addExternalSymbol("__chkstk") - .addReg(X86::RAX, RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - // RAX has the offset to be subtracted from RSP. - BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) - .addReg(X86::RSP) - .addReg(X86::RAX); - } - } else { - const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() || - Subtarget->isTargetWindowsItanium()) - ? 
"_chkstk" - : "_alloca"; - - BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol(StackProbeSymbol) - .addReg(X86::EAX, RegState::Implicit) - .addReg(X86::ESP, RegState::Implicit) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::ESP, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - } + X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -25558,45 +25534,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), - LHS.getValueType(), RHS, LHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), - addV, DAG.getConstant(0, addV.getValueType()), CC); + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, + DAG.getConstant(0, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), - RHS.getValueType(), LHS, RHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), - addV, DAG.getConstant(0, addV.getValueType()), CC); + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, + DAG.getConstant(0, addV.getValueType()), CC); } - if (VT.getScalarType() == MVT::i1) { - bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && - (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); - bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode()); - if (!IsSEXT0 && !IsVZero0) - return SDValue(); - bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) && - (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + if (VT.getScalarType() == MVT::i1 && + (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { + bool IsSEXT0 = + (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); - if (!IsSEXT1 && !IsVZero1) - return SDValue(); + if (!IsSEXT0 || !IsVZero1) { + // Swap the operands and update the condition code. 
+ std::swap(LHS, RHS); + CC = ISD::getSetCCSwappedOperands(CC); + + IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); + } if (IsSEXT0 && IsVZero1) { - assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); - if (CC == ISD::SETEQ) + assert(VT == LHS.getOperand(0).getValueType() && + "Uexpected operand type"); + if (CC == ISD::SETGT) + return DAG.getConstant(0, VT); + if (CC == ISD::SETLE) + return DAG.getConstant(1, VT); + if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); + + assert((CC == ISD::SETNE || CC == ISD::SETLT) && + "Unexpected condition code!"); return LHS.getOperand(0); } - if (IsSEXT1 && IsVZero0) { - assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type"); - if (CC == ISD::SETEQ) - return DAG.getNOT(DL, RHS.getOperand(0), VT); - return RHS.getOperand(0); - } } return SDValue(); diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index 71415c6..7baff19 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -279,7 +279,8 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { } let isCall = 1, isCodeGenOnly = 1 in - // __chkstk(MSVC): clobber R10, R11 and EFLAGS. + // __chkstk(MSVC): clobber R10, R11 and EFLAGS + // ___chkstk_ms(Mingw64): clobber R10, R11 and EFLAGS // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP. let Defs = [RAX, R10, R11, RSP, EFLAGS], Uses = [RSP] in { diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 7130ae2..b411d07 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -175,8 +175,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_permd, INTR_TYPE_2OP, X86ISD::VPERMV, 0), - X86_INTRINSIC_DATA(avx2_permps, INTR_TYPE_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), |
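Editor's sketch for the VIInstrFormats.td hunk above: the new VOP3be_vi class only declares field positions, so as a rough standalone illustration of the resulting 64-bit layout, the following packs the same fields by hand. The function name, the plain-integer interface, and the Mod0/Mod1/Mod2 parameter names are invented for this example and are not part of the change.

#include <cstdint>

// Packs the VOP3be_vi fields declared in VIInstrFormats.td into one 64-bit word.
static uint64_t encodeVOP3be_vi(unsigned Op, unsigned VDst, unsigned SDst,
                                bool Clamp, unsigned Src0, unsigned Src1,
                                unsigned Src2, unsigned OMod, bool Mod0,
                                bool Mod1, bool Mod2) {
  uint64_t Inst = 0;
  Inst |= uint64_t(VDst & 0xFF);           // Inst{7-0}   = vdst
  Inst |= uint64_t(SDst & 0x7F) << 8;      // Inst{14-8}  = sdst
  Inst |= uint64_t(Clamp) << 15;           // Inst{15}    = clamp
  Inst |= uint64_t(Op & 0x3FF) << 16;      // Inst{25-16} = op
  Inst |= uint64_t(0x34) << 26;            // Inst{31-26} = encoding
  Inst |= uint64_t(Src0 & 0x1FF) << 32;    // Inst{40-32} = src0
  Inst |= uint64_t(Src1 & 0x1FF) << 41;    // Inst{49-41} = src1
  Inst |= uint64_t(Src2 & 0x1FF) << 50;    // Inst{58-50} = src2
  Inst |= uint64_t(OMod & 0x3) << 59;      // Inst{60-59} = omod
  Inst |= uint64_t(Mod0) << 61;            // Inst{61}    = src0_modifiers{0}
  Inst |= uint64_t(Mod1) << 62;            // Inst{62}    = src1_modifiers{0}
  Inst |= uint64_t(Mod2) << 63;            // Inst{63}    = src2_modifiers{0}
  return Inst;
}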
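The X86FrameLowering.h comment above states the contract of the new emitStackProbeCall helper: the caller materializes the probe size in RAX/EAX and the helper emits the __chkstk/___chkstk_ms/_alloca call (plus the RSP adjustment on x86-64). A rough sketch of a call site, assuming the usual emitPrologue context (MF, MBB, MBBI, DL, TII, NumBytes and Is64Bit in scope); this fragment is illustrative and not compilable on its own:

// Put the number of bytes to probe into RAX/EAX, as the helper requires.
unsigned AXReg = Is64Bit ? X86::RAX : X86::EAX;
unsigned MovOp = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
BuildMI(MBB, MBBI, DL, TII.get(MovOp), AXReg)
    .addImm(NumBytes)
    .setMIFlag(MachineInstr::FrameSetup);
// The helper chooses the probe symbol and, on x86-64, re-applies the SUB from RSP.
X86FrameLowering::emitStackProbeCall(MF, MBB, MBBI, DL);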
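In the lowerV2X128VectorShuffle hunk above, the VPERM2X128 selection immediate is now built so that an undef lane index falls back to its neighbour, or to 0 if both are undef. A minimal standalone version of that computation, assuming a four-element mask where indices 0-3 name the 128-bit halves of V1, 4-7 those of V2, and -1 stands for SM_SentinelUndef (the helper name is invented for the example):

// Bits 1:0 of the result pick the source half for the low 128 bits,
// bits 5:4 pick the source half for the high 128 bits.
static unsigned perm2X128Immediate(const int Mask[4]) {
  int MaskLO = Mask[0] >= 0 ? Mask[0] : (Mask[1] >= 0 ? Mask[1] : 0);
  int MaskHI = Mask[2] >= 0 ? Mask[2] : (Mask[3] >= 0 ? Mask[3] : 0);
  return unsigned(MaskLO / 2) | (unsigned(MaskHI / 2) << 4);
}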
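The rewritten PerformISDSETCCCombine hunk canonicalizes setcc(sext(x:i1), 0) so the sign-extend ends up on the left, then folds each supported condition code. Since sext(x) is either 0 or -1, every case collapses to a constant, x, or NOT(x), which is what the DAG code returns. A small scalar model of that case analysis, with an invented function name and a reduced CondCode enum:

// Scalar model: evaluate setcc(sign_extend(x:i1), 0, CC) for x in {0, 1}.
enum CondCode { SETEQ, SETNE, SETLT, SETLE, SETGT, SETGE };

static bool foldSetccOfSextI1(bool X, CondCode CC) {
  int SExt = X ? -1 : 0;          // sign_extend of the i1 value
  switch (CC) {
  case SETGT: return SExt > 0;    // never true  -> constant 0
  case SETLE: return SExt <= 0;   // always true -> constant 1
  case SETEQ: return SExt == 0;   // equal to 0  -> NOT(x)
  case SETGE: return SExt >= 0;   // >= 0        -> NOT(x)
  case SETNE: return SExt != 0;   // != 0        -> x
  case SETLT: return SExt < 0;    // <  0        -> x
  }
  return false;
}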