Diffstat (limited to 'contrib/llvm/lib/Target')
491 files changed, 48320 insertions, 2465 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index bffd9e6..6c5a083 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -148,7 +148,7 @@ private: Color getColor(unsigned Register); Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L); }; -} +} // namespace char AArch64A57FPLoadBalancing::ID = 0; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index da22d8d..ada995b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -121,7 +121,7 @@ private: //===----------------------------------------------------------------------===// void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { - Triple TT(TM.getTargetTriple()); + const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatMachO()) { // Funny Darwin hack: This flag tells the linker that no global symbols // contain code that falls through to other global symbols (e.g. the obvious diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp index d973234..176403c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -102,7 +102,7 @@ public: } }; char AArch64BranchRelaxation::ID = 0; -} +} // namespace /// verify - check BBOffsets, BBSizes, alignment of islands void AArch64BranchRelaxation::verify() { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 1e2d1c3..efc328a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -136,6 +136,6 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); } -} +} // namespace #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 06ff9af..11eefc4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -135,7 +135,7 @@ struct LDTLSCleanup : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace char LDTLSCleanup::ID = 0; FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index c2470f7..acb3525 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -43,7 +43,7 @@ private: unsigned BitSize); }; char AArch64ExpandPseudo::ID = 0; -} +} // namespace /// \brief Transfer implicit operands on the pseudo instruction to the /// instructions created from the expansion. 
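Note: a pattern that recurs throughout this import is replacing `Triple TT(TM.getTargetTriple());` with `const Triple &TT = TM.getTargetTriple();`, as in the AArch64AsmPrinter hunk above, because `getTargetTriple()` now hands back the already-parsed `Triple` rather than a triple string (the TargetMachine and MC-layer hunks later in this diff make the same switch). A minimal sketch of the difference, using stand-in types rather than LLVM's own classes:

```cpp
#include <string>

// Stand-in for llvm::Triple: constructing one parses the triple string,
// so needless temporaries and copies are not free.
struct Triple {
  std::string Data;
  explicit Triple(const std::string &S) : Data(S) {} // parse happens here
  // implicit copy constructor duplicates the parsed state
};

// Stand-in for a TargetMachine that owns one parsed Triple.
struct TargetMachine {
  Triple TheTriple{"aarch64-unknown-freebsd"};
  const Triple &getTargetTriple() const { return TheTriple; }
};

void oldStyle(const TargetMachine &TM) {
  Triple TT(TM.getTargetTriple()); // creates a second Triple object
  (void)TT;
}

void newStyle(const TargetMachine &TM) {
  const Triple &TT = TM.getTargetTriple(); // binds to the existing object
  (void)TT;
}
```

In the pre-change tree the accessor returned the triple as a string, so the old form also re-parsed it on every use; the new form queries the one `Triple` the target machine already holds.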
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 9977e2b..d1523e8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -1678,6 +1678,9 @@ unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, bool WantZExt, MachineMemOperand *MMO) { + if(!TLI.allowsMisalignedMemoryAccesses(VT)) + return 0; + // Simplify this down to something we can handle. if (!simplifyAddress(Addr, VT)) return 0; @@ -1962,6 +1965,9 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, MachineMemOperand *MMO) { + if(!TLI.allowsMisalignedMemoryAccesses(VT)) + return false; + // Simplify this down to something we can handle. if (!simplifyAddress(Addr, VT)) return false; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h index b496fcc..11227ee 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -63,6 +63,6 @@ public: RegScavenger *RS) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1616ff1..0165ef9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -76,9 +76,6 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); -/// Value type used for condition codes. -static const MVT MVT_CC = MVT::i32; - AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -810,9 +807,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; - case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; - case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; - case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; @@ -1171,133 +1165,10 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = LHS.getOperand(0); } - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) .getValue(1); } -static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, - ISD::CondCode CC, SDValue CCOp, - SDValue Condition, unsigned NZCV, - SDLoc DL, SelectionDAG &DAG) { - unsigned Opcode = 0; - if (LHS.getValueType().isFloatingPoint()) - Opcode = AArch64ISD::FCCMP; - else if (RHS.getOpcode() == ISD::SUB) { - SDValue SubOp0 = RHS.getOperand(0); - if (const ConstantSDNode *SubOp0C = dyn_cast<ConstantSDNode>(SubOp0)) - if (SubOp0C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // See emitComparison() on why we can only do this for SETEQ and SETNE. 
- Opcode = AArch64ISD::CCMN; - RHS = RHS.getOperand(1); - } - } - if (Opcode == 0) - Opcode = AArch64ISD::CCMP; - - SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); - return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); -} - -/// Returns true if @p Val is a tree of AND/OR/SETCC operations. -static bool isConjunctionDisjunctionTree(const SDValue Val, unsigned Depth) { - if (!Val.hasOneUse()) - return false; - if (Val->getOpcode() == ISD::SETCC) - return true; - // Protect against stack overflow. - if (Depth > 1000) - return false; - if (Val->getOpcode() == ISD::AND || Val->getOpcode() == ISD::OR) { - SDValue O0 = Val->getOperand(0); - SDValue O1 = Val->getOperand(1); - return isConjunctionDisjunctionTree(O0, Depth+1) && - isConjunctionDisjunctionTree(O1, Depth+1); - } - return false; -} - -/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain -/// of CCMP/CFCMP ops. For example (SETCC_0 & SETCC_1) with condition cond0 and -/// cond1 can be transformed into "CMP; CCMP" with CCMP executing on cond_0 -/// and setting flags to inversed(cond_1) otherwise. -/// This recursive function produces DAG nodes that produce condition flags -/// suitable to determine the truth value of @p Val (which is AND/OR/SETCC) -/// by testing the result for the condition set to @p OutCC. If @p Negate is -/// set the opposite truth value is produced. If @p CCOp and @p Condition are -/// given then conditional comparison are created so that false is reported -/// when they are false. -static SDValue emitConjunctionDisjunctionTree( - SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, - SDValue CCOp = SDValue(), AArch64CC::CondCode Condition = AArch64CC::AL) { - assert(isConjunctionDisjunctionTree(Val, 0)); - // We're at a tree leaf, produce a c?f?cmp. - unsigned Opcode = Val->getOpcode(); - if (Opcode == ISD::SETCC) { - SDValue LHS = Val->getOperand(0); - SDValue RHS = Val->getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); - bool isInteger = LHS.getValueType().isInteger(); - if (Negate) - CC = getSetCCInverse(CC, isInteger); - SDLoc DL(Val); - // Determine OutCC and handle FP special case. - if (isInteger) { - OutCC = changeIntCCToAArch64CC(CC); - } else { - assert(LHS.getValueType().isFloatingPoint()); - AArch64CC::CondCode ExtraCC; - changeFPCCToAArch64CC(CC, OutCC, ExtraCC); - // Surpisingly some floating point conditions can't be tested with a - // single condition code. Construct an additional comparison in this case. - // See comment below on how we deal with OR conditions. - if (ExtraCC != AArch64CC::AL) { - SDValue ExtraCmp; - if (!CCOp.getNode()) - ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); - else { - SDValue ConditionOp = DAG.getConstant(Condition, DL, MVT_CC); - // Note that we want the inverse of ExtraCC, so NZCV is not inversed. - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); - ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, - NZCV, DL, DAG); - } - CCOp = ExtraCmp; - Condition = AArch64CC::getInvertedCondCode(ExtraCC); - OutCC = AArch64CC::getInvertedCondCode(OutCC); - } - } - - // Produce a normal comparison if we are first in the chain - if (!CCOp.getNode()) - return emitComparison(LHS, RHS, CC, DL, DAG); - // Otherwise produce a ccmp. 
- SDValue ConditionOp = DAG.getConstant(Condition, DL, MVT_CC); - AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); - return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, - DAG); - } - - // Construct comparison sequence for the left hand side. - SDValue LHS = Val->getOperand(0); - SDValue RHS = Val->getOperand(1); - - // We can only implement AND-like behaviour here, but negation is free. So we - // use (not (and (not x) (not y))) to implement (or x y). - bool isOr = Val->getOpcode() == ISD::OR; - assert((isOr || Val->getOpcode() == ISD::AND) && "Should have AND or OR."); - Negate ^= isOr; - - AArch64CC::CondCode RHSCC; - SDValue CmpR = - emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, isOr, CCOp, Condition); - SDValue CmpL = - emitConjunctionDisjunctionTree(DAG, LHS, OutCC, isOr, CmpR, RHSCC); - if (Negate) - OutCC = AArch64CC::getInvertedCondCode(OutCC); - return CmpL; -} - static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { SDValue Cmp; @@ -1356,55 +1227,47 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the -1 + // constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure + // both the LHS and RHS are truely zero extended and to make sure the + // transformation is profitable. if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { - const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); - - // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. - // For the i8 operand, the largest immediate is 255, so this can be easily - // encoded in the compare instruction. For the i16 operand, however, the - // largest immediate cannot be encoded in the compare. - // Therefore, use a sign extending load and cmn to avoid materializing the - // -1 constant. For example, - // movz w1, #65535 - // ldrh w0, [x0, #0] - // cmp w0, w1 - // > - // ldrsh w0, [x0, #0] - // cmn w0, #1 - // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) - // if and only if (sext LHS) == (sext RHS). The checks are in place to - // ensure both the LHS and RHS are truely zero extended and to make sure the - // transformation is profitable. 
- if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && - cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && - cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && - LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); - if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { - SDValue SExt = - DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, - DAG.getValueType(MVT::i16)); - Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, - RHS.getValueType()), - CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - goto CreateCCNode; + if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && + isa<LoadSDNode>(LHS)) { + if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, + DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); + return Cmp; + } } } - - if ((RHSC->isNullValue() || RHSC->isOne()) && - isConjunctionDisjunctionTree(LHS, 0)) { - bool Negate = (CC == ISD::SETNE) ^ RHSC->isNullValue(); - Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC, Negate); - goto CreateCCNode; - } } - Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); - -CreateCCNode: - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); return Cmp; } @@ -1561,7 +1424,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); - // The the values aren't constants, this isn't the pattern we're looking for. + // The values aren't constants, this isn't the pattern we're looking for. if (!CFVal || !CTVal) return Op; @@ -2559,7 +2422,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - const Triple TT(getTargetMachine().getTargetTriple()); + const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; @@ -3557,7 +3420,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, EltVT = MVT::i64; VecVT = MVT::v2i64; - // We want to materialize a mask with the the high bit set, but the AdvSIMD + // We want to materialize a mask with the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; @@ -7580,21 +7443,26 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, // // This routine does the actual conversion of such DUPs, once outer routines // have determined that everything else is in order. +// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold +// similarly here. 
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { - // We can handle most types of duplicate, but the lane ones have an extra - // operand saying *which* lane, so we need to know. - bool IsDUPLANE; switch (N.getOpcode()) { case AArch64ISD::DUP: - IsDUPLANE = false; - break; case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: - IsDUPLANE = true; + case AArch64ISD::MOVI: + case AArch64ISD::MOVIshift: + case AArch64ISD::MOVIedit: + case AArch64ISD::MOVImsl: + case AArch64ISD::MVNIshift: + case AArch64ISD::MVNImsl: break; default: + // FMOV could be supported, but isn't very useful, as it would only occur + // if you passed a bitcast' floating point immediate to an eligible long + // integer op (addl, smull, ...). return SDValue(); } @@ -7604,17 +7472,11 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { MVT ElementTy = NarrowTy.getVectorElementType(); unsigned NumElems = NarrowTy.getVectorNumElements(); - MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); + MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); SDLoc dl(N); - SDValue NewDUP; - if (IsDUPLANE) - NewDUP = DAG.getNode(N.getOpcode(), dl, NewDUPVT, N.getOperand(0), - N.getOperand(1)); - else - NewDUP = DAG.getNode(AArch64ISD::DUP, dl, NewDUPVT, N.getOperand(0)); - - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, NewDUP, + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, + DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), DAG.getConstant(NumElems, dl, MVT::i64)); } @@ -8913,6 +8775,14 @@ static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); } +/// Get rid of unnecessary NVCASTs (that don't change the type). +static SDValue performNVCASTCombine(SDNode *N) { + if (N->getValueType(0) == N->getOperand(0).getValueType()) + return N->getOperand(0); + + return SDValue(); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -8955,6 +8825,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: return performPostLD1Combine(N, DCI, false); + case AArch64ISD::NVCAST: + return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); case ISD::INTRINSIC_VOID: @@ -9260,8 +9132,3 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } - -bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, - EVT) const { - return false; -} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index db192c7..da42376 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -58,11 +58,6 @@ enum NodeType : unsigned { SBCS, ANDS, - // Conditional compares. 
Operands: left,right,falsecc,cc,flags - CCMP, - CCMN, - FCCMP, - // Floating point comparison FCMP, @@ -513,8 +508,6 @@ private: bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; - - bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; }; namespace AArch64 { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 1fe9c7f..2c52f34 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -525,13 +525,6 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_31Operand; } -// True if the 32-bit immediate is in the range [0,31] -def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ - return ((uint64_t)Imm) < 32; -}]> { - let ParserMatchClass = Imm0_31Operand; -} - // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand<i64>, ImmLeaf<i64, [{ return ((uint64_t)Imm) < 16; @@ -549,9 +542,7 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 16; -}]> { - let ParserMatchClass = Imm0_15Operand; -} +}]>; // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr @@ -2077,12 +2068,9 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype, - string mnemonic, SDNode OpNode> - : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond), - mnemonic, "\t$Rn, $imm, $nzcv, $cond", "", - [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv), - (i32 imm:$cond), NZCV))]>, +class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> + : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, Sched<[WriteI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2102,13 +2090,19 @@ class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype, let Inst{3-0} = nzcv; } +multiclass CondSetFlagsImm<bit op, string asm> { + def Wi : BaseCondSetFlagsImm<op, GPR32, asm> { + let Inst{31} = 0; + } + def Xi : BaseCondSetFlagsImm<op, GPR64, asm> { + let Inst{31} = 1; + } +} + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic, - SDNode OpNode> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), - mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", - [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv), - (i32 imm:$cond), NZCV))]>, +class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2128,19 +2122,11 @@ class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic, let Inst{3-0} = nzcv; } -multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> { - // immediate operand variants - def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> { +multiclass CondSetFlagsReg<bit op, string asm> { + def Wr : BaseCondSetFlagsReg<op, GPR32, asm> { let Inst{31} = 0; } - def Xi : BaseCondComparisonImm<op, GPR64, 
imm0_31, mnemonic, OpNode> { - let Inst{31} = 1; - } - // register operand variants - def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> { + def Xr : BaseCondSetFlagsReg<op, GPR64, asm> { let Inst{31} = 1; } } @@ -3948,14 +3934,11 @@ multiclass FPComparison<bit signalAllNans, string asm, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, - string mnemonic, list<dag> pat> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), - mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>, +class BaseFPCondComparison<bit signalAllNans, + RegisterClass regtype, string asm> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, Sched<[WriteFCmp]> { - let Uses = [NZCV]; - let Defs = [NZCV]; - bits<5> Rn; bits<5> Rm; bits<4> nzcv; @@ -3971,18 +3954,16 @@ class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, let Inst{3-0} = nzcv; } -multiclass FPCondComparison<bit signalAllNans, string mnemonic, - SDPatternOperator OpNode = null_frag> { - def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic, - [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv), - (i32 imm:$cond), NZCV))]> { +multiclass FPCondComparison<bit signalAllNans, string asm> { + let Defs = [NZCV], Uses = [NZCV] in { + def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> { let Inst{22} = 0; } - def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic, - [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv), - (i32 imm:$cond), NZCV))]> { + + def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> { let Inst{22} = 1; } + } // Defs = [NZCV], Uses = [NZCV] } //--- diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 6941a6b..8d8864c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -255,7 +255,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { void AArch64InstrInfo::instantiateCondBranch( MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, - const SmallVectorImpl<MachineOperand> &Cond) const { + ArrayRef<MachineOperand> Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); @@ -272,7 +272,7 @@ void AArch64InstrInfo::instantiateCondBranch( unsigned AArch64InstrInfo::InsertBranch( MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const { + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -369,7 +369,7 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, } bool AArch64InstrInfo::canInsertSelect( - const MachineBasicBlock &MBB, const SmallVectorImpl<MachineOperand> &Cond, + const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Check register classes. 
@@ -412,7 +412,7 @@ bool AArch64InstrInfo::canInsertSelect( void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DstReg, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -629,8 +629,8 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // base registers are identical, and the offset of a lower memory access + // the width doesn't overlap the offset of a higher memory access, // then the memory accesses are different. - if (getLdStBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) && - getLdStBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) { + if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) && + getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) { if (BaseRegA == BaseRegB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; @@ -1310,9 +1310,9 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { } bool -AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { +AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { switch (LdSt->getOpcode()) { default: return false; @@ -1336,7 +1336,7 @@ AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, }; } -bool AArch64InstrInfo::getLdStBaseRegImmOfsWidth( +bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, const TargetRegisterInfo *TRI) const { // Handle only loads/stores with base register followed by immediate offset. @@ -1434,7 +1434,7 @@ bool AArch64InstrInfo::getLdStBaseRegImmOfsWidth( /// Detect opportunities for ldp/stp formation. /// -/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. +/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, unsigned NumLoads) const { @@ -1443,7 +1443,7 @@ bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, return false; if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) return false; - // getLdStBaseRegImmOfs guarantees that oper 2 isImm. + // getMemOpBaseRegImmOfs guarantees that oper 2 isImm. unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); // Allow 6 bits of positive range. if (Ofs1 > 64) @@ -2459,15 +2459,15 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, return true; } -/// hasPattern - return true when there is potentially a faster code sequence +/// Return true when there is potentially a faster code sequence /// for an instruction chain ending in \p Root. All potential patterns are /// listed /// in the \p Pattern vector. Pattern should be sorted in priority order since /// the pattern evaluator stops checking as soon as it finds a faster sequence. 
-bool AArch64InstrInfo::hasPattern( +bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const { + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -2495,76 +2495,76 @@ bool AArch64InstrInfo::hasPattern( "ADDWrr does not have register operands"); if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP2); + Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP2); Found = true; } break; case AArch64::ADDXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP2); + Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP2); Found = true; } break; case AArch64::SUBWrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); + Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); Found = true; } break; case AArch64::SUBXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); + Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); Found = true; } break; case AArch64::ADDWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); Found = true; } break; case AArch64::ADDXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); Found = true; } break; case AArch64::SUBWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); Found = true; } break; case AArch64::SUBXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Pattern.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); + Patterns.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); Found = true; } break; @@ -2667,7 +2667,7 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, return MUL; } -/// genAlternativeCodeSequence - when hasPattern() finds a pattern +/// When getMachineCombinerPatterns() 
finds potential patterns, /// this function generates the instructions that could replace the /// original code sequence void AArch64InstrInfo::genAlternativeCodeSequence( diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index d296768..68c2a28 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -90,13 +90,13 @@ public: /// Hint that pairing the given load or store is unprofitable. void suppressLdStPair(MachineInstr *MI) const; - bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; - bool getLdStBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg, - int &Offset, int &Width, - const TargetRegisterInfo *TRI) const; + bool getMemOpBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg, + int &Offset, int &Width, + const TargetRegisterInfo *TRI) const; bool enableClusterLoads() const override { return true; } @@ -140,17 +140,14 @@ public: bool AllowModify = false) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; - bool canInsertSelect(const MachineBasicBlock &, - const SmallVectorImpl<MachineOperand> &Cond, unsigned, - unsigned, int &, int &, int &) const override; + bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, + unsigned, unsigned, int &, int &, int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, - const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; void getNoopForMachoTarget(MCInst &NopInst) const override; @@ -166,19 +163,17 @@ public: unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; bool optimizeCondBranch(MachineInstr *MI) const override; - /// hasPattern - return true when there is potentially a faster code sequence + /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in <Root>. All potential patterns are - /// listed - /// in the <Pattern> array. - bool hasPattern(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) + /// listed in the <Patterns> array. 
+ bool getMachineCombinerPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const override; - /// genAlternativeCodeSequence - when hasPattern() finds a pattern - /// this function generates the instructions that could replace the - /// original code sequence + /// When getMachineCombinerPatterns() finds patterns, this function generates + /// the instructions that could replace the original code sequence void genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; @@ -189,7 +184,7 @@ public: private: void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + ArrayRef<MachineOperand> Cond) const; }; /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2f1b893..653f802 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -66,20 +66,6 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisVT<4, i32>]>; -def SDT_AArch64CCMP : SDTypeProfile<1, 5, - [SDTCisVT<0, i32>, - SDTCisInt<1>, - SDTCisSameAs<1, 2>, - SDTCisInt<3>, - SDTCisInt<4>, - SDTCisVT<5, i32>]>; -def SDT_AArch64FCCMP : SDTypeProfile<1, 5, - [SDTCisVT<0, i32>, - SDTCisFP<1>, - SDTCisSameAs<1, 2>, - SDTCisInt<3>, - SDTCisInt<4>, - SDTCisVT<5, i32>]>; def SDT_AArch64FCmp : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>; @@ -174,10 +160,6 @@ def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; -def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>; -def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>; -def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; - def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; @@ -1036,10 +1018,13 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; //===----------------------------------------------------------------------===// -// Conditional comparison instructions. +// Conditionally set flags instructions. //===----------------------------------------------------------------------===// -defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>; -defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>; +defm CCMN : CondSetFlagsImm<0, "ccmn">; +defm CCMP : CondSetFlagsImm<1, "ccmp">; + +defm CCMN : CondSetFlagsReg<0, "ccmn">; +defm CCMP : CondSetFlagsReg<1, "ccmp">; //===----------------------------------------------------------------------===// // Conditional select instructions. 
@@ -2569,7 +2554,7 @@ defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; //===----------------------------------------------------------------------===// defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>; +defm FCCMP : FPCondComparison<0, "fccmp">; //===----------------------------------------------------------------------===// // Floating point conditional select instruction. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 186e71a..82f77a7 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -623,7 +623,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // and first alias with the second, we can combine the second into the // first. if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !UsedRegs[MI->getOperand(0).getReg()] && + !(MI->mayLoad() && UsedRegs[MI->getOperand(0).getReg()]) && !mayAlias(MI, MemInsns, TII)) { MergeForward = false; return MBBI; @@ -634,7 +634,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // first and the second alias with the first, we can combine the first // into the second. if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !UsedRegs[FirstMI->getOperand(0).getReg()] && + !(FirstMI->mayLoad() && + UsedRegs[FirstMI->getOperand(0).getReg()]) && !mayAlias(FirstMI, MemInsns, TII)) { MergeForward = true; return MBBI; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h index 1e29b80..908f66f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h @@ -47,6 +47,6 @@ public: MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 536a8d0..2a0f0a4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -158,6 +158,6 @@ private: MILOHContainer LOHContainerSet; SetOfInstructions LOHRelated; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index 5394875..bab8463 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -154,7 +154,7 @@ bool haveSameParity(unsigned reg1, unsigned reg2) { return isOdd(reg1) == isOdd(reg2); } -} +} // namespace bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h index 4f656f9..c83aea4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h @@ -33,6 +33,6 @@ private: // Add constraints between existing chains void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); }; -} +} // namespace llvm #endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h 
b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 11932d2..a993b60 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -28,6 +28,6 @@ public: unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 85b44a2..e8165a8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -57,7 +57,7 @@ private: } }; char AArch64StorePairSuppress::ID = 0; -} // anonymous +} // namespace FunctionPass *llvm::createAArch64StorePairSuppressPass() { return new AArch64StorePairSuppress(); @@ -142,7 +142,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { continue; unsigned BaseReg; unsigned Offset; - if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + if (TII->getMemOpBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 0b97af8..554826b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -42,14 +42,12 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { return *this; } -AArch64Subtarget::AArch64Subtarget(const std::string &TT, - const std::string &CPU, +AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), - HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), + HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index 5454b20..c9b54cc 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -29,6 +29,7 @@ namespace llvm { class GlobalValue; class StringRef; +class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: @@ -71,7 +72,7 @@ private: public: /// This constructor initializes the data members to match that /// of the specified triple. 
- AArch64Subtarget(const std::string &TT, const std::string &CPU, + AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian); @@ -90,7 +91,7 @@ public: } const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } - bool enablePostMachineScheduler() const override { + bool enablePostRAScheduler() const override { return isCortexA53() || isCortexA57(); } @@ -150,6 +151,6 @@ public: std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index f23dd33..5496a50 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -110,9 +110,8 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { } // Helper function to build a DataLayout string -static std::string computeDataLayout(StringRef TT, bool LittleEndian) { - Triple Triple(TT); - if (Triple.isOSBinFormatMachO()) +static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { + if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i64:64-i128:128-n32:64-S128"; @@ -121,7 +120,7 @@ static std::string computeDataLayout(StringRef TT, bool LittleEndian) { /// TargetMachine ctor - Create an AArch64 architecture model. /// -AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, +AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -131,7 +130,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, // initialized before TLInfo is constructed. : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS, Options, RM, CM, OL), - TLOF(createTLOF(Triple(getTargetTriple()))), + TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { initAsmInfo(); } @@ -156,28 +155,27 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle); + I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, + isLittle); } return I.get(); } void AArch64leTargetMachine::anchor() { } -AArch64leTargetMachine:: -AArch64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} +AArch64leTargetMachine::AArch64leTargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} void AArch64beTargetMachine::anchor() { } -AArch64beTargetMachine:: -AArch64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} +AArch64beTargetMachine::AArch64beTargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { /// AArch64 Code Generator Pass Configuration Options. @@ -269,7 +267,7 @@ bool AArch64PassConfig::addInstSelector() { // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many // references to _TLS_MODULE_BASE_ as possible. - if (Triple(TM->getTargetTriple()).isOSBinFormatELF() && + if (TM->getTargetTriple().isOSBinFormatELF() && getOptLevel() != CodeGenOpt::None) addPass(createAArch64CleanupLocalDynamicTLSPass()); @@ -324,6 +322,6 @@ void AArch64PassConfig::addPreEmitPass() { // range of their destination. 
addPass(createAArch64BranchRelaxation()); if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && - Triple(TM->getTargetTriple()).isOSBinFormatMachO()) + TM->getTargetTriple().isOSBinFormatMachO()) addPass(createAArch64CollectLOHPass()); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h index ec34fad..8d49a29 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -27,7 +27,7 @@ protected: mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap; public: - AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, + AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool IsLittleEndian); @@ -54,7 +54,7 @@ private: class AArch64leTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: - AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU, + AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); @@ -65,7 +65,7 @@ public: class AArch64beTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: - AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU, + AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index eb05ed9..82bc949 100644 --- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -52,7 +52,7 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) { /// returns zero and isBranch is Success then a symbol look up for /// Address + Value is done and if a symbol is found an MCExpr is created with /// that, else an MCExpr with Address + Value is created. If GetOpInfo() -/// returns zero and isBranch is Fail then the the Opcode of the MCInst is +/// returns zero and isBranch is Fail then the Opcode of the MCInst is /// tested and for ADRP an other instructions that help to load of pointers /// a symbol look up is done to see it is returns a specific reference type /// to add to the comment stream. 
This function returns Success if it adds diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 96fbe3a..7f56c2c 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -1358,7 +1358,7 @@ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, StringRef Name = AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid); if (Valid) - O << StringRef(Name.str()).upper(); + O << Name.upper(); else O << "#" << Val; } diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 15dee97..19544ac 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -181,6 +181,6 @@ public: static const char *getRegisterName(unsigned RegNo, unsigned AltIdx = AArch64::NoRegAltName); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 6c15bf3..3e982ee 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -293,7 +293,7 @@ enum CompactUnwindEncodings { UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 }; -} // end CU namespace +} // namespace CU // FIXME: This should be in a separate file. class DarwinAArch64AsmBackend : public AArch64AsmBackend { @@ -517,14 +517,13 @@ void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); } -} +} // namespace MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { + if (TheTriple.isOSBinFormatMachO()) return new DarwinAArch64AsmBackend(T, MRI); assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); @@ -533,10 +532,9 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, } MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { assert(TheTriple.isOSBinFormatELF() && "Big endian is only supported for ELF targets!"); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 1f516d1..807679f 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -34,7 +34,7 @@ protected: private: }; -} +} // namespace AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian) diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 78837de..bbcbf51 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ 
b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -208,9 +208,9 @@ MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, MCTargetStreamer * createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { - Triple TT(STI.getTargetTriple()); + const Triple &TT = STI.getTargetTriple(); if (TT.getObjectFormat() == Triple::ELF) return new AArch64TargetELFStreamer(S); return nullptr; } -} +} // namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index f89a852..099d1b0 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -41,7 +41,7 @@ static MCInstrInfo *createAArch64MCInstrInfo() { } static MCSubtargetInfo * -createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { +createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); if (CPU.empty()) @@ -60,7 +60,7 @@ static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, const Triple &TheTriple) { MCAsmInfo *MAI; - if (TheTriple.isOSDarwin()) + if (TheTriple.isOSBinFormatMachO()) MAI = new AArch64MCAsmInfoDarwin(); else { assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 4705bdf..ca56f63 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -43,11 +43,11 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createAArch64leAsmBackend(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU); + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); MCAsmBackend *createAArch64beAsmBackend(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU); + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); MCObjectWriter *createAArch64ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, @@ -65,7 +65,7 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, MCTargetStreamer *createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI); -} // End llvm namespace +} // namespace llvm // Defines symbolic names for AArch64 registers. This defines a mapping from // register name to register number. 
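Note: the AsmBackend and MCAsmInfo hunks above also switch their Darwin checks from `isOSDarwin()` to `isOSBinFormatMachO()`. A rough sketch, again with stand-in types rather than LLVM's, of why the object-format predicate is the more precise one for these MC-layer decisions (the assertion in the backend above only distinguishes MachO from ELF):

```cpp
// Illustrative stand-ins: the MachO-vs-ELF choice in the MC layer is about
// which object format gets written, not which OS the triple happens to name.
enum class OS { Darwin, Linux, FreeBSD };
enum class ObjectFormat { MachO, ELF };

struct TripleInfo {
  OS TheOS;
  ObjectFormat Format; // may be set explicitly, independent of the OS default
  bool isOSDarwin() const { return TheOS == OS::Darwin; }
  bool isOSBinFormatMachO() const { return Format == ObjectFormat::MachO; }
};

const char *pickAsmInfo(const TripleInfo &T) {
  // Old check: T.isOSDarwin() -- picks the MachO variant based on the OS.
  // New check: follow the object format the rest of the MC layer will emit.
  return T.isOSBinFormatMachO() ? "MachO-style asm info" : "ELF-style asm info";
}
```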
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 67af810..b2f5bf3 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -38,7 +38,7 @@ public: const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; }; -} +} // namespace bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, @@ -287,7 +287,7 @@ void AArch64MachObjectWriter::recordRelocation( if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) { const MCSection &Sec = Symbol->getSection(); if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) - Asm.addLocalUsedInReloc(*Symbol); + Symbol->setUsedInReloc(); } const MCSymbol *Base = Asm.getAtom(*Symbol); diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 7e42f8e..40071f6 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -346,7 +346,7 @@ namespace AArch64AT { ATMapper(); }; -} +} // namespace AArch64AT namespace AArch64DB { enum DBValues { Invalid = -1, @@ -369,7 +369,7 @@ namespace AArch64DB { DBarrierMapper(); }; -} +} // namespace AArch64DB namespace AArch64DC { enum DCValues { @@ -390,7 +390,7 @@ namespace AArch64DC { DCMapper(); }; -} +} // namespace AArch64DC namespace AArch64IC { enum ICValues { @@ -410,7 +410,7 @@ namespace AArch64IC { static inline bool NeedsRegister(ICValues Val) { return Val == IVAU; } -} +} // namespace AArch64IC namespace AArch64ISB { enum ISBValues { @@ -422,7 +422,7 @@ namespace AArch64ISB { ISBMapper(); }; -} +} // namespace AArch64ISB namespace AArch64PRFM { enum PRFMValues { @@ -452,7 +452,7 @@ namespace AArch64PRFM { PRFMMapper(); }; -} +} // namespace AArch64PRFM namespace AArch64PState { enum PStateValues { @@ -471,7 +471,7 @@ namespace AArch64PState { PStateMapper(); }; -} +} // namespace AArch64PState namespace AArch64SE { enum ShiftExtSpecifiers { @@ -492,7 +492,7 @@ namespace AArch64SE { SXTW, SXTX }; -} +} // namespace AArch64SE namespace AArch64Layout { enum VectorLayout { @@ -514,7 +514,7 @@ namespace AArch64Layout { VL_S, VL_D }; -} +} // namespace AArch64Layout inline static const char * AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) { @@ -1221,7 +1221,7 @@ namespace AArch64SysReg { }; uint32_t ParseGenericRegister(StringRef Name, bool &Valid); -} +} // namespace AArch64SysReg namespace AArch64TLBI { enum TLBIValues { @@ -1283,7 +1283,7 @@ namespace AArch64TLBI { return true; } } -} +} // namespace AArch64TLBI namespace AArch64II { /// Target Operand Flag enum. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h new file mode 100644 index 0000000..0a05d25 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -0,0 +1,148 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H +#define LLVM_LIB_TARGET_R600_AMDGPU_H + +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUInstrPrinter; +class AMDGPUSubtarget; +class AMDGPUTargetMachine; +class FunctionPass; +class MCAsmInfo; +class raw_ostream; +class Target; +class TargetMachine; + +// R600 Passes +FunctionPass *createR600VectorRegMerger(TargetMachine &tm); +FunctionPass *createR600TextureIntrinsicsReplacer(); +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600EmitClauseMarkers(); +FunctionPass *createR600ClauseMergePass(TargetMachine &tm); +FunctionPass *createR600Packetizer(TargetMachine &tm); +FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); +FunctionPass *createAMDGPUCFGStructurizerPass(); + +// SI Passes +FunctionPass *createSITypeRewriter(); +FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSIFoldOperandsPass(); +FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIShrinkInstructionsPass(); +FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSIFixControlFlowLiveIntervalsPass(); +FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRLiveRangesPass(); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); +FunctionPass *createSIInsertWaits(TargetMachine &tm); +FunctionPass *createSIPrepareScratchRegs(); + +void initializeSIFoldOperandsPass(PassRegistry &); +extern char &SIFoldOperandsID; + +void initializeSILowerI1CopiesPass(PassRegistry &); +extern char &SILowerI1CopiesID; + +void initializeSILoadStoreOptimizerPass(PassRegistry &); +extern char &SILoadStoreOptimizerID; + +// Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); +Pass *createAMDGPUStructurizeCFGPass(); +FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +ModulePass *createAMDGPUAlwaysInlinePass(); + +void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); +extern char &SIFixControlFlowLiveIntervalsID; + +void initializeSIFixSGPRLiveRangesPass(PassRegistry&); +extern char &SIFixSGPRLiveRangesID; + + +extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; + +namespace AMDGPU { +enum TargetIndex { + TI_CONSTDATA_START, + TI_SCRATCH_RSRC_DWORD0, + TI_SCRATCH_RSRC_DWORD1, + TI_SCRATCH_RSRC_DWORD2, + TI_SCRATCH_RSRC_DWORD3 +}; +} + +#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" + +} // End namespace llvm + +namespace ShaderType { + enum Type { + PIXEL = 0, + VERTEX = 1, + GEOMETRY = 2, + COMPUTE = 3 + }; +} + +/// OpenCL uses address spaces to differentiate between +/// various memory regions on the hardware. On the CPU +/// all of the address spaces point to the same memory, +/// however on the GPU, each address space points to +/// a separate piece of memory that is unique from other +/// memory locations. +namespace AMDGPUAS { +enum AddressSpaces : unsigned { + PRIVATE_ADDRESS = 0, ///< Address space for private memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory + LOCAL_ADDRESS = 3, ///< Address space for local memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. 
+ PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) + PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) + + // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this + // order to be able to dynamically index a constant buffer, for example: + // + // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + ADDRESS_NONE = 24, ///< Address space for unknown memory. + LAST_ADDRESS = ADDRESS_NONE, + + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u +}; + +} // namespace AMDGPUAS + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td new file mode 100644 index 0000000..2e7e39a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -0,0 +1,266 @@ +//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Subtarget Features +//===----------------------------------------------------------------------===// + +// Debugging Features + +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", + "EnableIRStructurizer", + "false", + "Disable IR Structurizer">; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass">; + +// Target features + +def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", + "EnableIfCvt", + "false", + "Disable the if conversion pass">; + +def FeatureFP64 : SubtargetFeature<"fp64", + "FP64", + "true", + "Enable double precision operations">; + +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64]>; + +def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", + "FastFMAF32", + "true", + "Assuming f32 fma is at least as fast as mul + add", + []>; + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. 
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling">; + +def Feature64BitPtr : SubtargetFeature<"64BitPtr", + "Is64bit", + "true", + "Specify if 64-bit addressing should be used">; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding">; + +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache">; + +def FeatureCaymanISA : SubtargetFeature<"caymanISA", + "CaymanISA", + "true", + "Use Cayman ISA">; + +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug">; + +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass">; + +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + +def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", + "EnableVGPRSpilling", + "true", + "Enable spilling of VGPRs to scratch memory">; + +def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", + "SGPRInitBug", + "true", + "VI SGPR initilization bug requiring a fixed SGPR allocation size">; + +class SubtargetFeatureFetchLimit <string Value> : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value>; + +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast<string>(Value), + "The number of threads per wavefront">; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + +class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < + "ldsbankcount"#Value, + "LDSBankCount", + !cast<string>(Value), + "The number of LDS banks per compute unit.">; + +def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; +def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; + +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes">; + +def FeatureGCN : SubtargetFeature<"gcn", + "IsGCN", + "true", + "GCN or newer GPU">; + +def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", + "GCN1Encoding", + "true", + "Encoding format for SI and CI">; + +def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", + "GCN3Encoding", + "true", + "Encoding format for VI">; + +def FeatureCIInsts : SubtargetFeature<"ci-insts", + "CIInsts", + "true", + "Additional intstructions for CI+">; + +// Dummy feature used to disable assembler instructions. 
+def FeatureDisable : SubtargetFeature<"", + "FeatureDisable","true", + "Dummy feature to disable assembler" + " instructions">; + +class SubtargetFeatureGeneration <string Value, + list<SubtargetFeature> Implies> : + SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, + Value#" GPU generation", Implies>; + +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + +def FeatureR600 : SubtargetFeatureGeneration<"R600", + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; + +def FeatureR700 : SubtargetFeatureGeneration<"R700", + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; + +def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; + +def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; + +def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureLDSBankCount32]>; + +def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, + FeatureGCN1Encoding, FeatureCIInsts]>; + +def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; + +//===----------------------------------------------------------------------===// + +def AMDGPUInstrInfo : InstrInfo { + let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; +} + +def AMDGPUAsmParser : AsmParser { + // Some of the R600 registers have the same name, so this crashes. + // For example T0_XYZW and T0_XY both have the asm name T0. 
+ let ShouldEmitMatchRegisterName = 0; +} + +def AMDGPU : Target { + // Pull in Instruction Info: + let InstructionSet = AMDGPUInstrInfo; + let AssemblyParsers = [AMDGPUAsmParser]; +} + +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Predicate helper class +//===----------------------------------------------------------------------===// + +def TruePredicate : Predicate<"true">; +def isSICI : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" +>, AssemblerPredicate<"FeatureGCN1Encoding">; + +class PredicateControl { + Predicate SubtargetPredicate; + Predicate SIAssemblerPredicate = isSICI; + list<Predicate> AssemblerPredicates = []; + Predicate AssemblerPredicate = TruePredicate; + list<Predicate> OtherPredicates = []; + list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], + AssemblerPredicates, + OtherPredicates); +} + +// Include AMDGPU TD files +include "R600Schedule.td" +include "SISchedule.td" +include "Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUIntrinsics.td" +include "AMDGPURegisterInfo.td" +include "AMDGPUInstructions.td" +include "AMDGPUCallingConv.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp new file mode 100644 index 0000000..0b426bc --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -0,0 +1,67 @@ +//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass marks all internal functions as always_inline and creates +/// duplicates of all other functions a marks the duplicates as always_inline. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +class AMDGPUAlwaysInline : public ModulePass { + + static char ID; + +public: + AMDGPUAlwaysInline() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } +}; + +} // End anonymous namespace + +char AMDGPUAlwaysInline::ID = 0; + +bool AMDGPUAlwaysInline::runOnModule(Module &M) { + + std::vector<Function*> FuncsToClone; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && + !F.hasFnAttribute(Attribute::NoInline)) + FuncsToClone.push_back(&F); + } + + for (Function *F : FuncsToClone) { + ValueToValueMapTy VMap; + Function *NewFunc = CloneFunction(F, VMap, false); + NewFunc->setLinkage(GlobalValue::InternalLinkage); + F->getParent()->getFunctionList().push_back(NewFunc); + F->replaceAllUsesWith(NewFunc); + } + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { + F.addFnAttr(Attribute::AlwaysInline); + } + } + return false; +} + +ModulePass *llvm::createAMDGPUAlwaysInlinePass() { + return new AMDGPUAlwaysInline(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp new file mode 100644 index 0000000..afc6bcb --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -0,0 +1,602 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The AMDGPUAsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. +// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUAsmPrinter.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "AMDGPU.h" +#include "AMDKernelCodeT.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. +// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be override to flush with the -cl-denorms-are-zero compiler flag. 
+// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both. +// +// FIXME: It seems some instructions do not support single precision denormals +// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, +// and sin_f32, cos_f32 on most parts). + +// We want to use these instructions, and using fp32 denormals also causes +// instructions to run at the double precision rate for the device so it's +// probably best to just report no single precision denormals. +static uint32_t getFPMode(const MachineFunction &F) { + const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>(); + // TODO: Is there any real use for the flush in only / flush out only modes? + + uint32_t FP32Denormals = + ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + uint32_t FP64Denormals = + ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP32Denormals) | + FP_DENORM_MODE_DP(FP64Denormals); +} + +static AsmPrinter * +createAMDGPUAsmPrinterPass(TargetMachine &tm, + std::unique_ptr<MCStreamer> &&Streamer) { + return new AMDGPUAsmPrinter(tm, std::move(Streamer)); +} + +extern "C" void LLVMInitializeAMDGPUAsmPrinter() { + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); +} + +AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)) {} + +void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { + + // This label is used to mark the end of the .text section. + const TargetLoweringObjectFile &TLOF = getObjFileLowering(); + OutStreamer->SwitchSection(TLOF.getTextSection()); + MCSymbol *EndOfTextLabel = + OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + OutStreamer->EmitLabel(EndOfTextLabel); +} + +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + // The starting address of all shader programs must be 256 bytes aligned. 
+ MF.setAlignment(8); + + SetupMachineFunction(MF); + + MCContext &Context = getObjFileLowering().getContext(); + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + SIProgramInfo KernelInfo; + if (STM.isAmdHsaOS()) { + getSIProgramInfo(KernelInfo, MF); + EmitAmdKernelCodeT(MF, KernelInfo); + OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1)); + } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + getSIProgramInfo(KernelInfo, MF); + EmitProgramInfoSI(MF, KernelInfo); + } else { + EmitProgramInfoR600(MF); + } + + DisasmLines.clear(); + HexLines.clear(); + DisasmLineMaxLen = 0; + + EmitFunctionBody(); + + if (isVerbose()) { + MCSectionELF *CommentSection = + Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(CommentSection); + + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + OutStreamer->emitRawComment(" Kernel info:", false); + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), + false); + OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), + false); + OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), + false); + OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); + OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), + false); + } else { + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + OutStreamer->emitRawComment( + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); + } + } + + if (STM.dumpCode()) { + + OutStreamer->SwitchSection( + Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); + + for (size_t i = 0; i < DisasmLines.size(); ++i) { + std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; + + OutStreamer->EmitBytes(StringRef(DisasmLines[i])); + OutStreamer->EmitBytes(StringRef(Comment)); + } + } + + return false; +} + +void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const R600RegisterInfo *RI = + static_cast<const R600RegisterInfo *>(STM.getRegisterInfo()); + const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + + // Register with value > 127 aren't GPR + if (HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + + unsigned RsrcReg; + if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + // Evergreen / Northern Islands + switch (MFI->getShaderType()) { + default: // Fall through + case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case ShaderType::VERTEX: RsrcReg = 
R_028860_SQ_PGM_RESOURCES_VS; break; + } + } else { + // R600 / R700 + switch (MFI->getShaderType()) { + default: // Fall through + case ShaderType::GEOMETRY: // Fall through + case ShaderType::COMPUTE: // Fall through + case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + } + } + + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + S_STACK_SIZE(MFI->StackSize), 4); + OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); + OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); + OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); + } +} + +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) const { + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + uint64_t CodeSize = 0; + unsigned MaxSGPR = 0; + unsigned MaxVGPR = 0; + bool VCCUsed = false; + bool FlatUsed = false; + const SIRegisterInfo *RI = + static_cast<const SIRegisterInfo *>(STM.getRegisterInfo()); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: CodeSize should account for multiple functions. + CodeSize += MI.getDesc().Size; + + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + unsigned width = 0; + bool isSGPR = false; + + if (!MO.isReg()) { + continue; + } + unsigned reg = MO.getReg(); + if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || + reg == AMDGPU::VCC_HI) { + VCCUsed = true; + continue; + } else if (reg == AMDGPU::FLAT_SCR || + reg == AMDGPU::FLAT_SCR_LO || + reg == AMDGPU::FLAT_SCR_HI) { + FlatUsed = true; + continue; + } + + switch (reg) { + default: break; + case AMDGPU::SCC: + case AMDGPU::EXEC: + case AMDGPU::M0: + continue; + } + + if (AMDGPU::SReg_32RegClass.contains(reg)) { + isSGPR = true; + width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { + isSGPR = false; + width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + isSGPR = true; + width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(reg)) { + isSGPR = false; + width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(reg)) { + isSGPR = false; + width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(reg)) { + isSGPR = true; + width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(reg)) { + isSGPR = false; + width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(reg)) { + isSGPR = true; + width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(reg)) { + isSGPR = false; + width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(reg)) { + isSGPR = true; + width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(reg)) { + isSGPR = false; + width = 16; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned hwReg = RI->getEncodingValue(reg) & 0xff; + unsigned maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; + } else { + MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; + } + } + } + } + + if (VCCUsed) + MaxSGPR += 2; + + if (FlatUsed) + MaxSGPR += 2; + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. 
+ ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR = MaxSGPR + 1; + + if (STM.hasSGPRInitBug()) { + if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many SGPRs used with the SGPR init bug"); + } + + ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + } + + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; + ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + + ProgInfo.FlatUsed = FlatUsed; + ProgInfo.VCCUsed = VCCUsed; + ProgInfo.CodeLen = CodeSize; + + unsigned LDSAlignShift; + if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + // LDS is allocated in 64 dword blocks. + LDSAlignShift = 8; + } else { + // LDS is allocated in 128 dword blocks. + LDSAlignShift = 9; + } + + unsigned LDSSpillSize = MFI->LDSWaveSpillSize * + MFI->getMaximumWorkGroupSize(MF); + + ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSBlocks = + RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + + // Scratch is allocated in 256 dword blocks. + unsigned ScratchAlignShift = 10; + // We need to program the hardware with the amount of scratch memory that + // is used by the entire wave. ProgInfo.ScratchSize is the amount of + // scratch memory used per thread. + ProgInfo.ScratchBlocks = + RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), + 1 << ScratchAlignShift) >> ScratchAlignShift; + + ProgInfo.ComputePGMRSrc1 = + S_00B848_VGPRS(ProgInfo.VGPRBlocks) | + S_00B848_SGPRS(ProgInfo.SGPRBlocks) | + S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | + S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + + ProgInfo.ComputePGMRSrc2 = + S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | + S_00B84C_TGID_X_EN(1) | + S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); +} + +static unsigned getRsrcReg(unsigned ShaderType) { + switch (ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; + case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + } +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) { + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); + + OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); + + OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 
4); + OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); + + // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = + // 0" comment but I don't see a corresponding field in the register spec. + } else { + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | + S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); + if (STM.isVGPRSpillingEnabled(MFI)) { + OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); + } + } + + if (MFI->getShaderType() == ShaderType::PIXEL) { + OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); + } +} + +void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + amd_kernel_code_t header; + + memset(&header, 0, sizeof(header)); + + header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; + header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; + + header.struct_byte_size = sizeof(amd_kernel_code_t); + + header.target_chip = STM.getAmdKernelCodeChipID(); + + header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); + + header.compute_pgm_resource_registers = + KernelInfo.ComputePGMRSrc1 | + (KernelInfo.ComputePGMRSrc2 << 32); + + // Code Properties: + header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | + AMD_CODE_PROPERTY_IS_PTR64; + + if (KernelInfo.FlatUsed) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + if (KernelInfo.ScratchBlocks) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + + // MFI->ABIArgOffset is the number of bytes for the kernel arguments + // plus 36. 36 is the number of bytes reserved at the begining of the + // input buffer to store work-group size information. + // FIXME: We should be adding the size of the implicit arguments + // to this value. + header.kernarg_segment_byte_size = MFI->ABIArgOffset; + + header.wavefront_sgpr_count = KernelInfo.NumSGPR; + header.workitem_vgpr_count = KernelInfo.NumVGPR; + + // FIXME: What values do I put for these alignments + header.kernarg_segment_alignment = 0; + header.group_segment_alignment = 0; + header.private_segment_alignment = 0; + + header.code_type = 1; // HSA_EXT_CODE_KERNEL + + header.wavefront_size = STM.getWavefrontSize(); + + MCSectionELF *VersionSection = + OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(VersionSection); + OutStreamer->EmitBytes(Twine("HSA Code Unit:" + + Twine(header.hsail_version_major) + "." + + Twine(header.hsail_version_minor) + ":" + + "AMD:" + + Twine(header.amd_code_version_major) + "." 
+ + Twine(header.amd_code_version_minor) + ":" + + "GFX8.1:0").str()); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + + if (isVerbose()) { + OutStreamer->emitRawComment("amd_code_version_major = " + + Twine(header.amd_code_version_major), false); + OutStreamer->emitRawComment("amd_code_version_minor = " + + Twine(header.amd_code_version_minor), false); + OutStreamer->emitRawComment("struct_byte_size = " + + Twine(header.struct_byte_size), false); + OutStreamer->emitRawComment("target_chip = " + + Twine(header.target_chip), false); + OutStreamer->emitRawComment(" compute_pgm_rsrc1: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc1), + false); + OutStreamer->emitRawComment(" compute_pgm_rsrc2: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc2), + false); + OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); + OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); + OutStreamer->emitRawComment("private_element_size = 2 ", false); + OutStreamer->emitRawComment("is_ptr64 = " + + Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); + OutStreamer->emitRawComment("workitem_private_segment_byte_size = " + + Twine(header.workitem_private_segment_byte_size), + false); + OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " + + Twine(header.workgroup_group_segment_byte_size), + false); + OutStreamer->emitRawComment("gds_segment_byte_size = " + + Twine(header.gds_segment_byte_size), false); + OutStreamer->emitRawComment("kernarg_segment_byte_size = " + + Twine(header.kernarg_segment_byte_size), false); + OutStreamer->emitRawComment("wavefront_sgpr_count = " + + Twine(header.wavefront_sgpr_count), false); + OutStreamer->emitRawComment("workitem_vgpr_count = " + + Twine(header.workitem_vgpr_count), false); + OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false); + OutStreamer->emitRawComment("wavefront_size = " + + Twine((int)header.wavefront_size), false); + OutStreamer->emitRawComment("optimization_level = " + + Twine(header.optimization_level), false); + OutStreamer->emitRawComment("hsail_profile = " + + Twine(header.hsail_profile), false); + OutStreamer->emitRawComment("hsail_machine_model = " + + Twine(header.hsail_machine_model), false); + OutStreamer->emitRawComment("hsail_version_major = " + + Twine(header.hsail_version_major), false); + OutStreamer->emitRawComment("hsail_version_minor = " + + Twine(header.hsail_version_minor), false); + } + + OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header))); +} + +bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. 
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'r': + break; + } + } + + AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, + *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h new file mode 100644 index 0000000..9207251 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -0,0 +1,113 @@ +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" +#include <vector> + +namespace llvm { + +class AMDGPUAsmPrinter : public AsmPrinter { +private: + struct SIProgramInfo { + SIProgramInfo() : + VGPRBlocks(0), + SGPRBlocks(0), + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + ScratchSize(0), + ComputePGMRSrc1(0), + LDSBlocks(0), + ScratchBlocks(0), + ComputePGMRSrc2(0), + NumVGPR(0), + NumSGPR(0), + FlatUsed(false), + VCCUsed(false), + CodeLen(0) {} + + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t VGPRBlocks; + uint32_t SGPRBlocks; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + uint32_t ScratchSize; + + uint64_t ComputePGMRSrc1; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks; + uint32_t ScratchBlocks; + + uint64_t ComputePGMRSrc2; + + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t LDSSize; + bool FlatUsed; + + // Bonus information for debugging. + bool VCCUsed; + uint64_t CodeLen; + }; + + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void findNumUsedRegistersSI(const MachineFunction &MF, + unsigned &NumSGPR, + unsigned &NumVGPR) const; + + /// \brief Emit register usage information so that the GPU driver + /// can correctly setup the GPU state. 
+ void EmitProgramInfoR600(const MachineFunction &MF); + void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const; + +public: + explicit AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AMDGPU Assembly Printer"; + } + + /// Implemented in AMDGPUMCInstLower.cpp + void EmitInstruction(const MachineInstr *MI) override; + + void EmitEndOfAsmFile(Module &M) override; + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + +protected: + std::vector<std::string> DisasmLines, HexLines; + size_t DisasmLineMaxLen; +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td new file mode 100644 index 0000000..6ffa7a0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -0,0 +1,82 @@ +//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the AMD Radeon GPUs. +// +//===----------------------------------------------------------------------===// + +// Inversion of CCIfInReg +class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} + +// Calling convention for SI +def CC_SI : CallingConv<[ + + CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21 + ]>>>, + + CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + >>>, + + CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 + ]>>>, + + CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + >>> + +]>; + +// Calling convention for R600 +def CC_R600 : CallingConv<[ + CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ + T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW, + T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW, + T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW, + T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW, + T30_XYZW, T31_XYZW, T32_XYZW + ]>>> +]>; + +// Calling convention for compute kernels +def CC_AMDGPU_Kernel : CallingConv<[ + CCCustom<"allocateStack"> +]>; + +def CC_AMDGPU : CallingConv<[ + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >=" + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()" 
+ "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_SI>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_R600>> +]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp new file mode 100644 index 0000000..8175786 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -0,0 +1,112 @@ +//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface to describe a layout of a stack frame on a AMDIL target machine +// +//===----------------------------------------------------------------------===// +#include "AMDGPUFrameLowering.h" +#include "AMDGPURegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) { } + +AMDGPUFrameLowering::~AMDGPUFrameLowering() { } + +unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { + + // XXX: Hardcoding to 1 for now. + // + // I think the StackWidth should stored as metadata associated with the + // MachineFunction. This metadata can either be added by a frontend, or + // calculated by a R600 specific LLVM IR pass. + // + // The StackWidth determines how stack objects are laid out in memory. + // For a vector stack variable, like: int4 stack[2], the data will be stored + // in the following ways depending on the StackWidth. + // + // StackWidth = 1: + // + // T0.X = stack[0].x + // T1.X = stack[0].y + // T2.X = stack[0].z + // T3.X = stack[0].w + // T4.X = stack[1].x + // T5.X = stack[1].y + // T6.X = stack[1].z + // T7.X = stack[1].w + // + // StackWidth = 2: + // + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T1.X = stack[0].z + // T1.Y = stack[0].w + // T2.X = stack[1].x + // T2.Y = stack[1].y + // T3.X = stack[1].z + // T3.Y = stack[1].w + // + // StackWidth = 4: + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T0.Z = stack[0].z + // T0.W = stack[0].w + // T1.X = stack[1].x + // T1.Y = stack[1].y + // T1.Z = stack[1].z + // T1.W = stack[1].w + return 1; +} + +/// \returns The number of registers allocated for \p FI. +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Start the offset at 2 so we don't overwrite work group information. + // XXX: We should only do this when the shader actually uses this + // information. 
+ unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); + int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); + OffsetBytes += MFI->getObjectSize(i); + // Each register holds 4 bytes, so we must always align the offset to at + // least 4 bytes, so that 2 frame objects won't share the same register. + OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); + } + + if (FI != -1) + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); + + return OffsetBytes / (getStackWidth(MF) * 4); +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return nullptr; +} +void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const {} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} + +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h new file mode 100644 index 0000000..9f31be1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -0,0 +1,45 @@ +//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface to describe a layout of a stack frame on a AMDIL target +/// machine. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +/// \brief Information about the stack frame layout on the AMDGPU targets. +/// +/// It holds the direction of the stack growth, the known stack alignment on +/// entry to each function, and the offset to the locals area. +/// See TargetFrameInfo for more comments. +class AMDGPUFrameLowering : public TargetFrameLowering { +public: + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1); + virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. 
+ unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool hasFP(const MachineFunction &MF) const override; +}; +} // namespace llvm +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp new file mode 100644 index 0000000..df4461e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -0,0 +1,1371 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Defines an instruction selector for the AMDGPU target. +// +//===----------------------------------------------------------------------===// +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "SIDefines.h" +#include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// AMDGPU specific code to select AMDGPU machine instructions for +/// SelectionDAG operations. +class AMDGPUDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can + // make the right decision when generating code for different targets. 
+ const AMDGPUSubtarget *Subtarget; +public: + AMDGPUDAGToDAGISel(TargetMachine &TM); + virtual ~AMDGPUDAGToDAGISel(); + bool runOnMachineFunction(MachineFunction &MF) override; + SDNode *Select(SDNode *N) override; + const char *getPassName() const override; + void PostprocessISelDAG() override; + +private: + bool isInlineImmediate(SDNode *N) const; + bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, + const R600InstrInfo *TII); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); + bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); + + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + + static bool checkType(const Value *ptr, unsigned int addrspace); + static bool checkPrivateAddress(const MachineMemOperand *Op); + + static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + bool isCPLoad(const LoadSDNode *N) const; + bool isConstantLoad(const LoadSDNode *N, int cbID) const; + bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; + bool isParamLoad(const LoadSDNode *N) const; + bool isPrivateLoad(const LoadSDNode *N) const; + bool isLocalLoad(const LoadSDNode *N) const; + bool isRegionLoad(const LoadSDNode *N) const; + + SDNode *glueCopyToM0(SDNode *N) const; + + const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; + bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; + void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, SDValue &Offset, + SDValue &SLC) const; + bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset, SDValue &GLC) const; + SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Omod) const; + bool SelectVOP3Mods0Clamp0OMod(SDValue In, 
SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const; + + SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); + + SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width); + SDNode *SelectS_BFEFromShifts(SDNode *N); + SDNode *SelectS_BFE(SDNode *N); + + // Include the pieces autogenerated from the target description. +#include "AMDGPUGenDAGISel.inc" +}; +} // end anonymous namespace + +/// \brief This pass converts a legalized DAG into a AMDGPU-specific +// DAG, ready for instruction scheduling. +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { + return new AMDGPUDAGToDAGISel(TM); +} + +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) + : SelectionDAGISel(TM) {} + +bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget()); + return SelectionDAGISel::runOnMachineFunction(MF); +} + +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { +} + +bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { + const SITargetLowering *TL + = static_cast<const SITargetLowering *>(getTargetLowering()); + return TL->analyzeImmediate(N) == 0; +} + +/// \brief Determine the register class for \p OpNo +/// \returns The register class of the virtual register that will be used for +/// the given operand number \OpNo or NULL if the register class cannot be +/// determined. +const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, + unsigned OpNo) const { + if (!N->isMachineOpcode()) + return nullptr; + + switch (N->getMachineOpcode()) { + default: { + const MCInstrDesc &Desc = + Subtarget->getInstrInfo()->get(N->getMachineOpcode()); + unsigned OpIdx = Desc.getNumDefs() + OpNo; + if (OpIdx >= Desc.getNumOperands()) + return nullptr; + int RegClass = Desc.OpInfo[OpIdx].RegClass; + if (RegClass == -1) + return nullptr; + + return Subtarget->getRegisterInfo()->getRegClass(RegClass); + } + case AMDGPU::REG_SEQUENCE: { + unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + const TargetRegisterClass *SuperRC = + Subtarget->getRegisterInfo()->getRegClass(RCID); + + SDValue SubRegOp = N->getOperand(OpNo + 1); + unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); + return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, + SubRegIdx); + } + } +} + +bool AMDGPUDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = 
dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + return true; +} + +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(), + AMDGPUAS::LOCAL_ADDRESS)) + return N; + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + // Write max value to m0 before each load operation + + SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), + CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + + SDValue Glue = M0.getValue(1); + + SmallVector <SDValue, 8> Ops; + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + Ops.push_back(N->getOperand(i)); + } + Ops.push_back(Glue); + CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); + + return N; +} + +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. + } + + if (isa<AtomicSDNode>(N)) + N = glueCopyToM0(N); + + switch (Opc) { + default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: + case ISD::SUB: { + if (N->getValueType(0) != MVT::i64 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectADD_SUB_I64(N); + } + case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: + case ISD::BUILD_VECTOR: { + unsigned RegClassID; + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + bool UseVReg = true; + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + U != E; ++U) { + if (!U->isMachineOpcode()) { + continue; + } + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (!RC) { + continue; + } + if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) { + UseVReg = false; + } + } + switch(NumVectorElts) { + case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : + AMDGPU::SReg_32RegClassID; + break; + case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : + AMDGPU::SReg_64RegClassID; + break; + case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : + AMDGPU::SReg_128RegClassID; + break; + case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : + AMDGPU::SReg_256RegClassID; + break; + case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : + AMDGPU::SReg_512RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + } else { + // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. 
We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. + switch(NumVectorElts) { + case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + } + + SDLoc DL(N); + SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + + if (NumVectorElts == 1) { + return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, + N->getOperand(0), RegClass); + } + + assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + "supported yet"); + // 16 = Max Num Vector Elements + // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) + // 1 = Vector Register Class + SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); + + RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + bool IsRegSeq = true; + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { + // XXX: Why is this here? + if (isa<RegisterSDNode>(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[1 + (2 * i)] = N->getOperand(i); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, + MVT::i32); + } + + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. + assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + DL, EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + } + } + + if (!IsRegSeq) + break; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), + RegSeqArgs); + } + case ISD::BUILD_PAIR: { + SDValue RC, SubReg0, SubReg1; + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + break; + } + SDLoc DL(N); + if (N->getValueType(0) == MVT::i128) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); + } else if (N->getValueType(0) == MVT::i64) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + } else { + llvm_unreachable("Unhandled value type for BUILD_PAIR"); + } + const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, + N->getOperand(1), SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + case ISD::Constant: + case ISD::ConstantFP: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + break; + + uint64_t Imm; + if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) + Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); + else { + ConstantSDNode *C = cast<ConstantSDNode>(N); + Imm = C->getZExtValue(); + } + + SDLoc DL(N); + SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, + MVT::i32)); + SDNode *Hi = 
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops); + } + + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(N); + SDLoc SL(N); + EVT VT = N->getValueType(0); + + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { + N = glueCopyToM0(N); + break; + } + + // To simplify the TableGen patters, we replace all i64 loads with + // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 + // during DAG legalization, however, so places (ExpandUnalignedLoad) + // in the DAG legalizer assume that if i64 is legal, so doing this + // promotion early can cause problems. + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); + SDNode *Load = glueCopyToM0(NewLoad.getNode()); + SelectCode(Load); + N = BitCast.getNode(); + break; + } + + case ISD::STORE: { + // Handle i64 stores here for the same reason mentioned above for loads. + StoreSDNode *ST = cast<StoreSDNode>(N); + SDValue Value = ST->getValue(); + if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { + + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), + MVT::v2i32, Value); + SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, + ST->getBasePtr(), ST->getMemOperand()); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); + + if (NewValue.getOpcode() == ISD::BITCAST) { + Select(NewStore.getNode()); + return SelectCode(NewValue.getNode()); + } + + // getNode() may fold the bitcast if its input was another bitcast. If that + // happens we should only select the new store. + N = NewStore.getNode(); + } + + N = glueCopyToM0(N); + break; + } + + case AMDGPUISD::REGISTER_LOAD: { + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + break; + SDValue Addr, Offset; + + SDLoc DL(N); + SelectADDRIndirect(N->getOperand(1), Addr, Offset); + const SDValue Ops[] = { + Addr, + Offset, + CurDAG->getTargetConstant(0, DL, MVT::i32), + N->getOperand(0), + }; + return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, + CurDAG->getVTList(MVT::i32, MVT::i64, + MVT::Other), + Ops); + } + case AMDGPUISD::REGISTER_STORE: { + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + break; + SDValue Addr, Offset; + SelectADDRIndirect(N->getOperand(2), Addr, Offset); + SDLoc DL(N); + const SDValue Ops[] = { + N->getOperand(1), + Addr, + Offset, + CurDAG->getTargetConstant(0, DL, MVT::i32), + N->getOperand(0), + }; + return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, + CurDAG->getVTList(MVT::Other), + Ops); + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + // There is a scalar version available, but unlike the vector version which + // has a separate operand for the offset and width, the scalar version packs + // the width and offset into a single operand. 
Try to move to the scalar + // version if the offsets are constant, so that we can try to keep extended + // loads of kernel arguments in SGPRs. + + // TODO: Technically we could try to pattern match scalar bitshifts of + // dynamic values, but it's probably not useful. + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + bool Signed = Opc == AMDGPUISD::BFE_I32; + + uint32_t OffsetVal = Offset->getZExtValue(); + uint32_t WidthVal = Width->getZExtValue(); + + return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), + N->getOperand(0), OffsetVal, WidthVal); + + } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } + case ISD::CopyToReg: { + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + Lowering.legalizeTargetIndependentNode(N, *CurDAG); + break; + } + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); + case ISD::AND: + case ISD::SRL: + case ISD::SRA: + if (N->getValueType(0) != MVT::i32 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectS_BFE(N); + } + + return SelectCode(N); +} + + +bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { + assert(AS != 0 && "Use checkPrivateAddress instead."); + if (!Ptr) + return false; + + return Ptr->getType()->getPointerAddressSpace() == AS; +} + +bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { + if (Op->getPseudoValue()) + return true; + + if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType())) + return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + return false; +} + +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { + const Value *MemVal = N->getMemOperand()->getValue(); + return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); +} + +bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { + const Value *MemVal = N->getMemOperand()->getValue(); + if (CbId == -1) + return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + + return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); +} + +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { + if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getMemoryVT().bitsLT(MVT::i32)) + return true; + + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool 
AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { + MachineMemOperand *MMO = N->getMemOperand(); + if (checkPrivateAddress(N->getMemOperand())) { + if (MMO) { + const PseudoSourceValue *PSV = MMO->getPseudoValue(); + if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + return true; + } + } + } + return false; +} + +bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { + if (checkPrivateAddress(N->getMemOperand())) { + // Check to make sure we are not a constant pool load or a constant load + // that is marked as a private load + if (isCPLoad(N) || isConstantLoad(N, -1)) { + return false; + } + } + + const Value *MemVal = N->getMemOperand()->getValue(); + if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && + !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { + return true; + } + return false; +} + +const char *AMDGPUDAGToDAGISel::getPassName() const { + return "AMDGPU DAG->DAG Pattern Instruction Selection"; +} + +#ifdef DEBUGTMP +#undef INT64_C +#endif +#undef DEBUGTMP + +//===----------------------------------------------------------------------===// +// Complex Patterns +//===----------------------------------------------------------------------===// + +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), + true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!isa<ConstantSDNode>(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) + && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. 
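On the constant-offset case SelectADDRVTX_READ handles here: an immediate is only moved into the instruction's offset field when it fits the 16-bit signed range checked by isInt<16>. Below is a minimal standalone model of that range check; fitsVTXOffset is an illustrative name, not backend code.

    #include <cstdint>
    #include <cstdio>

    // True when an address immediate can live in the 16-bit VTX offset field;
    // anything wider stays in the base operand with offset 0.
    static bool fitsVTXOffset(int64_t imm) { return imm >= -32768 && imm <= 32767; }

    int main() {
      std::printf("%d\n", fitsVTXOffset(1024));    // 1: fold into the offset
      std::printf("%d\n", fitsVTXOffset(100000));  // 0: keep base, offset = 0
    }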
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + SDLoc(CurDAG->getEntryNode()), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + SDLoc DL(Addr); + + if ((C = dyn_cast<ConstantSDNode>(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } + + return true; +} + +SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + bool IsAdd = (N->getOpcode() == ISD::ADD); + + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); + SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + + SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); + SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + + SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); + SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + + + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + + SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); + SDValue Carry(AddLo, 1); + SDNode *AddHi + = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, + SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + + SDValue Args[5] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, + }; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); +} + +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + SDValue Ops[8]; + + SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const { + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return CurDAG->SignBitIsZero(Base); +} + +bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { + // (add n0, c0) + Base = N0; + Offset = N1; + return true; + } + } + + SDLoc DL(Addr); + + // If we have a constant address, prefer to put the constant into the + // offset. This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. + if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + if (isUInt<16>(CAddr->getZExtValue())) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset = Addr; + return true; + } + } + + // default case + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + SDLoc DL(Addr); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + unsigned DWordOffset0 = C1->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + // (add n0, c0) + if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + Base = N0; + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + + if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + unsigned DWordOffset0 = CAddr->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + assert(4 * DWordOffset0 == CAddr->getZExtValue()); + + if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero + = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + + // default case + Base = Addr; + Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); + return true; +} + +static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { + return isUInt<12>(Imm->getZExtValue()); +} + +void 
AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDLoc DL(Addr); + + GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); + + Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) -> addr64 + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N2; + VAddr = N3; + } else { + + // (add N0, C1) -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = N0; + } + + if (isLegalMUBUFImmOffset(C1)) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return; + } else if (isUInt<32>(C1->getZExtValue())) { + // Illegal offset, store it in soffset. + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), + 0); + return; + } + } + + if (Addr.getOpcode() == ISD::ADD) { + // (add N0, N1) -> addr64 + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N0; + VAddr = N1; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return; + } + + // default case -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const { + SDValue Ptr, Offen, Idxen, Addr64; + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + ConstantSDNode *C = cast<ConstantSDNode>(Addr64); + if (C->getSExtValue()) { + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, + SDValue &SLC) const { + SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); + SDValue GLC, TFE; + + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + unsigned ScratchOffsetReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, + 
ScratchOffsetReg, MVT::i32); + SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); + SDValue ScratchRsrcDword0 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); + + SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); + SDValue ScratchRsrcDword1 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); + + const SDValue RsrcOps[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + ScratchRsrcDword0, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + ScratchRsrcDword1, + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + }; + SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, RsrcOps), 0); + Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); + SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + + // (add n0, c1) + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + + if (isLegalMUBUFImmOffset(C1)) { + VAddr = Addr.getOperand(0); + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } + } + + // (node) + VAddr = Addr; + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &SOffset, SDValue &Offset, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDValue Ptr, VAddr, Offen, Idxen, Addr64; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + if (!cast<ConstantSDNode>(Offen)->getSExtValue() && + !cast<ConstantSDNode>(Idxen)->getSExtValue() && + !cast<ConstantSDNode>(Addr64)->getSExtValue()) { + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | + APInt::getAllOnesValue(32).getZExtValue(); // Size + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset, + SDValue &GLC) const { + SDValue SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} + +// FIXME: This is incorrect and only enough to be able to compile. +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); + SDLoc DL(N); + + assert(Subtarget->hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && + "Cannot cast address space to / from constant address!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. 
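The comment above treats a flat-address cast as a plain reinterpretation of the 64-bit VGPR pair. Here is a hedged scalar model of the three size cases the selector distinguishes (64-to-32 keeps the low half, 32-to-64 zero-extends, same size is a bitcast); castPointer is an illustrative helper, not part of the backend.

    #include <cassert>
    #include <cstdint>

    // Model of the size handling in SelectAddrSpaceCast: a 64-bit flat pointer is
    // a pair of 32-bit registers, so a cast only keeps or extends bits.
    static uint64_t castPointer(uint64_t src, unsigned srcBits, unsigned dstBits) {
      if (srcBits > dstBits) {                 // 64 -> 32: take sub0, the low half
        assert(srcBits == 64 && dstBits == 32);
        return src & 0xffffffffu;
      }
      if (dstBits > srcBits) {                 // 32 -> 64: zero-extend, high half = 0
        assert(srcBits == 32 && dstBits == 64);
        return src & 0xffffffffu;
      }
      return src;                              // same size: plain bitcast
    }

    int main() {
      assert(castPointer(0x1234567800000042ull, 64, 32) == 0x42u);
      assert(castPointer(0x42u, 32, 64) == 0x42u);
    }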
+ + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); + } + + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + + // FIXME: This is probably wrong, we should never be defining + // a register class with both VGPRs and SGPRs + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, + MVT::i32); + + const SDValue Ops[] = { + RC, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(0, DL, MVT::i32)), 0), + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width) { + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + uint32_t PackedVal = Offset | (Width << 16); + SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); + + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { + // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) + // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) + // Predicate: 0 < b <= c < 32 + + const SDValue &Shl = N->getOperand(0); + ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (B && C) { + uint32_t BVal = B->getZExtValue(); + uint32_t CVal = C->getZExtValue(); + + if (0 < BVal && BVal <= CVal && CVal < 32) { + bool Signed = N->getOpcode() == ISD::SRA; + unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), + CVal - BVal, 32 - CVal); + } + } + return SelectCode(N); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { + switch (N->getOpcode()) { + case ISD::AND: + if (N->getOperand(0).getOpcode() == ISD::SRL) { + // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" + // Predicate: isMask(mask) + const SDValue &Srl = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue(); + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), + ShiftVal, WidthVal); + } + } + } + break; + case ISD::SRL: + if (N->getOperand(0).getOpcode() == ISD::AND) { + // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" + // Predicate: isMask(mask >> b) + const SDValue &And = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), + ShiftVal, WidthVal); + } + } + } else if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + case ISD::SRA: + if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + } + + return SelectCode(N); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + + unsigned Mods = 0; + + Src = In; + + if (Src.getOpcode() == ISD::FNEG) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } + + if (Src.getOpcode() == ISD::FABS) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + SDLoc DL(In); + // FIXME: Handle Clamp and Omod + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Omod) const { + // FIXME: Handle Omod + Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const { + Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + return SelectVOP3Mods(In, Src, SrcMods); +} + +void AMDGPUDAGToDAGISel::PostprocessISelDAG() { + const AMDGPUTargetLowering& Lowering = + *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); + bool IsModified = false; + do { + IsModified = false; + // Go over all selected nodes and try to fold them a bit more + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ++I) { + + SDNode *Node = I; + + MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I); + if (!MachineNode) + continue; + + SDNode 
*ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); + if (ResNode != Node) { + ReplaceUses(Node, ResNode); + IsModified = true; + } + } + CurDAG->RemoveDeadNodes(); + } while (IsModified); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp new file mode 100644 index 0000000..570473d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -0,0 +1,2866 @@ +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is the parent TargetLowering class for hardware code gen +/// targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600MachineFunctionInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" + +using namespace llvm; + +namespace { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} // namespace + + +static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + + return true; +} + +#include "AMDGPUGenCallingConv.inc" + +// Find a larger type to do a load / store of a vector with. +EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, StoreSize); + + assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +// Type for a vector that will be loaded to. 
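A short sketch of the size rule that getEquivalentMemType above (and getEquivalentLoadRegType below) implement: a memory type of at most 32 bits maps to a single integer type, and anything wider maps to a vector of i32. equivalentMemType below is an illustrative stand-in that only names the resulting type.

    #include <cassert>
    #include <cstdio>
    #include <string>

    // <=32 bits -> iN; otherwise v(N/32)i32, mirroring the helper above.
    static std::string equivalentMemType(unsigned storeSizeBits) {
      if (storeSizeBits <= 32)
        return "i" + std::to_string(storeSizeBits);
      assert(storeSizeBits % 32 == 0 && "store size not a multiple of 32");
      return "v" + std::to_string(storeSizeBits / 32) + "i32";
    }

    int main() {
      std::printf("%s\n", equivalentMemType(16).c_str());   // i16
      std::printf("%s\n", equivalentMemType(128).c_str());  // v4i32
    }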
+EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, 32); + + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. + setOperationAction(ISD::FSUB, MVT::f64, Expand); + + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::STORE, MVT::f32, Promote); + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + + setOperationAction(ISD::STORE, MVT::v2f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v8f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v16f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::f64, Promote); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + + setOperationAction(ISD::STORE, MVT::v2f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + + // Custom lowering of vector stores is required for local address space + // stores. + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + + // XXX: This can be change to Custom, once ExpandVectorStores can + // handle 64-bit stores. 
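The Promote entries around this point lower floating-point loads and stores to same-width integer operations purely to cut down on TableGen patterns; the bytes in memory are untouched and only the type carried by the DAG changes. A minimal sketch of that bit-preserving round trip, using std::memcpy for the reinterpretation (illustrative, not backend code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // Storing f32 as i32 writes exactly the same four bytes.
      float f = 3.5f;
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));   // "store f32 promoted to i32"
      float g;
      std::memcpy(&g, &bits, sizeof(g));      // "load i32, reinterpret as f32"
      assert(g == f);
    }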
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); + + + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::LOAD, MVT::v16f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } + + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + + if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + } + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. 
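On the note just above that fcopysign collapses to a single bitfield-insert (BFI) when the target has one: the instruction only has to splice the sign bit of one operand into the magnitude bits of the other. A scalar sketch of that bit manipulation for f32 follows; copysignBFI is an illustrative name.

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // copysign(x, y) as a bitfield insert: 31 magnitude bits from x, sign from y.
    static float copysignBFI(float x, float y) {
      uint32_t xb, yb;
      std::memcpy(&xb, &x, 4);
      std::memcpy(&yb, &y, 4);
      uint32_t r = (xb & 0x7fffffffu) | (yb & 0x80000000u);
      float out;
      std::memcpy(&out, &r, 4);
      return out;
    }

    int main() {
      assert(copysignBFI(2.0f, -1.0f) == std::copysign(2.0f, -1.0f));
      assert(copysignBFI(-2.0f, 1.0f) == 2.0f);
    }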
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + + // GPU does not have divrem function for signed or unsigned. + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction. + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // The hardware supports 32-bit ROTR, but not ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + + setOperationAction(ISD::SMIN, MVT::i32, Legal); + setOperationAction(ISD::UMIN, MVT::i32, Legal); + setOperationAction(ISD::SMAX, MVT::i32, Legal); + setOperationAction(ISD::UMAX, MVT::i32, Legal); + + if (!Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + + if (!Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + + static const MVT::SimpleValueType VectorIntTypes[] = { + MVT::v2i32, MVT::v4i32 + }; + + for (MVT VT : VectorIntTypes) { + // Expand the following operations for the current type by default. 
+ setOperationAction(ISD::ADD, VT, Expand); + setOperationAction(ISD::AND, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::OR, VT, Expand); + setOperationAction(ISD::SHL, VT, Expand); + setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::SUB, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::XOR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + static const MVT::SimpleValueType FloatVectorTypes[] = { + MVT::v2f32, MVT::v4f32 + }; + + for (MVT VT : FloatVectorTypes) { + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FMINNUM, VT, Expand); + setOperationAction(ISD::FMAXNUM, VT, Expand); + setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + 
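setBooleanContents(ZeroOrNegativeOneBooleanContent), set just above, declares that a true value is the all-ones pattern rather than 1, which is what lets selects lower to plain bitwise operations. A small sketch of a mask-based select under that convention; selectByMask is illustrative only.

    #include <cassert>
    #include <cstdint>

    // With 0 / -1 booleans, select(cond, a, b) is three bitwise operations.
    static uint32_t selectByMask(bool cond, uint32_t a, uint32_t b) {
      uint32_t mask = cond ? 0xffffffffu : 0u;   // "true" is all ones, not 1
      return (mask & a) | (~mask & b);
    }

    int main() {
      assert(selectByMask(true, 7, 9) == 7);
      assert(selectByMask(false, 7, 9) == 9);
    }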
setSchedulingPreference(Sched::RegPressure); + setJumpIsExpensive(true); + + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(false); + + setSelectIsExpensive(false); + PredictableSelectIsExpensive = false; + + // There are no integer divide instructions, and these expand to a pretty + // large sequence of instructions. + setIntDivIsCheap(false); + setPow2SDivIsCheap(false); + setFsqrtIsCheap(true); + + // FIXME: Need to really handle these. + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; +} + +//===----------------------------------------------------------------------===// +// Target Information +//===----------------------------------------------------------------------===// + +MVT AMDGPUTargetLowering::getVectorIdxTy() const { + return MVT::i32; +} + +bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { + return true; +} + +// The backend supports 32 and 64 bit floating point immediates. +// FIXME: Why are we reporting vectors of FP immediates as legal? +bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); +} + +// We don't want to shrink f64 / f32 constants. +bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); +} + +bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, + ISD::LoadExtType, + EVT NewVT) const { + + unsigned NewSize = NewVT.getStoreSizeInBits(); + + // If we are reducing to a 32-bit load, this is always better. + if (NewSize == 32) + return true; + + EVT OldVT = N->getValueType(0); + unsigned OldSize = OldVT.getStoreSizeInBits(); + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. + + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (OldSize < 32); +} + +bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, + EVT CastTy) const { + if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) + return true; + + unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); + unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); + + return ((LScalarSize <= CastScalarSize) || + (CastScalarSize >= 32) || + (LScalarSize < 32)); +} + +// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also +// profitable with the expansion for 64-bit since it's generally good to +// speculate things. +// FIXME: These should really have the size as a parameter. 
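Restating the shouldReduceLoadWidth policy above as a standalone predicate: narrowing to exactly 32 bits is always accepted, and any further narrowing is accepted only when the original load was already below 32 bits (i.e. already an extload). The helper below is a sketch under those two rules, not the backend function itself.

    #include <cassert>

    // Mirror of the two rules in shouldReduceLoadWidth above.
    static bool shouldNarrowLoad(unsigned oldBits, unsigned newBits) {
      if (newBits == 32)     // reducing to a 32-bit load is always better
        return true;
      return oldBits < 32;   // otherwise only keep shrinking sub-32-bit loads
    }

    int main() {
      assert(shouldNarrowLoad(64, 32));    // 64 -> 32: take it
      assert(!shouldNarrowLoad(64, 16));   // 64 -> 16: would become an extload
      assert(shouldNarrowLoad(16, 8));     // already sub-32-bit: no extra harm
    }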
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { + return true; +} + +bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { + return true; +} + +//===---------------------------------------------------------------------===// +// Target Properties +//===---------------------------------------------------------------------===// + +bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const { + return true; +} + +bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { + // Truncate is just accessing a subregister. + return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { + // Truncate is just accessing a subregister. + return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && + (Dest->getPrimitiveSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { + const DataLayout *DL = getDataLayout(); + unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); + unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); + + return SrcSize == 32 && DestSize == 64; +} + +bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { + // Any register load of a 64-bit value really requires 2 32-bit moves. For all + // practical purposes, the extra mov 0 to load a 64-bit is free. As used, + // this will enable reducing 64-bit operations to 32-bit, which is always + // good. + return Src == MVT::i32 && Dest == MVT::i64; +} + +bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + return isZExtFree(Val.getValueType(), VT2); +} + +bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { + // There aren't really 64-bit registers, but pairs of 32-bit ones and only a + // limited number of native 64-bit operations. Shrinking an operation to fit + // in a single 32-bit register should always be helpful. As currently used, + // this is much less general than the name suggests, and is only used in + // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is + // not profitable, and may actually be harmful.
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; +} + +//===---------------------------------------------------------------------===// +// TargetLowering Callbacks +//===---------------------------------------------------------------------===// + +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { + + State.AnalyzeFormalArguments(Ins, CC_AMDGPU); +} + +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +} + +//===---------------------------------------------------------------------===// +// Target specific lowering +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName("<unknown>"); + + if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) + FuncName = G->getSymbol(); + else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + llvm_unreachable("Custom lowering code for this" + "instruction is not implemented yet!"); + break; + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FREM: return LowerFREM(Op, DAG); + case ISD::FCEIL: return LowerFCEIL(Op, DAG); + case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); + case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); + case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + } + return Op; +} + +void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND_INREG: + // Different parts of legalization seem to interpret which type of + // sign_extend_inreg is the one to check for custom lowering. The extended + // from type is what really matters, but some places check for custom + // lowering of the result type. This results in trying to use + // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do + // nothing here and let the illegal result integer be handled normally. 
+ return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + if (!Node) + return; + + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: { + SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); + if (Lowered.getNode()) + Results.push_back(Lowered); + return; + } + default: + return; + } +} + +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. +SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, + const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const { + const DataLayout *TD = getDataLayout(); + SDLoc DL(InitPtr); + Type *InitTy = Init->getType(); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { + EVT VT = EVT::getEVT(CFP->getType()); + PointerType *PtrTy = PointerType::get(CFP->getType(), 0); + return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(CFP->getType())); + } + + if (StructType *ST = dyn_cast<StructType>(InitTy)) { + const StructLayout *SL = TD->getStructLayout(ST); + + EVT PtrVT = InitPtr.getValueType(); + SmallVector<SDValue, 8> Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { + EVT PtrVT = InitPtr.getValueType(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); + SmallVector<SDValue, 8> Chains; + for (unsigned i = 0; i < NumElements; ++i) { + SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, 
Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (isa<UndefValue>(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); +} + +static bool hasDefinedInitializer(const GlobalValue *GV) { + const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + if (!GVar || !GVar->hasInitializer()) + return false; + + if (isa<UndefValue>(GVar->getInitializer())) + return false; + + return true; +} + +SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, + SDValue Op, + SelectionDAG &DAG) const { + + const DataLayout *TD = getDataLayout(); + GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); + const GlobalValue *GV = G->getGlobal(); + + switch (G->getAddressSpace()) { + case AMDGPUAS::LOCAL_ADDRESS: { + // XXX: What does the value of G->getOffset() mean? + assert(G->getOffset() == 0 && + "Do not know what to do with an non-zero offset"); + + // TODO: We could emit code to handle the initialization somewhere. + if (hasDefinedInitializer(GV)) + break; + + unsigned Offset; + if (MFI->LocalMemoryObjects.count(GV) == 0) { + uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); + Offset = MFI->LDSSize; + MFI->LocalMemoryObjects[GV] = Offset; + // XXX: Account for alignment? + MFI->LDSSize += Size; + } else { + Offset = MFI->LocalMemoryObjects[GV]; + } + + return DAG.getConstant(Offset, SDLoc(Op), + getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + Type *EltType = GV->getType()->getElementType(); + unsigned Size = TD->getTypeAllocSize(EltType); + unsigned Alignment = TD->getPrefTypeAlignment(EltType); + + MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); + MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + + int FI = FrameInfo->CreateStackObject(Size, Alignment, false); + SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); + + const GlobalVariable *Var = cast<GlobalVariable>(GV); + if (!Var->hasInitializer()) { + // This has no use, but bugpoint will hit it. 
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + + const Constant *Init = Var->getInitializer(); + SmallVector<SDNode*, 8> WorkList; + + for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), + E = DAG.getEntryNode()->use_end(); I != E; ++I) { + if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) + continue; + WorkList.push_back(*I); + } + SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); + for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { + Ops.push_back((*I)->getOperand(i)); + } + DAG.UpdateNodeOperands(*I, Ops); + } + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + } + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadInit(Fn, + "initializer for address space"); + DAG.getContext()->diagnose(BadInit); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + SmallVector<SDValue, 8> Args; + + for (const SDUse &U : Op->ops()) + DAG.ExtractVectorElements(U.get(), Args); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + + SmallVector<SDValue, 8> Args; + unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); + + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), + Op.getValueType()); +} + +SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + switch (IntrinsicID) { + default: return Op; + case AMDGPUIntrinsic::AMDGPU_abs: + case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. + return LowerIntrinsicIABS(Op, DAG); + case AMDGPUIntrinsic::AMDGPU_lrp: + return LowerIntrinsicLRP(Op, DAG); + + case AMDGPUIntrinsic::AMDGPU_clamp: + case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. 
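// Illustration: for a call like div_scale(n, d, true) the constant selects the
// numerator, so the node built below is DIV_SCALE(n, d, n); passing false
// selects the denominator, giving DIV_SCALE(d, d, n).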
+ + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } else { + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + } + + case Intrinsic::AMDGPU_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imax: + return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umax: + return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imin: + return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umin: + return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umul24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imul24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_bfe_i32: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfe_u32: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfi: + return 
DAG.getNode(AMDGPUISD::BFI, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfm: + return DAG.getNode(AMDGPUISD::BFM, DL, VT, + Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_brev: + return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. + return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + } +} + +///IABS(a) = SMAX(sub(0, a), a) +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Op.getOperand(1)); + + return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); +} + +/// Linear Interpolation +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, + DAG.getConstantFP(1.0f, DL, MVT::f32), + Op.getOperand(1)); + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, + Op.getOperand(3)); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); +} + +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + switch (CCOpcode) { + case ISD::SETOEQ: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETUEQ: + case ISD::SETEQ: + case ISD::SETFALSE: + case ISD::SETFALSE2: + case ISD::SETTRUE: + case ISD::SETTRUE2: + case ISD::SETUO: + case ISD::SETO: + break; + case ISD::SETULE: + case ISD::SETULT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: { + // Ordered. Assume ordered for undefined. + + // Only do this after legalization to avoid interfering with other combines + // which might occur. + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + // We need to permute the operands to get the correct NaN behavior. The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. 
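// e.g. an ordered (a < b) ? a : b maps to FMIN_LEGACY(a, b) just below, while
// the unordered cases above swap the operands so that a NaN input still yields
// the value the original select would have chosen.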
+ if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETOGE: + case ISD::SETOGT: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETCC_INVALID: + llvm_unreachable("Invalid setcc condcode!"); + } + return SDValue(); +} + +// FIXME: Remove this when combines added to DAGCombiner. +SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + switch (CCOpcode) { + case ISD::SETULE: + case ISD::SETULT: { + unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETLE: + case ISD::SETLT: { + unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: { + unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + default: + return SDValue(); + } +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast<LoadSDNode>(Op); + EVT MemVT = Load->getMemoryVT(); + EVT MemEltVT = MemVT.getVectorElementType(); + + EVT LoadVT = Op.getValueType(); + EVT EltVT = LoadVT.getVectorElementType(); + EVT PtrVT = Load->getBasePtr().getValueType(); + + unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); + SmallVector<SDValue, 8> Loads; + SmallVector<SDValue, 8> Chains; + + SDLoc SL(Op); + unsigned MemEltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), + DAG.getConstant(i * MemEltSize, SL, PtrVT)); + + SDValue NewLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, + Load->getChain(), Ptr, + SrcValue.getWithOffset(i * MemEltSize), + MemEltVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + Loads.push_back(NewLoad.getValue(0)); + Chains.push_back(NewLoad.getValue(1)); + } + + SDValue Ops[] = { + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
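// e.g. splitting a <2 x i32> load here would create two <1 x i32> loads;
// scalarizing instead produces two plain i32 loads.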
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorLoad(Op, DAG); + + LoadSDNode *Load = cast<LoadSDNode>(Op); + SDValue BasePtr = Load->getBasePtr(); + EVT PtrVT = BasePtr.getValueType(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + SDValue LoLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, + SrcValue, + LoMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); + + SDValue HiLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, + Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue Ops[] = { + DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1)) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT MemVT = Store->getMemoryVT(); + unsigned MemBits = MemVT.getSizeInBits(); + + // Byte stores are really expensive, so if possible, try to pack 32-bit vector + // truncating store into an i32 store. + // XXX: We could also handle optimize other vector bitwidths. + if (!MemVT.isVector() || MemBits > 32) { + return SDValue(); + } + + SDLoc DL(Op); + SDValue Value = Store->getValue(); + EVT VT = Value.getValueType(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); + EVT MemEltVT = MemVT.getVectorElementType(); + unsigned MemEltBits = MemEltVT.getSizeInBits(); + unsigned MemNumElements = MemVT.getVectorNumElements(); + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); + + SDValue PackedValue; + for (unsigned i = 0; i < MemNumElements; ++i) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, + DAG.getConstant(i, DL, MVT::i32)); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); + + if (i == 0) { + PackedValue = Elt; + } else { + PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); + } + } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + Store->isVolatile(), Store->isNonTemporal(), + Store->getAlignment()); +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT MemEltVT = 
Store->getMemoryVT().getVectorElementType(); + EVT EltVT = Store->getValue().getValueType().getVectorElementType(); + EVT PtrVT = Store->getBasePtr().getValueType(); + unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); + SDLoc SL(Op); + + SmallVector<SDValue, 8> Chains; + + unsigned EltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Store->getValue(), + DAG.getConstant(i, SL, MVT::i32)); + + SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); + SDValue NewStore = + DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, + SrcValue.getWithOffset(i * EltSize), + MemEltVT, Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + Chains.push_back(NewStore); + } + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); +} + +SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Val = Store->getValue(); + EVT VT = Val.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. + if (VT.getVectorNumElements() == 2) + return ScalarizeVectorStore(Op, DAG); + + EVT MemVT = Store->getMemoryVT(); + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + SDLoc SL(Op); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + + EVT PtrVT = BasePtr.getValueType(); + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); + + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + SDValue LoStore + = DAG.getTruncStore(Chain, SL, Lo, + BasePtr, + SrcValue, + LoMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + SDValue HiStore + = DAG.getTruncStore(Chain, SL, Hi, + HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); +} + + +SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT VT = Op.getValueType(); + EVT MemVT = Load->getMemoryVT(); + + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. 
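// i.e. the i1 load below is emitted as an any-extending i8 load into an i32,
// then truncated back to i1, with the original chain preserved through the
// merged values.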
+ + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || + Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || + ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) + return SDValue(); + + // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, + // register (2-)byte extract. + + // Get Register holding the target. + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register. + SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + + // Get offset within the register. + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + // Shift to the right. + Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + // Eliminate the upper bits by setting them to ... + EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. + if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); + } + + // ... or zeros. 
+ SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); + if (Result.getNode()) { + return Result; + } + + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Chain = Store->getChain(); + if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + Store->getValue().getValueType().isVector()) { + return ScalarizeVectorStore(Op, DAG); + } + + EVT MemVT = Store->getMemoryVT(); + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && + MemVT.bitsLT(MVT::i32)) { + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + SDValue BasePtr = Store->getBasePtr(); + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, DL, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, DL, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + } + return SDValue(); +} + +// This is a shortcut for integer division because we have fast i32<->f32 +// conversions, and fast f32 reciprocal instructions. The fractional part of a +// float is enough to accurately represent up to a 24-bit integer. +SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT IntVT = MVT::i32; + MVT FltVT = MVT::f32; + + ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; + + if (VT.isVector()) { + unsigned NElts = VT.getVectorNumElements(); + IntVT = MVT::getVectorVT(MVT::i32, NElts); + FltVT = MVT::getVectorVT(MVT::f32, NElts); + } + + unsigned BitSize = VT.getScalarType().getSizeInBits(); + + SDValue jq = DAG.getConstant(1, DL, IntVT); + + if (sign) { + // char|short jq = ia ^ ib; + jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, VT, jq, + DAG.getConstant(BitSize - 2, DL, VT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, IntVT); + } + + // int ia = (int)LHS; + SDValue ia = sign ? 
+ DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); + + // int ib, (int)RHS; + SDValue ib = sign ? + DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, + DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); + + // int cv = fr >= fb; + SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); + + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); + + // dst = trunc/extend to legal type + iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); + + // dst = iq + jq; + SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); + + // Rem needs compensation, it's easier to recompute it + SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); + Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const { + assert(Op.getValueType() == MVT::i64); + + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, DL, HalfVT); + SDValue zero = DAG.getConstant(0, DL, HalfVT); + + //HiLo split + SDValue LHS = Op.getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = Op.getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + if (VT == MVT::i64 && + DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + + SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); + Results.push_back(DIV); + Results.push_back(REM); + return; + } + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + const unsigned bitPos = halfBitWidth - i - 1; + SDValue POS = 
DAG.getConstant(bitPos, DL, HalfVT); + // Get value of high bit + SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); + + // Shift + REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); + // Add LHS high bit + REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); + + SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); + } + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); +} + +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (VT == MVT::i64) { + SmallVector<SDValue, 2> Results; + LowerUDIVREM64(Op, DAG, Results); + return DAG.getMergeValues(Results, DL); + } + + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + + if (VT == MVT::i32) { + if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && + DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) { + // TODO: We technically could do this for i64, but shouldn't that just be + // handled by something generally reducing 64-bit division on 32-bit + // values to 32-bit? + return LowerDIVREM24(Op, DAG, false); + } + } + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. + SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); + + // RCP_LO = mul(RCP, Den) */ + SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); + + // RCP_HI = mulhu (RCP, Den) */ + SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), + NEG_RCP_LO, RCP_LO, + ISD::SETEQ); + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); + + // RCP_S_E = RCP - E + SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), + RCP_A_E, RCP_S_E, + ISD::SETEQ); + // Quotient = mulhu(Tmp0, Num) + SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); + + // Remainder = Num - Num_S_Remainder + SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), + ISD::SETUGE); + // Remainder_GE_Zero = (Num >= Num_S_Remainder ? 
-1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, + Num_S_Remainder, + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), + ISD::SETUGE); + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, + Remainder_GE_Zero); + + // Calculate Division result: + + // Quotient_A_One = Quotient + 1 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Quotient_S_One = Quotient - 1 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Quotient, Quotient_A_One, ISD::SETEQ); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Quotient_S_One, Div, ISD::SETEQ); + + // Calculate Rem result: + + // Remainder_S_Den = Remainder - Den + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); + + // Remainder_A_Den = Remainder + Den + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Remainder, Remainder_S_Den, ISD::SETEQ); + + // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Remainder_A_Den, Rem, ISD::SETEQ); + SDValue Ops[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue NegOne = DAG.getConstant(-1, DL, VT); + + if (VT == MVT::i32 && + DAG.ComputeNumSignBits(LHS) > 8 && + DAG.ComputeNumSignBits(RHS) > 8) { + return LowerDIVREM24(Op, DAG, true); + } + if (VT == MVT::i64 && + DAG.ComputeNumSignBits(LHS) > 32 && + DAG.ComputeNumSignBits(RHS) > 32) { + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + //HiLo split + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + SDValue Res[2] = { + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) + }; + return DAG.getMergeValues(Res, DL); + } + + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); + SDValue RSign = LHSign; // Remainder sign is the same as LHS + + LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); + + LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); + + SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); + SDValue Rem = Div.getValue(1); + + Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); + + Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); + + SDValue Res[2] = { + Div, + Rem + }; + return 
DAG.getMergeValues(Res, DL); +} + +// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) +SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); + SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); + + return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); +} + +SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, SL, MVT::i32), + DAG.getConstant(ExpBits, SL, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, SL, MVT::i32)); + + return Exp; +} + +SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + // Extract the upper half, since this is where we will find the sign and + // exponent. + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const unsigned FractBits = 52; + + // Extract the sign bit. + const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); + SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); + + // Extend back to 64 bits.
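// (Note: SignBit was taken from the high 32-bit word, so building the v2i32
// (0, SignBit) and bitcasting to i64 below places it back at bit 63, with the
// low word cleared, on this little-endian target.)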
+ SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + Zero, SignBit); + SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); + + SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); + const SDValue FractMask + = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); + + SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); + SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + + SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); + + return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); + SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); + SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); + SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); + + SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); + + APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); + SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); + + return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { + // FNEARBYINT and FRINT are the same, except in their handling of FP + // exceptions. Those aren't really meaningful for us, and OpenCL only has + // rint, so just treat them as equivalent. + return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); +} + +// XXX - May require not supporting f32 denormals? 
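// Worked example for the f32 path below (illustrative): x = -2.5 gives
// T = -2.0 and |x - T| = 0.5, so copysign(1.0, x) = -1.0 is added and the
// result is -3.0 (round half away from zero); x = 2.25 adds 0.0 and rounds
// to 2.0.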
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, + MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), SL, + MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, SL, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, SL, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, SL, MVT::f64), + DAG.getConstantFP(0.0, SL, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + +SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src); + // if (src < 0.0 && src != result) + // result += -1.0. 
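// e.g. src = -1.25: trunc is -1.0, the test holds, and adding -1.0 gives
// floor(-1.25) = -2.0; for src = 1.25 the test fails and trunc(1.25) = 1.0 is
// already the floor.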
+ + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(0, SL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(1, SL, MVT::i32)); + + SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, + SL, MVT::f64, Hi); + + SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); + + SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + DAG.getConstant(32, SL, MVT::i32)); + + return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); +} + +SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue S0 = Op.getOperand(0); + if (S0.getValueType() != MVT::i64) + return SDValue(); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, false); + + assert(DestVT == MVT::f32); + + SDLoc DL(Op); + + // f32 uint_to_fp i64 + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(0, DL, MVT::i32)); + SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(1, DL, MVT::i32)); + SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); + FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, + DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 + return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); +} + +SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + + SDValue Src = Op.getOperand(0); + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, + MVT::f64); + SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, + MVT::f64); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); + + SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); + + + SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); + + SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, + MVT::i32, FloorMul); + SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); + + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, false); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { + EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + MVT VT = Op.getSimpleValueType(); + MVT ScalarVT = VT.getScalarType(); + + if (!VT.isVector()) + return SDValue(); + + SDValue Src = Op.getOperand(0); + SDLoc DL(Op); + + // TODO: Don't scalarize on Evergreen? + unsigned NElts = VT.getVectorNumElements(); + SmallVector<SDValue, 8> Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); + + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, KnownZero, KnownOne); + + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} + +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} + +template <typename IntTy> +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width, SDLoc DL) { + if (Width + Offset < 32) { + uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); + IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); + return DAG.getConstant(Result, DL, MVT::i32); + } + + return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); +} + +static bool usesAllNormalStores(SDNode *LoadVal) { + for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { + if (!ISD::isNormalStore(*I)) + return false; + } + + return true; +} + +// If we have a copy of an illegal type, replace it with a load / store of an +// equivalently sized legal type. 
This avoids intermediate bit pack / unpack +// instructions emitted when handling extloads and truncstores. Ideally we could +// recognize the pack / unpack pattern to eliminate it. +SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *SN = cast<StoreSDNode>(N); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + + if (isTypeLegal(VT) || SN->isVolatile() || + !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) + return SDValue(); + + LoadSDNode *LoadVal = cast<LoadSDNode>(Value); + if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + return SDValue(); + + EVT MemVT = LoadVal->getMemoryVT(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); + + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + LoadVT, SL, + LoadVal->getChain(), + LoadVal->getBasePtr(), + LoadVal->getOffset(), + LoadVT, + LoadVal->getMemOperand()); + + SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); + DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + + return DAG.getStore(SN->getChain(), SL, NewLoad, + SN->getBasePtr(), SN->getMemOperand()); +} + +SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + return SDValue(); + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. + return DAG.getSExtOrTrunc(Mul, DL, VT); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: + return performMulCombine(N, DCI); + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + case ISD::SELECT: { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32) + return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + + // TODO: Implement min / max Evergreen instructions. 
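performMulCombine above only fires when both operands are provably 24-bit, as derived from DAG known-bits or sign-bits; the sketch below is a rough standalone analogy that applies the same test to concrete constants and models the MUL_U24 semantics (helper names and sample values are illustrative only, this is not the pass itself):

#include <cassert>
#include <cstdint>

// Number of significant bits in a concrete value; the combine derives the
// same quantity from computeKnownBits instead of a known constant.
static unsigned significantBits(uint32_t V) {
  unsigned Bits = 0;
  for (; V; V >>= 1)
    ++Bits;
  return Bits;
}

// Analogue of isU24: eligible for MUL_U24 when at most 24 bits are needed.
static bool fitsUnsigned24(uint32_t V) { return significantBits(V) <= 24; }

// MUL_U24 semantics: the top 8 bits of each operand are ignored and a full
// 32-bit product is produced.
static uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xffffff) * (B & 0xffffff);
}

int main() {
  assert(fitsUnsigned24(0x00abcdef));   // 24 significant bits: eligible
  assert(!fitsUnsigned24(0x01000000));  // 25 significant bits: not eligible
  assert(mulU24(0x1000, 0x1000) == 0x1000000);
  assert(mulU24(0xff000002, 3) == 6);   // high 8 bits of the operand ignored
  return 0;
}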
+ if (VT == MVT::i32 && + Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } + } + + break; + } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, DL, MVT::i32); + + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. + + // TODO: The sext_inreg of extended types ends, although we can could + // handle them in a single BFE. + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { + if (Signed) { + return constantFoldBFE<int32_t>(DAG, + CVal->getSExtValue(), + OffsetVal, + WidthVal, + DL); + } + + return constantFoldBFE<uint32_t>(DAG, + CVal->getZExtValue(), + OffsetVal, + WidthVal, + DL); + } + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); + return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + if (BitsFrom.hasOneUse()) { + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, + KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + } + + break; + } + + case ISD::STORE: + return performStoreCombine(N, DCI); + } + return SDValue(); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +void AMDGPUTargetLowering::getOriginalFunctionArgs( + SelectionDAG &DAG, + const Function *F, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<ISD::InputArg> &OrigIns) const { + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + if (Ins[i].ArgVT == Ins[i].VT) { + OrigIns.push_back(Ins[i]); + continue; + } + + EVT VT; + if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { + // Vector has been split into scalars. 
+ VT = Ins[i].ArgVT.getVectorElementType(); + } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && + Ins[i].ArgVT.getVectorElementType() != + Ins[i].VT.getVectorElementType()) { + // Vector elements have been promoted + VT = Ins[i].ArgVT; + } else { + // Vector has been spilt into smaller vectors. + VT = Ins[i].VT; + } + + ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, + Ins[i].OrigArgIndex, Ins[i].PartOffset); + OrigIns.push_back(Arg); + } +} + +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->isExactlyValue(1.0); + } + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + return C->isAllOnesValue(); + } + return false; +} + +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->getValueAPF().isZero(); + } + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + return C->isNullValue(); + } + return false; +} + +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned VirtualRegister; + if (!MRI.isLiveIn(Reg)) { + VirtualRegister = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VirtualRegister); + } else { + VirtualRegister = MRI.getLiveInVirtReg(Reg); + } + return DAG.getRegister(VirtualRegister, VT); +} + +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; + +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((AMDGPUISD::NodeType)Opcode) { + case AMDGPUISD::FIRST_NUMBER: break; + // AMDIL DAG nodes + NODE_NAME_CASE(CALL); + NODE_NAME_CASE(UMUL); + NODE_NAME_CASE(RET_FLAG); + NODE_NAME_CASE(BRANCH_COND); + + // AMDGPU DAG nodes + NODE_NAME_CASE(DWORDADDR) + NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(CLAMP) + NODE_NAME_CASE(COS_HW) + NODE_NAME_CASE(SIN_HW) + NODE_NAME_CASE(FMAX_LEGACY) + NODE_NAME_CASE(FMIN_LEGACY) + NODE_NAME_CASE(FMAX3) + NODE_NAME_CASE(SMAX3) + NODE_NAME_CASE(UMAX3) + NODE_NAME_CASE(FMIN3) + NODE_NAME_CASE(SMIN3) + NODE_NAME_CASE(UMIN3) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(LDEXP) + NODE_NAME_CASE(FP_CLASS) + NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(CARRY) + NODE_NAME_CASE(BORROW) + NODE_NAME_CASE(BFE_U32) + NODE_NAME_CASE(BFE_I32) + NODE_NAME_CASE(BFI) + NODE_NAME_CASE(BFM) + NODE_NAME_CASE(BREV) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MAD_U24) + NODE_NAME_CASE(MAD_I24) + NODE_NAME_CASE(TEXTURE_FETCH) + NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(REGISTER_LOAD) + NODE_NAME_CASE(REGISTER_STORE) + NODE_NAME_CASE(LOAD_CONSTANT) + NODE_NAME_CASE(LOAD_INPUT) + NODE_NAME_CASE(SAMPLE) + NODE_NAME_CASE(SAMPLEB) + NODE_NAME_CASE(SAMPLED) + NODE_NAME_CASE(SAMPLEL) + NODE_NAME_CASE(CVT_F32_UBYTE0) + NODE_NAME_CASE(CVT_F32_UBYTE1) + NODE_NAME_CASE(CVT_F32_UBYTE2) + NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + NODE_NAME_CASE(CONST_DATA_PTR) + case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(SENDMSG) + NODE_NAME_CASE(INTERP_MOV) + NODE_NAME_CASE(INTERP_P1) + NODE_NAME_CASE(INTERP_P2) + NODE_NAME_CASE(STORE_MSKOR) + 
NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; + } + return nullptr; +} + +SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also f64 rsq instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + // Reciprocal, < 1 ulp error. + // + // This reciprocal approximation converges to < 0.5 ulp error with one + // newton rhapson performed with two fused multiple adds (FMAs). + + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also f64 rcp instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +static void computeKnownBitsForMinMax(const SDValue Op0, + const SDValue Op1, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) { + APInt Op0Zero, Op0One; + APInt Op1Zero, Op1One; + DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); + DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); + + KnownZero = Op0Zero & Op1Zero; + KnownOne = Op0One & Op1One; +} + +void AMDGPUTargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + + APInt KnownZero2; + APInt KnownOne2; + unsigned Opc = Op.getOpcode(); + + switch (Opc) { + default: + break; + case ISD::INTRINSIC_WO_CHAIN: { + // FIXME: The intrinsic should just use the node. + switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { + case AMDGPUIntrinsic::AMDGPU_imax: + case AMDGPUIntrinsic::AMDGPU_umax: + case AMDGPUIntrinsic::AMDGPU_imin: + case AMDGPUIntrinsic::AMDGPU_umin: + computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); + break; + default: + break; + } + + break; + } + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: { + KnownZero = APInt::getHighBitsSet(32, 31); + break; + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + + if (Opc == AMDGPUISD::BFE_U32) + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + + break; + } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Offset || !Offset->isNullValue()) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. 
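Both the BFE combine earlier and the sign-bit count here follow the usual bitfield-extract definition; the sketch below models BFE_U32 / BFE_I32 for the Width + Offset < 32 case with the same shift pair constantFoldBFE uses (illustration only, helper names invented for this sketch):

#include <cassert>
#include <cstdint>

// Extract Width bits starting at Offset and zero-extend to 32 bits.
static uint32_t bfeU32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  uint32_t Shl = Src << (32 - Offset - Width);
  return Shl >> (32 - Width);
}

// Same extraction, but the field is sign-extended to 32 bits, so the result
// has at least 32 - Width + 1 known sign bits when Offset is 0.
static int32_t bfeI32(int32_t Src, uint32_t Offset, uint32_t Width) {
  uint32_t Shl = static_cast<uint32_t>(Src) << (32 - Offset - Width);
  return static_cast<int32_t>(Shl) >> (32 - Width);
}

int main() {
  // Bits [11:4] of 0xABC are 0xAB; sign extension sees bit 7 of the field set.
  assert(bfeU32(0xABC, 4, 8) == 0xAB);
  assert(bfeI32(0xABC, 4, 8) == -85);  // 0xFFFFFFAB
  return 0;
}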
+ unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; + } + + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: + return 31; + + default: + return 1; + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h new file mode 100644 index 0000000..fbb7d3c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -0,0 +1,307 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition of the TargetLowering class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUMachineFunction; +class AMDGPUSubtarget; +class MachineRegisterInfo; + +class AMDGPUTargetLowering : public TargetLowering { +protected: + const AMDGPUSubtarget *Subtarget; + +private: + SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + /// \brief Lower vector stores by merging the vector elements into an integer + /// of the same bitwidth. + SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; + /// \brief Split a vector store into multiple scalar stores. + /// \returns The resulting chain. 
+ + SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + +protected: + static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); + static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); + + virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const; + + /// \brief Split a vector load into a scalar load of each component. + SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector load into 2 loads of half the vector. + SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into a scalar store of each component. + SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into 2 stores of half the vector. + SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; + void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const; + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + + /// The SelectionDAGBuilder will automatically promote function arguments + /// with illegal types. However, this does not work for the AMDGPU targets + /// since the function arguments are stored in memory as these illegal types. 
+ /// In order to handle this properly we need to get the origianl types sizes + /// from the LLVM IR Function and fixup the ISD:InputArg values before + /// passing them to AnalyzeFormalArguments() + void getOriginalFunctionArgs(SelectionDAG &DAG, + const Function *F, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<ISD::InputArg> &OrigIns) const; + void AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; + +public: + AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + + bool isFAbsFree(EVT VT) const override; + bool isFNegFree(EVT VT) const override; + bool isTruncateFree(EVT Src, EVT Dest) const override; + bool isTruncateFree(Type *Src, Type *Dest) const override; + + bool isZExtFree(Type *Src, Type *Dest) const override; + bool isZExtFree(EVT Src, EVT Dest) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + + MVT getVectorIdxTy() const override; + bool isSelectSupported(SelectSupportKind) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool ShouldShrinkFPConstant(EVT VT) const override; + bool shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtType, + EVT ExtVT) const override; + + bool isLoadBitCastBeneficial(EVT, EVT) const override; + + bool storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; + SDValue CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const; + SDValue CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const; + + const char* getTargetNodeName(unsigned Opcode) const override; + + SDValue getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + + virtual SDNode *PostISelFolding(MachineSDNode *N, + SelectionDAG &DAG) const { + return N; + } + + /// \brief Determine which of the bits specified in \p Mask are known to be + /// either zero or one and return them in the \p KnownZero and \p KnownOne + /// bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. 
+ /// + /// \returns a RegisterSDNode representing Reg. + virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; +}; + +namespace AMDGPUISD { + +enum NodeType : unsigned { + // AMDIL ISD Opcodes + FIRST_NUMBER = ISD::BUILTIN_OP_END, + CALL, // Function call based on a single integer + UMUL, // 32bit unsigned multiplication + RET_FLAG, + BRANCH_COND, + // End AMDIL ISD Opcodes + DWORDADDR, + FRACT, + CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. + COS_HW, + SIN_HW, + FMAX_LEGACY, + FMIN_LEGACY, + FMAX3, + SMAX3, + UMAX3, + FMIN3, + SMIN3, + UMIN3, + URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, + LDEXP, + FP_CLASS, + DOT4, + CARRY, + BORROW, + BFE_U32, // Extract range of bits with zero extension to 32-bits. + BFE_I32, // Extract range of bits with sign extension to 32-bits. + BFI, // (src0 & src1) | (~src0 & src2) + BFM, // Insert a range of bits into a 32-bit word. + BREV, // Reverse bits. + MUL_U24, + MUL_I24, + MAD_U24, + MAD_I24, + TEXTURE_FETCH, + EXPORT, + CONST_ADDRESS, + REGISTER_LOAD, + REGISTER_STORE, + LOAD_INPUT, + SAMPLE, + SAMPLEB, + SAMPLED, + SAMPLEL, + + // These cvt_f32_ubyte* nodes need to remain consecutive and in order. + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. + /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, + /// Pointer to the start of the shader's constant data. + CONST_DATA_PTR, + SENDMSG, + INTERP_MOV, + INTERP_P1, + INTERP_P2, + FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + STORE_MSKOR, + LOAD_CONSTANT, + TBUFFER_STORE_FORMAT, + LAST_AMDGPU_ISD_NUMBER +}; + + +} // End namespace AMDGPUISD + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp new file mode 100644 index 0000000..15a3d54 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -0,0 +1,369 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implementation of the TargetInstrInfo class that is common to all +/// AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRINFO_NAMED_OPS +#define GET_INSTRMAP_INFO +#include "AMDGPUGenInstrInfo.inc" + +// Pin the vtable to this file. 
+void AMDGPUInstrInfo::anchor() {} + +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUGenInstrInfo(-1, -1), ST(st) {} + +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { + return RI; +} + +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} + +MachineInstr * +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return nullptr; +} + +void +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + llvm_unreachable("Not Implemented"); +} + +void +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + llvm_unreachable("Not Implemented"); +} + +bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { + MachineBasicBlock *MBB = MI->getParent(); + int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::addr); + // addr is a custom operand with multiple MI operands, and only the + // first MI operand is given a name. 
+ int RegOpIdx = OffsetOpIdx + 1; + int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::chan); + if (isRegisterLoad(*MI)) { + int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::dst); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + getIndirectAddrRegClass()->getRegister(Address)); + } else { + buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + Address, OffsetReg); + } + } else if (isRegisterStore(*MI)) { + int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::val); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), + MI->getOperand(ValOpIdx).getReg()); + } else { + buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), + calculateIndirectAddress(RegIndex, Channel), + OffsetReg); + } + } else { + return false; + } + + MBB->erase(MI); + return true; +} + +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { +// TODO: Implement this function + return nullptr; +} +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { + // TODO: Implement this function + return nullptr; +} +bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef<unsigned> Ops) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::enableClusterLoads() const { + return true; +} + +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. 
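The function that follows reduces to a simple count-and-distance test; the asserts below show which inputs that test accepts, with arbitrary sample offsets (a standalone restatement for illustration, not the scheduler hook itself):

#include <cassert>
#include <cstdint>

// Mirror of the check in shouldScheduleLoadsNear below: at most 16 loads in
// the run, and the two offsets within a 64-byte (global memory cacheline)
// window.
static bool wouldCluster(int64_t Offset0, int64_t Offset1, unsigned NumLoads) {
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  assert(wouldCluster(0, 48, 4));    // same cacheline, short run: cluster
  assert(!wouldCluster(0, 128, 4));  // 128 bytes apart: do not cluster
  assert(!wouldCluster(0, 48, 17));  // more than 16 loads: do not cluster
  return 0;
}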
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. + + // A cacheline is 64 bytes (for global memory). + return (NumLoads <= 16 && (Offset1 - Offset0) < 64); +} + +bool +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} + +int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = -1; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + unsigned Reg = LI->first; + if (TargetRegisterInfo::isVirtualRegister(Reg) || + !IndirectRC->contains(Reg)) + continue; + + unsigned RegIndex; + unsigned RegEnd; + for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; + ++RegIndex) { + if (IndirectRC->getRegister(RegIndex) == Reg) + break; + } + Offset = std::max(Offset, (int)RegIndex); + } + + return Offset + 1; +} + +int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + assert(!MFI->hasVarSizedObjects()); + + if (MFI->getNumObjects() == 0) { + return -1; + } + + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + + return getIndirectIndexBegin(MF) + Offset; +} + +int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { + switch (Channels) { + default: return Opcode; + case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); + case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); + case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3); + } +} + +// Wrapper for Tablegen'd function. 
enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. +namespace llvm { +namespace AMDGPU { +static int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); +} +} +} + +// This must be kept in sync with the SISubtarget class in SIInstrInfo.td +enum SISubtarget { + SI = 0, + VI = 1 +}; + +static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { + switch (Gen) { + default: + return SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + return VI; + } +} + +int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { + int MCOp = AMDGPU::getMCOpcode( + Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + + // -1 means that Opcode is already a native instruction. + if (MCOp == -1) + return Opcode; + + // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // no encoding in the given subtarget generation. + if (MCOp == (uint16_t)-1) + return -1; + + return MCOp; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h new file mode 100644 index 0000000..31ae9a3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -0,0 +1,206 @@ +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Contains the definition of a TargetInstrInfo class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H + +#include "AMDGPURegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include <map> + +#define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE + +namespace llvm { + +class AMDGPUSubtarget; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; + +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +private: + const AMDGPURegisterInfo RI; + virtual void anchor(); +protected: + const AMDGPUSubtarget &ST; +public: + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const override; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + 
LiveVariables *LV) const override; + + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + MachineInstr *LoadMI) const override; + +public: + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexBegin(const MachineFunction &MF) const; + + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexEnd(const MachineFunction &MF) const; + + bool canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef<unsigned> Ops) const override; + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const override; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode *> &NewNodes) const override; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; + + bool enableClusterLoads() const override; + + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const override; + + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + bool isPredicated(const MachineInstr *MI) const override; + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const override; + bool isPredicable(MachineInstr *MI) const override; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + // Helper functions that check the opcode for status information + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; + + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. + /// Return -1 if the target-specific opcode for the pseudo instruction does + /// not exist. If Opcode is not a pseudo instruction, this is identity. + int pseudoToMCOpcode(int Opcode) const; + + /// \brief Return the descriptor of the target-specific machine instruction + /// that corresponds to the specified pseudo or native opcode. + const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { + return get(pseudoToMCOpcode(Opcode)); + } + +//===---------------------------------------------------------------------===// +// Pure virtual funtions to be implemented by sub-classes. 
+//===---------------------------------------------------------------------===// + + virtual bool isMov(unsigned opcode) const = 0; + + /// \brief Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel + /// + /// We model indirect addressing using a virtual address space that can be + /// accesed with loads and stores. The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const = 0; + + /// \returns The register class to be used for loading and storing values + /// from an "Indirect Address" . + virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; + + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build instruction(s) for an indirect register read. + /// + /// \returns The instruction that performs the indirect register read + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build a MOV instruction. + virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const = 0; + + /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the + /// equivalent opcode that writes \p Channels Channels. + int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; + +}; + +namespace AMDGPU { + int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); +} // End namespace AMDGPU + +} // namespace llvm + +#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) +#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td new file mode 100644 index 0000000..b413897 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -0,0 +1,245 @@ +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains DAG node defintions for the AMDGPU target. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Profiles +//===----------------------------------------------------------------------===// + +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; + +def AMDGPUTrigPreOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPULdExpOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUFPClassOp : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] +>; + +def AMDGPUDivScaleOp : SDTypeProfile<2, 3, + [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] +>; + +// float, float, float, vcc +def AMDGPUFmasOp : SDTypeProfile<1, 4, + [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] +>; + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Nodes +// + +// This argument to this node is a dword address. +def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; + +def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + +// out = a - floor(a) +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; + +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) result clamped to +/- max_float. +def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; + +def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; + +def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; + +// out = max(a, b) a and b are floats, where a nan comparison fails. +// This is not commutative because this gives the second operand: +// x < nan ? x : nan -> nan +// nan < x ? nan : x -> x +def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, + [] +>; + +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; + +// out = max(a, b) a and b are signed ints +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are unsigned ints +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are floats, where a nan comparison fails. +def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, + [] +>; + +// FIXME: TableGen doesn't like commutative instructions with more +// than 2 operands. 
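fmax_legacy and fmin_legacy above are deliberately defined without SDNPCommutative: as the comment on FMAX_LEGACY explains, a failed (NaN) comparison hands back the second operand, so swapping the operands changes the result. A scalar model of that behaviour, purely as an illustration:

#include <cassert>
#include <cmath>

// Legacy max as a bare compare-and-select; any comparison against NaN is
// false, so the second operand is returned in that case.
static float fmaxLegacy(float A, float B) { return (A > B) ? A : B; }

int main() {
  float Nan = std::nanf("");
  assert(fmaxLegacy(1.0f, 2.0f) == 2.0f);
  assert(std::isnan(fmaxLegacy(1.0f, Nan)));   // 1.0 > NaN is false: NaN wins
  assert(!std::isnan(fmaxLegacy(Nan, 1.0f)));  // NaN > 1.0 is false: 1.0 wins
  return 0;
}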
+// out = max(a, b, c) a, b and c are floats +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b, and c are signed ints +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b and c are unsigned ints +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are floats +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are signed ints +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 +def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; + +// out = (src1 > src0) ? 1 : 0 +def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; + + +def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", + SDTIntToFPOp, []>; + + +// urecip - This operation is a helper for integer division, it returns the +// result of 1 / a as a fractional unsigned integer. +// out = (2^32 / a) + e +// e is rounding error +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; + +// Special case divide preop and flags. +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; + +// Special case divide FMA with scale and flags (src0 = Quotient, +// src1 = Denominator, src2 = Numerator). +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; + +// Single or double precision division fixup. +// Special case divide fixup and flags(src0 = Quotient, src1 = +// Denominator, src2 = Numerator). +def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; + +// Look Up 2.0 / pi src0 with segment select src1[4:0] +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; + +def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayLoad]>; + +def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; + +// MSKOR instructions are atomic memory instructions used mainly for storing +// 8-bit and 16-bit values. 
The definition is: +// +// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) +// +// src0: vec4(src, 0, 0, mask) +// src1: dst - rat offset (aka pointer) in dwords +def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", + SDTypeProfile<0, 2, []>, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def AMDGPUround : SDNode<"ISD::FROUND", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; + +def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; +def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; + +def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; + +// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when +// performing the mulitply. The result is a 32-bit value. +def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", + SDTypeProfile<1, 4, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td new file mode 100644 index 0000000..72cab39 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -0,0 +1,682 @@ +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains instruction defs that are common to all hw codegen +// targets. 
+// +//===----------------------------------------------------------------------===// + +class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction { + field bit isRegisterLoad = 0; + field bit isRegisterStore = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = NullALU; + + let TSFlags{63} = isRegisterLoad; + let TSFlags{62} = isRegisterStore; +} + +class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> + : AMDGPUInst<outs, ins, asm, pattern> { + + field bits<32> Inst = 0xffffffff; + +} + +def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; + +def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; +def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; + +let OperandType = "OPERAND_IMMEDIATE" in { + +def u32imm : Operand<i32> { + let PrintMethod = "printU32ImmOperand"; +} + +def u16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; +} + +def u8imm : Operand<i8> { + let PrintMethod = "printU8ImmOperand"; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand<OtherVT>; + +//===----------------------------------------------------------------------===// +// PatLeafs for floating-point comparisons +//===----------------------------------------------------------------------===// + +def COND_OEQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] +>; + +def COND_ONE : PatLeaf < + (cond), + [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] +>; + +def COND_OGT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] +>; + +def COND_OGE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] +>; + +def COND_OLT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] +>; + +def COND_OLE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] +>; + + +def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; +def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for unsigned / unordered comparisons +//===----------------------------------------------------------------------===// + +def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; +def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; +def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; +def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; +def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; +def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; + +// XXX - For some reason R600 version is preferring to use unordered +// for setne? 
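The difference the XXX comment is asking about only shows up with NaN operands: SETONE is false whenever either input is NaN, while SETUNE is true. A small model of the two predicates (illustration only):

#include <cassert>
#include <cmath>

// Ordered not-equal: unequal and neither operand is NaN.
static bool setONE(float A, float B) {
  return !std::isnan(A) && !std::isnan(B) && A != B;
}

// Unordered not-equal: unequal or at least one operand is NaN.
static bool setUNE(float A, float B) {
  return std::isnan(A) || std::isnan(B) || A != B;
}

int main() {
  float Nan = std::nanf("");
  assert(setONE(1.0f, 2.0f) && setUNE(1.0f, 2.0f));    // plain inequality
  assert(!setONE(1.0f, 1.0f) && !setUNE(1.0f, 1.0f));  // equal values
  assert(!setONE(1.0f, Nan) && setUNE(1.0f, Nan));     // NaN: predicates differ
  return 0;
}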
+def COND_UNE_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] +>; + +//===----------------------------------------------------------------------===// +// PatLeafs for signed comparisons +//===----------------------------------------------------------------------===// + +def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; +def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; +def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; +def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for integer equality +//===----------------------------------------------------------------------===// + +def COND_EQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] +>; + +def COND_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] +>; + +def COND_NULL : PatLeaf < + (cond), + [{(void)N; return false;}] +>; + +//===----------------------------------------------------------------------===// +// Load/Store Pattern Fragments +//===----------------------------------------------------------------------===// + +class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; +}]>; + +class PrivateLoad <SDPatternOperator op> : PrivateMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class PrivateStore <SDPatternOperator op> : PrivateMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def load_private : PrivateLoad <load>; + +def truncstorei8_private : PrivateStore <truncstorei8>; +def truncstorei16_private : PrivateStore <truncstorei16>; +def store_private : PrivateStore <store>; + +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), + (ld_node node:$ptr), [{ + LoadSDNode *L = cast<LoadSDNode>(N); + return L->getExtensionType() == ISD::ZEXTLOAD || + L->getExtensionType() == ISD::EXTLOAD; +}]>; + +def az_extload : AZExtLoadBase <unindexedload>; + +def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); 
+}]>; + +def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def extloadi8_private : PrivateLoad <az_extloadi8>; +def sextloadi8_private : PrivateLoad <sextloadi8>; + +def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def extloadi16_private : PrivateLoad <az_extloadi16>; +def sextloadi16_private : PrivateLoad <sextloadi16>; + +def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def az_extloadi32_global : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def az_extloadi32_constant : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def local_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isLocalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +class Aligned8Bytes 
<dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAlignment() % 8 == 0; +}]>; + +def local_load_aligned8bytes : Aligned8Bytes < + (ops node:$ptr), (local_load node:$ptr) +>; + +def local_store_aligned8bytes : Aligned8Bytes < + (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +>; + +class local_binary_atomic_op<SDNode atomic_op> : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + + +def atomic_swap_local : local_binary_atomic_op<atomic_swap>; +def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>; +def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>; +def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>; +def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>; +def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>; +def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>; +def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>; +def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>; +def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>; +def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; + +def mskor_global : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; + +multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { + + def _32_local : PatFrag < + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; + + def _64_local : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; +} + +defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; + +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def flat_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + +class global_binary_atomic_op<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] +>; + +def atomic_swap_global : global_binary_atomic_op<atomic_swap>; +def atomic_add_global : global_binary_atomic_op<atomic_load_add>; +def atomic_and_global : global_binary_atomic_op<atomic_load_and>; +def atomic_max_global : global_binary_atomic_op<atomic_load_max>; +def atomic_min_global : global_binary_atomic_op<atomic_load_min>; +def atomic_or_global : global_binary_atomic_op<atomic_load_or>; +def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; +def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; +def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; +def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; + 
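The mskor_global and mskor_flat fragments above select the AMDGPUstore_mskor node whose behavior was described earlier as MEM[dst] = ((MEM[dst] & ~mask) | src); the neighboring load/store and atomic fragments differ only in the address-space or alignment check their predicates perform. Below is a minimal, self-contained C++ sketch of those semantics and of the 24-bit multiply described for MUL_U24/MUL_I24. It uses plain functions for illustration, not LLVM API code, and the address-space numbers are assumed values rather than the real AMDGPUAS constants.

#include <cassert>
#include <cstdint>

// Assumed address-space numbering, for illustration only; the real values
// come from the AMDGPUAS definitions in the target headers.
enum AddressSpace { GLOBAL_ADDRESS = 1, LOCAL_ADDRESS = 3, FLAT_ADDRESS = 4 };

// STORE_MSKOR semantics: clear the destination bits selected by mask, then
// OR in the (already shifted) source bits.
inline void storeMskor(uint32_t &dst, uint32_t mask, uint32_t src) {
  dst = (dst & ~mask) | src;
}

// The kind of check the *_local / *_global PatFrag predicates make before a
// pattern is allowed to match.
inline bool isLocalAccess(unsigned addrSpace) { return addrSpace == LOCAL_ADDRESS; }

// The Aligned8Bytes predicate: the access alignment is a multiple of 8 bytes.
inline bool isAligned8Bytes(unsigned alignBytes) { return alignBytes % 8 == 0; }

// MUL_U24: the top 8 bits of each operand are ignored, the result is 32 bits.
// (The signed MUL_I24 variant would sign-extend from bit 23 instead.)
inline uint32_t mulU24(uint32_t a, uint32_t b) {
  return static_cast<uint32_t>(uint64_t(a & 0xFFFFFFu) * uint64_t(b & 0xFFFFFFu));
}

int main() {
  uint32_t mem = 0xAABBCCDDu;
  storeMskor(mem, 0x0000FF00u, 0x00001100u);   // rewrites only the masked byte
  assert(mem == 0xAABB11DDu);
  assert(isLocalAccess(LOCAL_ADDRESS) && !isLocalAccess(GLOBAL_ADDRESS));
  assert(isAligned8Bytes(16) && !isAligned8Bytes(4));
  assert(mulU24(0xFF000003u, 2u) == 6u);       // high 8 bits of src0 ignored
  return 0;
}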
+//===----------------------------------------------------------------------===// +// Misc Pattern Fragments +//===----------------------------------------------------------------------===// + +class Constants { +int TWO_PI = 0x40c90fdb; +int PI = 0x40490fdb; +int TWO_PI_INV = 0x3e22f983; +int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP32_NEG_ONE = 0xbf800000; +int FP32_ONE = 0x3f800000; +} +def CONST : Constants; + +def FP_ZERO : PatLeaf < + (fpimm), + [{return N->getValueAPF().isZero();}] +>; + +def FP_ONE : PatLeaf < + (fpimm), + [{return N->isExactlyValue(1.0);}] +>; + +def FP_HALF : PatLeaf < + (fpimm), + [{return N->isExactlyValue(0.5);}] +>; + +let isCodeGenOnly = 1, isPseudo = 1 in { + +let usesCustomInserter = 1 in { + +class CLAMP <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "CLAMP $dst, $src0", + [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] +>; + +class FABS <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FABS $dst, $src0", + [(set f32:$dst, (fabs f32:$src0))] +>; + +class FNEG <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FNEG $dst, $src0", + [(set f32:$dst, (fneg f32:$src0))] +>; + +} // usesCustomInserter = 1 + +multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, + ComplexPattern addrPat> { +let UseNamedOperandTable = 1 in { + + def RegisterLoad : AMDGPUShaderInst < + (outs dstClass:$dst), + (ins addrClass:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] + > { + let isRegisterLoad = 1; + } + + def RegisterStore : AMDGPUShaderInst < + (outs), + (ins dstClass:$val, addrClass:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] + > { + let isRegisterStore = 1; + } +} +} + +} // End isCodeGenOnly = 1, isPseudo = 1 + +/* Generic helper patterns for intrinsics */ +/* -------------------------------------- */ + +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> + : Pat < + (fpow f32:$src0, f32:$src1), + (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) +>; + +/* Other helper patterns */ +/* --------------------- */ + +/* Extract element pattern */ +class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, + SubRegIndex sub_reg> + : Pat< + (sub_type (vector_extract vec_type:$src, sub_idx)), + (EXTRACT_SUBREG $src, sub_reg) +>; + +/* Insert element pattern */ +class Insert_Element <ValueType elem_type, ValueType vec_type, + int sub_idx, SubRegIndex sub_reg> + : Pat < + (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (INSERT_SUBREG $vec, $elem, sub_reg) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. +// bitconvert pattern +class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat < + (dt (bitconvert (st rc:$src0))), + (dt rc:$src0) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. 
+class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < + (vt (AMDGPUdwordaddr (vt rc:$addr))), + (vt rc:$addr) +>; + +// BFI_INT patterns + +multiclass BFIPatterns <Instruction BFI_INT, + Instruction LoadImm32, + RegisterClass RC64> { + // Definition from ISA doc: + // (y & x) | (z & ~x) + def : Pat < + (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), + (BFI_INT $x, $y, $z) + >; + + // SHA-256 Ch function + // z ^ (x & (y ^ z)) + def : Pat < + (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), + (BFI_INT $x, $y, $z) + >; + + def : Pat < + (fcopysign f32:$src0, f32:$src1), + (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + >; + + def : Pat < + (f64 (fcopysign f64:$src0, f64:$src1)), + (REG_SEQUENCE RC64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (BFI_INT (LoadImm32 0x7fffffff), + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) + >; +} + +// SHA-256 Ma patterns + +// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y +class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) +>; + +// Bitfield extract patterns + +def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ + return isMask_32(N->getZExtValue()); +}]>; + +def IMMPopCount : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; + +class BFEPattern <Instruction BFE, Instruction MOV> : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) +>; + +// rotr pattern +class ROTRPattern <Instruction BIT_ALIGN> : Pat < + (rotr i32:$src0, i32:$src1), + (BIT_ALIGN $src0, $src0, $src1) +>; + +// 24-bit arithmetic patterns +def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; + +// Special conversion patterns + +def cvt_rpi_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor (fadd $src, FP_HALF))), + [{ (void) N; return TM.Options.NoNaNsFPMath; }] +>; + +def cvt_flr_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor $src)), + [{ (void)N; return TM.Options.NoNaNsFPMath; }] +>; + +/* +class UMUL24Pattern <Instruction UMUL24> : Pat < + (mul U24:$x, U24:$y), + (UMUL24 $x, $y) +>; +*/ + +class IMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +class RcpPat<Instruction RcpInst, ValueType vt> : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +class RsqPat<Instruction RsqInst, ValueType vt> : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; + +include "R600Instructions.td" +include "R700Instructions.td" +include "EvergreenInstructions.td" +include "CaymanInstructions.td" + 
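The BFI_INT, SHA-256 and bitfield-extract pattern classes above rest on plain bitwise identities: BFI_INT(x, y, z) computes (y & x) | (z & ~x), the Ch form z ^ (x & (y ^ z)) produces the same result, the Ma form (x & z) | (y & (x | z)) rewrites to BFI_INT(x ^ y, z, y), and BFEPattern folds (src >> rshift) & mask, with mask a zero-based mask of the form (1 << width) - 1, into a bitfield extract of popcount(mask) bits. The following self-checking C++ sketch verifies those identities with ordinary integer arithmetic; it is illustrative only and is not tied to LLVM or to any particular instruction encoding.

#include <bit>               // std::popcount (C++20)
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Reference semantics of BFI_INT as stated in the ISA-doc comment above.
constexpr uint32_t bfi(uint32_t x, uint32_t y, uint32_t z) {
  return (y & x) | (z & ~x);
}

// Unsigned bitfield extract: 'width' bits of 'src' starting at bit 'offset'.
constexpr uint32_t bfeU32(uint32_t src, uint32_t offset, uint32_t width) {
  return width >= 32 ? src >> offset : (src >> offset) & ((1u << width) - 1u);
}

int main() {
  for (uint32_t x : {0u, 0xFFFFFFFFu, 0x12345678u, 0xA5A5A5A5u})
    for (uint32_t y : {0u, 0xFFFFFFFFu, 0x0F0F0F0Fu})
      for (uint32_t z : {0u, 0xFFFFFFFFu, 0x55555555u}) {
        // SHA-256 Ch: selects y where x is set and z where it is clear.
        assert((z ^ (x & (y ^ z))) == bfi(x, y, z));
        // SHA-256 Ma rewrite used by SHA256MaPattern.
        assert(((x & z) | (y & (x | z))) == bfi(x ^ y, z, y));
      }

  // BFEPattern: shift-and-mask with a zero-based mask is a bitfield extract
  // whose width is the population count of the mask.
  const uint32_t src = 0xDEADBEEFu, rshift = 4u, mask = 0xFFu;
  assert(((src >> rshift) & mask) ==
         bfeU32(src, rshift, static_cast<uint32_t>(std::popcount(mask))));
  return 0;
}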
+include "SIInstrInfo.td" + diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp new file mode 100644 index 0000000..e94bb60 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -0,0 +1,77 @@ +//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "AMDGPUGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() + : TargetIntrinsicInfo() {} + +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { + static const char *const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + if (IntrID < Intrinsic::num_intrinsics) { + return nullptr; + } + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + +unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, + unsigned Len) const { + if (!StringRef(Name, Len).startswith("llvm.")) + return 0; // All intrinsics start with 'llvm.' + +#define GET_FUNCTION_RECOGNIZER +#include "AMDGPUGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + AMDGPUIntrinsic::ID IntrinsicID = + (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); + + if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { + return IntrinsicID; + } + return 0; +} + +bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { + llvm_unreachable("Not implemented"); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h new file mode 100644 index 0000000..4c95b5e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -0,0 +1,48 @@ +//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. 
+// +//===-----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { +class TargetMachine; + +namespace AMDGPUIntrinsic { +enum ID { + last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_AMDGPU_intrinsics +}; + +} // end namespace AMDGPUIntrinsic + +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +public: + AMDGPUIntrinsicInfo(); + std::string getName(unsigned IntrId, Type **Tys = nullptr, + unsigned numTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; + bool isOverloaded(unsigned IID) const override; + Function *getDeclaration(Module *M, unsigned ID, + Type **Tys = nullptr, + unsigned numTys = 0) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td new file mode 100644 index 0000000..ab489cd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -0,0 +1,90 @@ +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines intrinsics that are used by all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + + // This is named backwards (instead of rsq_legacy) so we don't have + // to define it with the public builtins intrinsics. This is a + // workaround for how intrinsic names are parsed. If the name is + // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant + // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. 
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], 
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; +} + +// Legacy names for compatibility. +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +} + +let TargetPrefix = "TGSI", isTarget = 1 in { + + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; +} + +include "SIIntrinsics.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp new file mode 100644 index 0000000..2083146 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -0,0 +1,154 @@ +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUMCInstLower.h" +#include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "R600InstrInfo.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include <algorithm> + +using namespace llvm; + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): + Ctx(ctx), ST(st) +{ } + +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); + + if (MCOpcode == -1) { + LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " + "a target-specific version: " + Twine(MI->getOpcode())); + } + + OutMI.setOpcode(MCOpcode); + + for (const MachineOperand &MO : MI->explicit_operands()) { + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_Register: + MCOp = MCOperand::createReg(MO.getReg()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( + MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); + break; + } + case MachineOperand::MO_TargetIndex: { + assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + break; + } + case MachineOperand::MO_ExternalSymbol: { + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + break; + } + } + OutMI.addOperand(MCOp); + } +} + +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + AMDGPUMCInstLower MCInstLowering(OutContext, STI); + +#ifdef _DEBUG + StringRef Err; + if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { + errs() << "Warning: Illegal instruction detected: " << Err << "\n"; + MI->dump(); + } +#endif + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = MI; + ++I; + while (I != MBB->end() && I->isInsideBundle()) { + EmitInstruction(I); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + + if (STI.dumpCode()) { + // Disassemble instruction/operands to text. 
+ DisasmLines.resize(DisasmLines.size() + 1); + std::string &DisasmLine = DisasmLines.back(); + raw_string_ostream DisasmStream(DisasmLine); + + AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo()); + InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), + MF->getSubtarget()); + + // Disassemble instruction/operands to hex representation. + SmallVector<MCFixup, 4> Fixups; + SmallVector<char, 16> CodeBytes; + raw_svector_ostream CodeStream(CodeBytes); + + auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer); + MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); + InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, + MF->getSubtarget<MCSubtargetInfo>()); + CodeStream.flush(); + + HexLines.resize(HexLines.size() + 1); + std::string &HexLine = HexLines.back(); + raw_string_ostream HexStream(HexLine); + + for (size_t i = 0; i < CodeBytes.size(); i += 4) { + unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i]; + HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord); + } + + DisasmStream.flush(); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size()); + } + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h new file mode 100644 index 0000000..d322fe0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -0,0 +1,35 @@ +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H + +namespace llvm { + +class AMDGPUSubtarget; +class MachineInstr; +class MCContext; +class MCInst; + +class AMDGPUMCInstLower { + MCContext &Ctx; + const AMDGPUSubtarget &ST; + +public: + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); + + /// \brief Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp new file mode 100644 index 0000000..21c7da6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -0,0 +1,25 @@ +#include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +using namespace llvm; + +static const char *const ShaderTypeAttribute = "ShaderType"; + +// Pin the vtable to this file. 
+void AMDGPUMachineFunction::anchor() {} + +AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0), + ScratchSize(0), + IsKernel(true) { + Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, ShaderType)) + llvm_unreachable("Can't parse shader type!"); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h new file mode 100644 index 0000000..e17b41a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -0,0 +1,45 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H + +#include "llvm/CodeGen/MachineFunction.h" +#include <map> + +namespace llvm { + +class AMDGPUMachineFunction : public MachineFunctionInfo { + virtual void anchor(); + unsigned ShaderType; + +public: + AMDGPUMachineFunction(const MachineFunction &MF); + /// A map to keep track of local memory objects and their offsets within + /// the local memory space. + std::map<const GlobalValue *, unsigned> LocalMemoryObjects; + /// Number of bytes in the LDS that are being used. + unsigned LDSSize; + + /// Start of implicit kernel args + unsigned ABIArgOffset; + + unsigned getShaderType() const { + return ShaderType; + } + + unsigned ScratchSize; + bool IsKernel; +}; + +} // namespace llvm +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp new file mode 100644 index 0000000..4a65bfc --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,407 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor<AMDGPUPromoteAlloca> { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Promote Alloca"; } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + const FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { + const Type *ParamTy = FTy->getParamType(i); + if (ParamTy->isPointerTy() && + ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemAvailable = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + break; + } + } + + if (LocalMemAvailable > 0) { + // Check how much local memory is being used by global objects + for (Module::global_iterator I = Mod->global_begin(), + E = Mod->global_end(); I != E; ++I) { + GlobalVariable *GV = I; + PointerType *GVTy = GV->getType(); + if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + for (Value::use_iterator U = GV->use_begin(), + UE = GV->use_end(); U != UE; ++U) { + Instruction *Use = dyn_cast<Instruction>(*U); + if (!Use) + continue; + if (Use->getParent()->getParent() == &F) + LocalMemAvailable -= + Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); + } + } + } + + LocalMemAvailable = std::max(0, LocalMemAvailable); + DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + + visit(F); + + return false; +} + +static VectorType *arrayTypeToVecType(const Type *ArrayTy) { + return VectorType::get(ArrayTy->getArrayElementType(), + ArrayTy->getArrayNumElements()); +} + +static Value * +calculateVectorIndex(Value *Ptr, + const std::map<GetElementPtrInst *, Value *> &GEPIdx) { + if (isa<AllocaInst>(Ptr)) + return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); + + GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + + auto I = GEPIdx.find(GEP); + return I == GEPIdx.end() ? nullptr : I->second; +} + +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { + // FIXME we only support simple cases + if (GEP->getNumOperands() != 3) + return NULL; + + ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); + if (!I0 || !I0->isZero()) + return NULL; + + return GEP->getOperand(2); +} + +// Not an instruction handled below to turn into a vector. +// +// TODO: Check isTriviallyVectorizable for calls and handle other +// instructions. 
+static bool canVectorizeInst(Instruction *Inst) { + switch (Inst->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + return true; + default: + return false; + } +} + +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + + // FIXME: There is no reason why we can't support larger arrays, we + // are just being conservative for now. + if (!AllocaTy->isArrayTy() || + AllocaTy->getArrayElementType()->isVectorTy() || + AllocaTy->getArrayNumElements() > 4) { + + DEBUG(dbgs() << " Cannot convert type to vector"); + return false; + } + + std::map<GetElementPtrInst*, Value*> GEPVectorIdx; + std::vector<Value*> WorkList; + for (User *AllocaUser : Alloca->users()) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); + if (!GEP) { + if (!canVectorizeInst(cast<Instruction>(AllocaUser))) + return false; + + WorkList.push_back(AllocaUser); + continue; + } + + Value *Index = GEPToVectorIndex(GEP); + + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. + if (!Index) { + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (User *GEPUser : AllocaUser->users()) { + if (!canVectorizeInst(cast<Instruction>(GEPUser))) + return false; + + WorkList.push_back(GEPUser); + } + } + + VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + + DEBUG(dbgs() << " Converting alloca to vector " + << *AllocaTy << " -> " << *VectorTy << '\n'); + + for (std::vector<Value*>::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + Instruction *Inst = cast<Instruction>(*I); + IRBuilder<> Builder(Inst); + switch (Inst->getOpcode()) { + case Instruction::Load: { + Value *Ptr = Inst->getOperand(0); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + Inst->replaceAllUsesWith(ExtractElement); + Inst->eraseFromParent(); + break; + } + case Instruction::Store: { + Value *Ptr = Inst->getOperand(1); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, + Inst->getOperand(0), + Index); + Builder.CreateStore(NewVecValue, BitCast); + Inst->eraseFromParent(); + break; + } + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + break; + + default: + Inst->dump(); + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + } + return true; +} + +static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { + bool Success = true; + for (User *User : Val->users()) { + if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + continue; + if (isa<CallInst>(User)) { + WorkList.push_back(User); + continue; + } + + // FIXME: Correctly handle ptrtoint instructions. 
+ Instruction *UseInst = dyn_cast<Instruction>(User); + if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + return false; + + if (!User->getType()->isPointerTy()) + continue; + + WorkList.push_back(User); + + Success &= collectUsesWithPtrTypes(User, WorkList); + } + return Success; +} + +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + IRBuilder<> Builder(&I); + + // First try to replace the alloca with a vector + Type *AllocaTy = I.getAllocatedType(); + + DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + if (tryPromoteAllocaToVector(&I)) + return; + + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + + // FIXME: This is the maximum work group size. We should try to get + // value from the reqd_work_group_size function attribute if it is + // available. + unsigned WorkGroupSize = 256; + int AllocaSize = + WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); + + if (AllocaSize > LocalMemAvailable) { + DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + return; + } + + std::vector<Value*> WorkList; + + if (!collectUsesWithPtrTypes(&I, WorkList)) { + DEBUG(dbgs() << " Do not know how to convert all uses\n"); + return; + } + + DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LocalMemAvailable -= AllocaSize; + + Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); + GlobalVariable *GV = new GlobalVariable( + *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(Mod->getContext()), false); + AttributeSet AttrSet; + AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); + + Value *ReadLocalSizeY = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.y", FTy, AttrSet); + Value *ReadLocalSizeZ = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.z", FTy, AttrSet); + Value *ReadTIDIGX = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.x", FTy, AttrSet); + Value *ReadTIDIGY = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.y", FTy, AttrSet); + Value *ReadTIDIGZ = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.z", FTy, AttrSet); + + Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); + Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); + Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + Tmp0 = Builder.CreateMul(Tmp0, TIdX); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); + TID = Builder.CreateAdd(TID, TIdZ); + + std::vector<Value*> Indices; + Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); + Indices.push_back(TID); + + Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); + I.mutateType(Offset->getType()); + I.replaceAllUsesWith(Offset); + I.eraseFromParent(); + + for (std::vector<Value*>::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Value *V = *i; + CallInst *Call = dyn_cast<CallInst>(V); + if (!Call) { + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + // The operand's value should be corrected on its own. + if (isa<AddrSpaceCastInst>(V)) + continue; + + // FIXME: It doesn't really make sense to try to do this for all + // instructions. 
+ V->mutateType(NewTy); + continue; + } + + IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); + if (!Intr) { + std::vector<Type*> ArgTypes; + for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); + ArgIdx != ArgEnd; ++ArgIdx) { + ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); + } + Function *F = Call->getCalledFunction(); + FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, + F->isVarArg()); + Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), + NewType, F->getAttributes()); + Function *NewF = cast<Function>(C); + Call->setCalledFunction(NewF); + continue; + } + + Builder.SetInsertPoint(Intr); + switch (Intr->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // These intrinsics are for address space 0 only + Intr->eraseFromParent(); + continue; + case Intrinsic::memcpy: { + MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), + MemCpy->getLength(), MemCpy->getAlignment(), + MemCpy->isVolatile()); + Intr->eraseFromParent(); + continue; + } + case Intrinsic::memset: { + MemSetInst *MemSet = cast<MemSetInst>(Intr); + Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp new file mode 100644 index 0000000..3ca0eca --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -0,0 +1,63 @@ +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} + +//===----------------------------------------------------------------------===// +// Function handling callbacks - Functions are a seldom used feature of GPUS, so +// they are not supported at this time. 
+//===----------------------------------------------------------------------===// + +const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg* +AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + return &CalleeSavedReg; +} + +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("Subroutines not supported yet"); +} + +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { + static const unsigned SubRegs[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, + AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, + AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, + AMDGPU::sub15 + }; + + assert(Channel < array_lengthof(SubRegs)); + return SubRegs[Channel]; +} + +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + return getSubRegFromChannel(IndirectIndex); +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h new file mode 100644 index 0000000..cfd800b --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -0,0 +1,64 @@ +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +namespace llvm { + +class AMDGPUSubtarget; +class TargetInstrInfo; + +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { + static const MCPhysReg CalleeSavedReg; + + AMDGPURegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override { + assert(!"Unimplemented"); return BitVector(); + } + + virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { + assert(!"Unimplemented"); return nullptr; + } + + virtual unsigned getHWRegIndex(unsigned Reg) const { + assert(!"Unimplemented"); return 0; + } + + /// \returns the sub reg enum value for the given \p Channel + /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) + unsigned getSubRegFromChannel(unsigned Channel) const; + + const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; + + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td new file mode 100644 index 0000000..835a146 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -0,0 +1,26 @@ +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tablegen register definitions common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let Namespace = "AMDGPU" in { + +foreach Index = 0-15 in { + // Indices are used in a variety of ways here, so don't set a size/offset. + def sub#Index : SubRegIndex<-1, -1>; +} + +def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; + +} + +include "R600RegisterInfo.td" +include "SIRegisterInfo.td" diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp new file mode 100644 index 0000000..605ccd0 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -0,0 +1,133 @@ +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineScheduler.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-subtarget" + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "AMDGPUGenSubtargetInfo.inc" + +AMDGPUSubtarget & +AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS) { + // Determine default and user-specified characteristics + // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be + // enabled, but some instructions do not respect them and they run at the + // double precision rate, so don't enable by default. + // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. 
+ + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + FullFS += FS; + + if (GPU == "" && TT.getArch() == Triple::amdgcn) + GPU = "SI"; + + ParseSubtargetFeatures(GPU, FullFS); + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP32Denormals = false; + FP64Denormals = false; + } + return *this; +} + +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + TargetMachine &TM) + : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), + DumpCode(false), R600ALUInst(false), HasVertexCache(false), + TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), + FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), + CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), + WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), + GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), + FrameLowering(TargetFrameLowering::StackGrowsUp, + 64 * 16, // Maximum stack alignment (long16) + 0), + InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { + + initializeSubtargetDependencies(TT, GPU, FS); + + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + TLInfo.reset(new R600TargetLowering(TM, *this)); + } else { + InstrInfo.reset(new SIInstrInfo(*this)); + TLInfo.reset(new SITargetLowering(TM, *this)); + } +} + +unsigned AMDGPUSubtarget::getStackEntrySize() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + switch(getWavefrontSize()) { + case 16: + return 8; + case 32: + return hasCaymanISA() ? 4 : 8; + case 64: + return 4; + default: + llvm_unreachable("Illegal wavefront size."); + } +} + +unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { + switch(getGeneration()) { + default: llvm_unreachable("ChipID unknown"); + case SEA_ISLANDS: return 12; + } +} + +bool AMDGPUSubtarget::isVGPRSpillingEnabled( + const SIMachineFunctionInfo *MFI) const { + return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; +} + +void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (getGeneration() >= SOUTHERN_ISLANDS) { + + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us less + // register spills than just using one of these approaches on its own. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h new file mode 100644 index 0000000..0d40d14 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -0,0 +1,282 @@ +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
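A minimal standalone sketch (illustrative only, not part of this patch) of how the default feature string built in AMDGPUSubtarget::initializeSubtargetDependencies in the .cpp above composes with a user-supplied string. SubtargetFeatures entries are applied left to right, so appending the user string after the prepended defaults is what lets a user's "-fp64-denormals" override the "+fp64-denormals" default; the UserFS value below is a hypothetical example.

// Sketch only: mirrors the string composition, not the real ParseSubtargetFeatures call.
#include <iostream>
#include <string>

int main() {
  std::string DefaultFS = "+promote-alloca,+fp64-denormals,"; // prepended defaults
  std::string UserFS    = "-fp64-denormals";                  // hypothetical -mattr input
  std::string FullFS    = DefaultFS + UserFS;
  // ParseSubtargetFeatures would see
  //   "+promote-alloca,+fp64-denormals,-fp64-denormals"
  // and the later "-fp64-denormals" wins, turning the feature back off.
  std::cout << FullFS << "\n";
  return 0;
}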
+// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_SUBTARGETINFO_HEADER +#include "AMDGPUGenSubtargetInfo.inc" + +namespace llvm { + +class SIMachineFunctionInfo; + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { + +public: + enum Generation { + R600 = 0, + R700, + EVERGREEN, + NORTHERN_ISLANDS, + SOUTHERN_ISLANDS, + SEA_ISLANDS, + VOLCANIC_ISLANDS, + }; + + enum { + FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 + }; + +private: + std::string DevName; + bool Is64bit; + bool DumpCode; + bool R600ALUInst; + bool HasVertexCache; + short TexVTXClauseSize; + Generation Gen; + bool FP64; + bool FP64Denormals; + bool FP32Denormals; + bool FastFMAF32; + bool CaymanISA; + bool FlatAddressSpace; + bool EnableIRStructurizer; + bool EnablePromoteAlloca; + bool EnableIfCvt; + bool EnableLoadStoreOpt; + unsigned WavefrontSize; + bool CFALUBug; + int LocalMemorySize; + bool EnableVGPRSpilling; + bool SGPRInitBug; + bool IsGCN; + bool GCN1Encoding; + bool GCN3Encoding; + bool CIInsts; + bool FeatureDisable; + int LDSBankCount; + + AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUTargetLowering> TLInfo; + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; + InstrItineraryData InstrItins; + Triple TargetTriple; + +public: + AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + TargetMachine &TM); + AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS); + + const AMDGPUFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const AMDGPUInstrInfo *getInstrInfo() const override { + return InstrInfo.get(); + } + const AMDGPURegisterInfo *getRegisterInfo() const override { + return &InstrInfo->getRegisterInfo(); + } + AMDGPUTargetLowering *getTargetLowering() const override { + return TLInfo.get(); + } + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + bool is64bit() const { + return Is64bit; + } + + bool hasVertexCache() const { + return HasVertexCache; + } + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; + } + + Generation getGeneration() const { + return Gen; + } + + bool hasHWFP64() const { + return FP64; + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFP64Denormals() const { + return FP64Denormals; + } + + bool hasFastFMAF32() const { + return FastFMAF32; + } + + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + + bool hasBFE() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFM() const { + return hasBFE(); + } + + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + if (Size == 64) + return (getGeneration() >= SOUTHERN_ISLANDS); + + return false; + } + + bool hasMulU24() 
const { + return (getGeneration() >= EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCARRY() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + + bool IsIRStructurizerEnabled() const { + return EnableIRStructurizer; + } + + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + + bool isIfCvtEnabled() const { + return EnableIfCvt; + } + + bool loadStoreOptEnabled() const { + return EnableLoadStoreOpt; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + + unsigned getStackEntrySize() const; + + bool hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } + + bool hasSGPRInitBug() const { + return SGPRInitBug; + } + + int getLDSBankCount() const { + return LDSBankCount; + } + + unsigned getAmdKernelCodeChipID() const; + + bool enableMachineScheduler() const override { + return true; + } + + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const override; + + // Helper functions to simplify if statements + bool isTargetELF() const { + return false; + } + + StringRef getDeviceName() const { + return DevName; + } + + bool dumpCode() const { + return DumpCode; + } + bool r600ALUEncoding() const { + return R600ALUInst; + } + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } + bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + + unsigned getMaxWavesPerCU() const { + if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 10; + + // FIXME: Not sure what this is for other subtagets. + llvm_unreachable("do not know max waves per CU for this subtarget."); + } + + bool enableSubRegLiveness() const override { + return true; + } +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp new file mode 100644 index 0000000..a9a911a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -0,0 +1,292 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU target machine contains all of the hardware specific +/// information needed to emit code for R600 and SI GPUs. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "AMDGPUTargetTransformInfo.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include <llvm/CodeGen/Passes.h> + +using namespace llvm; + +extern "C" void LLVMInitializeAMDGPUTarget() { + // Register the target + RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); + RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); +} + +static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); +} + +static MachineSchedRegistry +SchedCustomRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static std::string computeDataLayout(const Triple &TT) { + std::string Ret = "e-p:32:32"; + + if (TT.getArch() == Triple::amdgcn) { + // 32-bit private, local, and region pointers. 64-bit global and constant. + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OptLevel) + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OptLevel), + TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + IntrinsicInfo() { + setRequiresStructuredCFG(true); + initAsmInfo(); +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() { + delete TLOF; +} + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// AMDGPU Pass Setup +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + 
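For reference, a minimal standalone sketch (illustrative only, not part of this patch) that mirrors computeDataLayout above and prints the full data-layout strings it produces: the r600 triple gets only the common portion, while amdgcn gets the extra address-space entries inserted after "e-p:32:32".

// Sketch only: duplicates the string-building logic so the resulting layouts are visible.
#include <iostream>
#include <string>

static std::string computeDataLayoutMirror(bool IsAMDGCN) {
  std::string Ret = "e-p:32:32";
  if (IsAMDGCN) {
    // 32-bit private, local, and region pointers. 64-bit global and constant.
    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
  }
  Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
         "-v512:512-v1024:1024-v2048:2048-n32:64";
  return Ret;
}

int main() {
  std::cout << "r600:   " << computeDataLayoutMirror(false) << "\n";
  std::cout << "amdgcn: " << computeDataLayoutMirror(true) << "\n";
  return 0;
}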
AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM<AMDGPUTargetMachine>(); + } + + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + return createR600MachineScheduler(C); + return nullptr; + } + + void addIRPasses() override; + void addCodeGenPrepare() override; + virtual bool addPreISel() override; + virtual bool addInstSelector() override; +}; + +class R600PassConfig : public AMDGPUPassConfig { +public: + R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + + bool addPreISel() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +class GCNPassConfig : public AMDGPUPassConfig { +public: + GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + bool addPreISel() override; + bool addInstSelector() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +} // End of anonymous namespace + +TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis( + [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); +} + +void AMDGPUPassConfig::addIRPasses() { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. 
+ addPass(createBarrierNoopPass()); + TargetPassConfig::addIRPasses(); +} + +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.isPromoteAllocaEnabled()) { + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + } + TargetPassConfig::addCodeGenPrepare(); +} + +bool +AMDGPUPassConfig::addPreISel() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createFlattenCFGPass()); + if (ST.IsIRStructurizerEnabled()) + addPass(createStructurizeCFGPass()); + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} + +//===----------------------------------------------------------------------===// +// R600 Pass Setup +//===----------------------------------------------------------------------===// + +bool R600PassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createR600TextureIntrinsicsReplacer()); + return false; +} + +void R600PassConfig::addPreRegAlloc() { + addPass(createR600VectorRegMerger(*TM)); +} + +void R600PassConfig::addPreSched2() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createR600EmitClauseMarkers(), false); + if (ST.isIfCvtEnabled()) + addPass(&IfConverterID, false); + addPass(createR600ClauseMergePass(*TM), false); +} + +void R600PassConfig::addPreEmitPass() { + addPass(createAMDGPUCFGStructurizerPass(), false); + addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(&FinalizeMachineBundlesID, false); + addPass(createR600Packetizer(*TM), false); + addPass(createR600ControlFlowFinalizer(*TM), false); +} + +TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { + return new R600PassConfig(this, PM); +} + +//===----------------------------------------------------------------------===// +// GCN Pass Setup +//===----------------------------------------------------------------------===// + +bool GCNPassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createSinkingPass()); + addPass(createSITypeRewriter()); + addPass(createSIAnnotateControlFlowPass()); + return false; +} + +bool GCNPassConfig::addInstSelector() { + AMDGPUPassConfig::addInstSelector(); + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(createSIFoldOperandsPass()); + return false; +} + +void GCNPassConfig::addPreRegAlloc() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + + // This needs to be run directly before register allocation because + // earlier passes might recompute live intervals. + // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass + if (getOptLevel() > CodeGenOpt::None) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); + } + + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also need extra copies to the address operand to be eliminated. 
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); + } + addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addPostRegAlloc() { + addPass(createSIPrepareScratchRegs(), false); + addPass(createSIShrinkInstructionsPass(), false); +} + +void GCNPassConfig::addPreSched2() { + addPass(createSIInsertWaits(*TM), false); +} + +void GCNPassConfig::addPreEmitPass() { + addPass(createSILowerControlFlowPass(*TM), false); +} + +TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { + return new GCNPassConfig(this, PM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h new file mode 100644 index 0000000..14792e3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -0,0 +1,89 @@ +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H + +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/IR/DataLayout.h" + +namespace llvm { + +//===----------------------------------------------------------------------===// +// AMDGPU Target Machine (R600+) +//===----------------------------------------------------------------------===// + +class AMDGPUTargetMachine : public LLVMTargetMachine { +private: + +protected: + TargetLoweringObjectFile *TLOF; + AMDGPUSubtarget Subtarget; + AMDGPUIntrinsicInfo IntrinsicInfo; + +public: + AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + + const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { + return &Subtarget; + } + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } + TargetIRAnalysis getTargetIRAnalysis() override; + + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF; + } +}; + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +class R600TargetMachine : public AMDGPUTargetMachine { + +public: + R600TargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +class GCNTargetMachine : public 
AMDGPUTargetMachine { + +public: + GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp new file mode 100644 index 0000000..6dacc74 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -0,0 +1,82 @@ +//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// AMDGPU target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetTransformInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "AMDGPUtti" + +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + UP.Threshold = 300; // Twice the default. + UP.MaxCount = UINT_MAX; + UP.Partial = true; + + // TODO: Do we want runtime unrolling? + + for (const BasicBlock *BB : L->getBlocks()) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + for (const Instruction &I : *BB) { + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); + if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + continue; + + const Value *Ptr = GEP->getPointerOperand(); + const AllocaInst *Alloca = + dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); + if (Alloca) { + // We want to do whatever we can to limit the number of alloca + // instructions that make it through to the code generator. allocas + // require us to use indirect addressing, which is slow and prone to + // compiler bugs. If this loop does an address calculation on an + // alloca ptr, then we want to use a higher than normal loop unroll + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. + UP.Threshold = 800; + } + } + } +} + +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { + if (Vec) + return 0; + + // Number of VGPRs on SI. + if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 256; + + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } + +unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Semi-arbitrary large amount. 
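As an illustration (not part of this patch), the kind of source-level loop AMDGPUTTIImpl::getUnrollingPreferences above is aimed at: a private (stack) array indexed inside a loop lowers to a GEP on an alloca in AMDGPUAS::PRIVATE_ADDRESS, and the raised threshold is meant to get such loops fully unrolled so SROA can delete the alloca. The function name and array size below are hypothetical.

// Sketch only: stand-in kernel-side code, not taken from the patch.
float dot8(const float *x) {
  float tmp[8];                  // becomes an alloca in the private address space
  for (int i = 0; i < 8; ++i)
    tmp[i] = x[i] * x[i];        // GEP on the alloca inside the loop body
  float acc = 0.0f;
  for (int i = 0; i < 8; ++i)
    acc += tmp[i];               // once fully unrolled, every index is a constant,
                                 // so SROA can replace 'tmp' with scalar values
  return acc;
}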
+ return 64; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h new file mode 100644 index 0000000..791c84e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -0,0 +1,78 @@ +//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// AMDGPU target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { + typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AMDGPUSubtarget *ST; + const AMDGPUTargetLowering *TLI; + + const AMDGPUSubtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) + : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { + BaseT::operator=(static_cast<const BaseT &>(RHS)); + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } + + bool hasBranchDivergence() { return true; } + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; + } + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp new file mode 100644 index 0000000..c9b25a1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -0,0 +1,1912 @@ +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//==-----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include <deque> + +using namespace llvm; + +#define DEBUG_TYPE "structcfg" + +#define DEFAULT_VEC_SLOTS 8 + +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +namespace llvm { + void initializeAMDGPUCFGStructurizerPass(PassRegistry&); +} + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. 
+// +//===----------------------------------------------------------------------===// +namespace { +#define SHOWNEWINSTR(i) \ + DEBUG(dbgs() << "New instr: " << *i << "\n"); + +#define SHOWNEWBLK(b, msg) \ +DEBUG( \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + dbgs() << "\n"; \ +); + +#define SHOWBLK_DETAIL(b, msg) \ +DEBUG( \ + if (b) { \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(dbgs()); \ + dbgs() << "\n"; \ + } \ +); + +#define INVALIDSCCNUM -1 + +template<class NodeT> +void ReverseVector(SmallVectorImpl<NodeT *> &Src) { + size_t sz = Src.size(); + for (size_t i = 0; i < sz/2; ++i) { + NodeT *t = Src[i]; + Src[i] = Src[sz - i - 1]; + Src[sz - i - 1] = t; + } +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + + +namespace { + +class BlockInformation { +public: + bool IsRetired; + int SccNum; + BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUCFGStructurizer : public MachineFunctionPass { +public: + typedef SmallVector<MachineBasicBlock *, 32> MBBVector; + typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap; + typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap; + + enum PathToKind { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + }; + + static char ID; + + AMDGPUCFGStructurizer() : + MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { + initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AMDGPU Control Flow Graph structurizer Pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + } + + /// Perform the CFG structurization + bool run(); + + /// Perform the CFG preparation + /// This step will remove every unconditionnal/dead jump instructions and make + /// sure all loops have an exit block + bool prepare(); + + bool runOnMachineFunction(MachineFunction &MF) override { + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = &TII->getRegisterInfo(); + DEBUG(MF.dump();); + OrderedBlks.clear(); + Visited.clear(); + FuncRep = &MF; + MLI = &getAnalysis<MachineLoopInfo>(); + DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); + MDT = &getAnalysis<MachineDominatorTree>(); + DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); + PDT = &getAnalysis<MachinePostDominatorTree>(); + DEBUG(PDT->print(dbgs());); + prepare(); + run(); + DEBUG(MF.dump();); + return true; + } + +protected: + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; + MachineLoopInfo *MLI; + const R600InstrInfo *TII; + const AMDGPURegisterInfo *TRI; + + // PRINT FUNCTIONS + /// Print the ordered Blocks. 
+ void printOrderedBlocks() const { + size_t i = 0; + for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), + iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { + dbgs() << "BB" << (*iterBlk)->getNumber(); + dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; + if (i != 0 && i % 10 == 0) { + dbgs() << "\n"; + } else { + dbgs() << " "; + } + } + } + static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { + for (MachineLoop::iterator iter = LoopInfo.begin(), + iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { + (*iter)->print(dbgs(), 0); + } + } + + // UTILITY FUNCTIONS + int getSCCNum(MachineBasicBlock *MBB) const; + MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; + bool hasBackEdge(MachineBasicBlock *MBB) const; + static unsigned getLoopDepth(MachineLoop *LoopRep); + bool isRetiredBlock(MachineBasicBlock *MBB) const; + bool isActiveLoophead(MachineBasicBlock *MBB) const; + PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry = true) const; + int countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const; + bool needMigrateBlock(MachineBasicBlock *MBB) const; + + // Utility Functions + void reversePredicateSetter(MachineBasicBlock::iterator I); + /// Compute the reversed DFS post order of Blocks + void orderBlocks(MachineFunction *MF); + + // Function originally from CFGStructTraits + void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); + void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, + DebugLoc DL); + void insertCondBranchBefore(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL); + void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); + static int getBranchNzeroOpcode(int OldOpcode); + static int getBranchZeroOpcode(int OldOpcode); + static int getContinueNzeroOpcode(int OldOpcode); + static int getContinueZeroOpcode(int OldOpcode); + static MachineBasicBlock *getTrueBranch(MachineInstr *MI); + static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); + static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI); + static bool isCondBranch(MachineInstr *MI); + static bool isUncondBranch(MachineInstr *MI); + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); + /// The correct naming for this is getPossibleLoopendBlockBranchInstr. + /// + /// BB with backward-edge could have move instructions after the branch + /// instruction. Such move instruction "belong to" the loop backward-edge. 
+ MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); + static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); + static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); + static bool isReturnBlock(MachineBasicBlock *MBB); + static void cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) ; + static MachineBasicBlock *clone(MachineBasicBlock *MBB); + /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose + /// because the AMDGPU instruction is not recognized as terminator fix this + /// and retire this routine + void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, + MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); + static void wrapup(MachineBasicBlock *MBB); + + + int patternMatch(MachineBasicBlock *MBB); + int patternMatchGroup(MachineBasicBlock *MBB); + int serialPatternMatch(MachineBasicBlock *MBB); + int ifPatternMatch(MachineBasicBlock *MBB); + int loopendPatternMatch(); + int mergeLoop(MachineLoop *LoopRep); + int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); + + void handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop); + /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in + /// the same loop with LoopLandInfo without explicitly keeping track of + /// loopContBlks and loopBreakBlks, this is a method to get the information. + bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB, + MachineBasicBlock *Src2MBB); + int handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr); + void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock *LandMBB, bool Detail = false); + int cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB); + void mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB); + + void mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB); + void mergeLooplandBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *LandMBB); + void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB); + void settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB); + /// normalizeInfiniteLoopExit change + /// B1: + /// uncond_br LoopHeader + /// + /// to + /// B1: + /// cond_br 1 LoopHeader dummyExit + /// and return the newly added dummy exit block + MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); + void removeUnconditionalBranch(MachineBasicBlock *MBB); + /// Remove duplicate branches instructions in a block. 
+ /// For instance + /// B0: + /// cond_br X B1 B2 + /// cond_br X B1 B2 + /// is transformed to + /// B0: + /// cond_br X B1 B2 + void removeRedundantConditionalBranch(MachineBasicBlock *MBB); + void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB); + void removeSuccessor(MachineBasicBlock *MBB); + MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB); + void migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); + void recordSccnum(MachineBasicBlock *MBB, int SCCNum); + void retireBlock(MachineBasicBlock *MBB); + void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); + + MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); + /// This is work around solution for findNearestCommonDominator not available + /// to post dom a proper fix should go to Dominators.h. + MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2); + +private: + MBBInfoMap BlockInfoMap; + LoopLandInfoMap LLInfoMap; + std::map<MachineLoop *, bool> Visited; + MachineFunction *FuncRep; + SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks; +}; + +int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return INVALIDSCCNUM; + return (*It).second->SccNum; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) + const { + LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); + if (It == LLInfoMap.end()) + return nullptr; + return (*It).second; +} + +bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + if (!LoopRep) + return false; + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + return MBB->isSuccessor(LoopHeader); +} + +unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { + return LoopRep ? 
LoopRep->getLoopDepth() : 0; +} + +bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return false; + return (*It).second->IsRetired; +} + +bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + while (LoopRep && LoopRep->getHeader() == MBB) { + MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); + if(!LoopLand) + return true; + if (!isRetiredBlock(LoopLand)) + return true; + LoopRep = LoopRep->getParentLoop(); + } + return false; +} +AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry) const { + assert(DstMBB); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + while (SrcMBB && SrcMBB->succ_size() == 1) { + SrcMBB = *SrcMBB->succ_begin(); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + if (!AllowSideEntry && SrcMBB->pred_size() > 1) + return Not_SinglePath; + } + if (SrcMBB && SrcMBB->succ_size()==0) + return SinglePath_NotInPath; + return Not_SinglePath; +} + +int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const { + int Count = 0; + while (It != E) { + if (!isRetiredBlock(*It)) + ++Count; + ++It; + } + return Count; +} + +bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { + unsigned BlockSizeThreshold = 30; + unsigned CloneInstrThreshold = 100; + bool MultiplePreds = MBB && (MBB->pred_size() > 1); + + if(!MultiplePreds) + return false; + unsigned BlkSize = MBB->size(); + return ((BlkSize > BlockSizeThreshold) && + (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); +} + +void AMDGPUCFGStructurizer::reversePredicateSetter( + MachineBasicBlock::iterator I) { + while (I--) { + if (I->getOpcode() == AMDGPU::PRED_X) { + switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) { + case OPCODE_IS_ZERO_INT: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO_INT); + return; + case OPCODE_IS_NOT_ZERO_INT: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO_INT); + return; + case OPCODE_IS_ZERO: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO); + return; + case OPCODE_IS_NOT_ZERO: + static_cast<MachineInstr *>(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO); + return; + default: + llvm_unreachable("PRED_X Opcode invalid!"); + } + } + } +} + +void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = MBB->getParent() + ->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->push_back(MI); + //assume the instruction doesn't take any reg operand ... + SHOWNEWINSTR(MI); +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); + if (MBB->begin() != MBB->end()) + MBB->insert(MBB->begin(), MI); + else + MBB->push_back(MI); + SHOWNEWINSTR(MI); + return MI; +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( + MachineBasicBlock::iterator I, int NewOpcode) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineInstr *NewMBB = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->insert(I, NewMBB); + //assume the instruction doesn't take any reg operand ... 
+ SHOWNEWINSTR(NewMBB); + return NewMBB; +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->insert(I, NewMI); + MachineInstrBuilder MIB(*MF, NewMI); + MIB.addReg(OldMI->getOperand(1).getReg(), false); + SHOWNEWINSTR(NewMI); + //erase later oldInstr->eraseFromParent(); +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL) { + MachineFunction *MF = blk->getParent(); + MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + //insert before + blk->insert(I, NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, + int NewOpcode, int RegNum) { + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewInstr = + MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->push_back(NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + default: llvm_unreachable("internal error"); + }; + return -1; +} + +int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { + return MI->getOperand(0).getMBB(); +} + +void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, + MachineBasicBlock *MBB) { + MI->getOperand(0).setMBB(MBB); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI) { + assert(MBB->succ_size() == 2); + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + MachineBasicBlock::succ_iterator It = MBB->succ_begin(); + MachineBasicBlock::succ_iterator Next = It; + ++Next; + return (*It == TrueBranch) ? 
*Next : *It; +} + +bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP_COND: + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return true; + default: + return false; + } + return false; +} + +bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP: + case AMDGPU::BRANCH: + return true; + default: + return false; + } + return false; +} + +DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); + ++It) { + MachineInstr *instr = &(*It); + if (instr->getDebugLoc()) + DL = instr->getDebugLoc(); + } + return DL; +} + +MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( + MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + MachineInstr *MI = &*It; + if (MI && (isCondBranch(MI) || isUncondBranch(MI))) + return MI; + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( + MachineBasicBlock *MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); + It != E; ++It) { + // FIXME: Simplify + MachineInstr *MI = &*It; + if (MI) { + if (isCondBranch(MI) || isUncondBranch(MI)) + return MI; + else if (!TII->isMov(MI->getOpcode())) + break; + } + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *instr = &(*It); + if (instr->getOpcode() == AMDGPU::RETURN) + return instr; + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *MI = &(*It); + if (MI->getOpcode() == AMDGPU::CONTINUE) + return MI; + } + return nullptr; +} + +bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { + MachineInstr *MI = getReturnInstr(MBB); + bool IsReturn = (MBB->succ_size() == 0); + if (MI) + assert(IsReturn); + else if (IsReturn) + DEBUG( + dbgs() << "BB" << MBB->getNumber() + <<" is return block without RETURN instr\n";); + return IsReturn; +} + +void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), + iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) + DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of +} + +MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { + MachineFunction *Func = MBB->getParent(); + MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); + Func->push_back(NewMBB); //insert to function + for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); + It != E; ++It) { + MachineInstr *MI = Func->CloneMachineInstr(It); + NewMBB->push_back(MI); + } + return NewMBB; +} + +void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( + MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, + MachineBasicBlock *NewBlk) { + MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); + if (BranchMI && isCondBranch(BranchMI) && + getTrueBranch(BranchMI) == OldMBB) + setTrueBranch(BranchMI, NewBlk); +} + +void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { + assert((!MBB->getParent()->getJumpTableInfo() + || 
MBB->getParent()->getJumpTableInfo()->isEmpty()) + && "found a jump table"); + + //collect continue right before endloop + SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr; + MachineBasicBlock::iterator Pre = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator It = Pre; + while (It != E) { + if (Pre->getOpcode() == AMDGPU::CONTINUE + && It->getOpcode() == AMDGPU::ENDLOOP) + ContInstr.push_back(Pre); + Pre = It; + ++It; + } + + //delete continue right before endloop + for (unsigned i = 0; i < ContInstr.size(); ++i) + ContInstr[i]->eraseFromParent(); + + // TODO to fix up jump table so later phase won't be confused. if + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but + // there isn't such an interface yet. alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + +} + + +bool AMDGPUCFGStructurizer::prepare() { + bool Changed = false; + + //FIXME: if not reducible flow graph, make it so ??? + + DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + + orderBlocks(FuncRep); + + SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks; + + // Add an ExitBlk to loop that don't have one + for (MachineLoopInfo::iterator It = MLI->begin(), + E = MLI->end(); It != E; ++It) { + MachineLoop *LoopRep = (*It); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + + if (ExitingMBBs.size() == 0) { + MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); + if (DummyExitBlk) + RetBlks.push_back(DummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + for (SmallVectorImpl<MachineBasicBlock *>::const_iterator + It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + removeUnconditionalBranch(MBB); + removeRedundantConditionalBranch(MBB); + if (isReturnBlock(MBB)) { + RetBlks.push_back(MBB); + } + assert(MBB->succ_size() <= 2); + } + + if (RetBlks.size() >= 2) { + addDummyExitBlock(RetBlks); + Changed = true; + } + + return Changed; +} + +bool AMDGPUCFGStructurizer::run() { + + //Assume reducible CFG... + DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. + ReverseVector(orderedBlks); +#endif + + DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); + int NumIter = 0; + bool Finish = false; + MachineBasicBlock *MBB; + bool MakeProgress = false; + int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), + OrderedBlks.end()); + + do { + ++NumIter; + DEBUG( + dbgs() << "numIter = " << NumIter + << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; + ); + + SmallVectorImpl<MachineBasicBlock *>::const_iterator It = + OrderedBlks.begin(); + SmallVectorImpl<MachineBasicBlock *>::const_iterator E = + OrderedBlks.end(); + + SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter = + It; + MachineBasicBlock *SccBeginMBB = nullptr; + int SccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int SccNumIter; // Number of iteration in this SCC. + + while (It != E) { + MBB = *It; + + if (!SccBeginMBB) { + SccBeginIter = It; + SccBeginMBB = MBB; + SccNumIter = 0; + SccNumBlk = NumRemainedBlk; // Init to maximum possible number. 
+ DEBUG( + dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); + dbgs() << "\n"; + ); + } + + if (!isRetiredBlock(MBB)) + patternMatch(MBB); + + ++It; + + bool ContNextScc = true; + if (It == E + || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { + // Just finish one scc. + ++SccNumIter; + int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { + DEBUG( + dbgs() << "Can't reduce SCC " << getSCCNum(MBB) + << ", sccNumIter = " << SccNumIter; + dbgs() << "doesn't make any progress\n"; + ); + ContNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { + SccNumBlk = sccRemainedNumBlk; + It = SccBeginIter; + ContNextScc = false; + DEBUG( + dbgs() << "repeat processing SCC" << getSCCNum(MBB) + << "sccNumIter = " << SccNumIter << '\n'; + ); + } else { + // Finish the current scc. + ContNextScc = true; + } + } else { + // Continue on next component in the current scc. + ContNextScc = false; + } + + if (ContNextScc) + SccBeginMBB = nullptr; + } //while, "one iteration" over the function. + + MachineBasicBlock *EntryMBB = + GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + if (EntryMBB->succ_size() == 0) { + Finish = true; + DEBUG( + dbgs() << "Reduce to one block\n"; + ); + } else { + int NewnumRemainedBlk + = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); + // consider cloned blocks ?? + if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { + MakeProgress = true; + NumRemainedBlk = NewnumRemainedBlk; + } else { + MakeProgress = false; + DEBUG( + dbgs() << "No progress\n"; + ); + } + } + } while (!Finish && MakeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + + // Detach retired Block, release memory. + for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); + It != E; ++It) { + if ((*It).second && (*It).second->IsRetired) { + assert(((*It).first)->getNumber() != -1); + DEBUG( + dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; + ); + (*It).first->eraseFromParent(); //Remove from the parent Function. 
+ } + delete (*It).second; + } + BlockInfoMap.clear(); + LLInfoMap.clear(); + + if (!Finish) { + DEBUG(FuncRep->viewCFG()); + llvm_unreachable("IRREDUCIBLE_CFG"); + } + + return true; +} + + + +void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { + int SccNum = 0; + MachineBasicBlock *MBB; + for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd(); + ++It, ++SccNum) { + const std::vector<MachineBasicBlock *> &SccNext = *It; + for (std::vector<MachineBasicBlock *>::const_iterator + blockIter = SccNext.begin(), blockEnd = SccNext.end(); + blockIter != blockEnd; ++blockIter) { + MBB = *blockIter; + OrderedBlks.push_back(MBB); + recordSccnum(MBB, SccNum); + } + } + + //walk through all the block in func to check for unreachable + typedef GraphTraits<MachineFunction *> GTM; + MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); + for (; It != E; ++It) { + MachineBasicBlock *MBB = &(*It); + SccNum = getSCCNum(MBB); + if (SccNum == INVALIDSCCNUM) + dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; + } +} + +int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { + int NumMatch = 0; + int CurMatch; + + DEBUG( + dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; + ); + + while ((CurMatch = patternMatchGroup(MBB)) > 0) + NumMatch += CurMatch; + + DEBUG( + dbgs() << "End patternMatch BB" << MBB->getNumber() + << ", numMatch = " << NumMatch << "\n"; + ); + + return NumMatch; +} + +int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { + int NumMatch = 0; + NumMatch += loopendPatternMatch(); + NumMatch += serialPatternMatch(MBB); + NumMatch += ifPatternMatch(MBB); + return NumMatch; +} + + +int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { + if (MBB->succ_size() != 1) + return 0; + + MachineBasicBlock *childBlk = *MBB->succ_begin(); + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) + return 0; + + mergeSerialBlock(MBB, childBlk); + ++numSerialPatternMatch; + return 1; +} + +int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { + //two edges + if (MBB->succ_size() != 2) + return 0; + if (hasBackEdge(MBB)) + return 0; + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + if (!BranchMI) + return 0; + + assert(isCondBranch(BranchMI)); + int NumMatch = 0; + + MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); + NumMatch += serialPatternMatch(TrueMBB); + NumMatch += ifPatternMatch(TrueMBB); + MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); + NumMatch += serialPatternMatch(FalseMBB); + NumMatch += ifPatternMatch(FalseMBB); + MachineBasicBlock *LandBlk; + int Cloned = 0; + + assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); + // TODO: Simplify + if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 + && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { + // Diamond pattern + LandBlk = *TrueMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { + // Triangle pattern, false is empty + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && *FalseMBB->succ_begin() == TrueMBB) { + // Triangle pattern, true is empty + // We reverse the predicate to make a triangle, empty false pattern; + std::swap(TrueMBB, FalseMBB); + reversePredicateSetter(MBB->end()); + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { + LandBlk = *FalseMBB->succ_begin(); + } else if 
(TrueMBB->succ_size() == 1 + && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { + LandBlk = *TrueMBB->succ_begin(); + } else { + return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); + } + + // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the + // new BB created for landBlk==NULL may introduce new challenge to the + // reduction process. + if (LandBlk && + ((TrueMBB && TrueMBB->pred_size() > 1) + || (FalseMBB && FalseMBB->pred_size() > 1))) { + Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); + } + + if (TrueMBB && TrueMBB->pred_size() > 1) { + TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); + ++Cloned; + } + + if (FalseMBB && FalseMBB->pred_size() > 1) { + FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); + ++Cloned; + } + + mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); + + ++numIfPatternMatch; + + numClonedBlock += Cloned; + + return 1 + Cloned + NumMatch; +} + +int AMDGPUCFGStructurizer::loopendPatternMatch() { + std::deque<MachineLoop *> NestedLoops; + for (auto &It: *MLI) + for (MachineLoop *ML : depth_first(It)) + NestedLoops.push_front(ML); + + if (NestedLoops.size() == 0) + return 0; + + // Process nested loop outside->inside (we did push_front), + // so "continue" to a outside loop won't be mistaken as "break" + // of the current loop. + int Num = 0; + for (MachineLoop *ExaminedLoop : NestedLoops) { + if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) + continue; + DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); + int NumBreak = mergeLoop(ExaminedLoop); + if (NumBreak == -1) + break; + Num += NumBreak; + } + return Num; +} + +int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); + DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); + // We assume a single ExitBlk + MBBVector ExitBlks; + LoopRep->getExitBlocks(ExitBlks); + SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet; + for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) + ExitBlkSet.insert(ExitBlks[i]); + assert(ExitBlkSet.size() == 1); + MachineBasicBlock *ExitBlk = *ExitBlks.begin(); + assert(ExitBlk && "Loop has several exit block"); + MBBVector LatchBlks; + typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits; + InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), + PE = InvMBBTraits::child_end(LoopHeader); + for (; PI != PE; PI++) { + if (LoopRep->contains(*PI)) + LatchBlks.push_back(*PI); + } + + for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) + mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); + for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) + settleLoopcontBlock(LatchBlks[i], LoopHeader); + int Match = 0; + do { + Match = 0; + Match += serialPatternMatch(LoopHeader); + Match += ifPatternMatch(LoopHeader); + } while (Match > 0); + mergeLooplandBlock(LoopHeader, ExitBlk); + MachineLoop *ParentLoop = LoopRep->getParentLoop(); + if (ParentLoop) + MLI->changeLoopFor(LoopHeader, ParentLoop); + else + MLI->removeBlock(LoopHeader); + Visited[LoopRep] = true; + return 1; +} + +int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, + MachineBasicBlock *LoopHeader) { + int NumCont = 0; + SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB; + typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM; + GTIM::ChildIteratorType It = 
GTIM::child_begin(LoopHeader), + E = GTIM::child_end(LoopHeader); + for (; It != E; ++It) { + MachineBasicBlock *MBB = *It; + if (LoopRep->contains(MBB)) { + handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), + LoopHeader, LoopRep); + ContMBB.push_back(MBB); + ++NumCont; + } + } + + for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), + E = ContMBB.end(); It != E; ++It) { + (*It)->removeSuccessor(LoopHeader); + } + + numLoopcontPatternMatch += NumCont; + + return NumCont; +} + + +bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( + MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { + if (Src1MBB->succ_size() == 0) { + MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); + if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { + MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; + if (TheEntry) { + DEBUG( + dbgs() << "isLoopContBreakBlock yes src1 = BB" + << Src1MBB->getNumber() + << " src2 = BB" << Src2MBB->getNumber() << "\n"; + ); + return true; + } + } + } + return false; +} + +int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); + if (Num == 0) { + DEBUG( + dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + ); + Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); + } + return Num; +} + +int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = 0; + MachineBasicBlock *DownBlk; + + //trueBlk could be the common post dominator + DownBlk = TrueMBB; + + DEBUG( + dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() + << " true = BB" << TrueMBB->getNumber() + << ", numSucc=" << TrueMBB->succ_size() + << " false = BB" << FalseMBB->getNumber() << "\n"; + ); + + while (DownBlk) { + DEBUG( + dbgs() << "check down = BB" << DownBlk->getNumber(); + ); + + if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { + DEBUG( + dbgs() << " working\n"; + ); + + Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); + Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); + + numClonedBlock += Num; + Num += serialPatternMatch(*HeadMBB->succ_begin()); + Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); + Num += ifPatternMatch(HeadMBB); + assert(Num > 0); + + break; + } + DEBUG( + dbgs() << " not working\n"; + ); + DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; + } // walk down the postDomTree + + return Num; +} + +void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( + MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { + dbgs() << "head = BB" << HeadMBB->getNumber() + << " size = " << HeadMBB->size(); + if (Detail) { + dbgs() << "\n"; + HeadMBB->print(dbgs()); + dbgs() << "\n"; + } + + if (TrueMBB) { + dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " + << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + TrueMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (FalseMBB) { + dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " + << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + FalseMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (LandMBB) { + dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " + << LandMBB->size() << " numPred = " << LandMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + LandMBB->print(dbgs()); + dbgs() << "\n"; + } + } + + dbgs() << "\n"; +} + +int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr) { + bool MigrateTrue = false; + bool MigrateFalse = false; + + MachineBasicBlock *LandBlk = *LandMBBPtr; + + assert((!TrueMBB || TrueMBB->succ_size() <= 1) + && (!FalseMBB || FalseMBB->succ_size() <= 1)); + + if (TrueMBB == FalseMBB) + return 0; + + MigrateTrue = needMigrateBlock(TrueMBB); + MigrateFalse = needMigrateBlock(FalseMBB); + + if (!MigrateTrue && !MigrateFalse) + return 0; + + // If we need to migrate either trueBlk and falseBlk, migrate the rest that + // have more than one predecessors. without doing this, its predecessor + // rather than headBlk will have undefined value in initReg. + if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) + MigrateTrue = true; + if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) + MigrateFalse = true; + + DEBUG( + dbgs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk + // + // new: headBlk => if () {initReg = 1; org trueBlk branch} else + // {initReg = 0; org falseBlk branch } + // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} + // => org landBlk + // if landBlk->pred_size() > 2, put the about if-else inside + // if (initReg !=2) {...} + // + // add initReg = initVal to headBlk + + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (!MigrateTrue || !MigrateFalse) { + // XXX: We have an opportunity here to optimize the "branch into if" case + // here. Branch into if looks like this: + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true + // \ / + // done + // + // The diamond_head block begins the "if" and the diamond_true block + // is the block being "branched into". + // + // If MigrateTrue is true, then TrueBB is the block being "branched into" + // and if MigrateFalse is true, then FalseBB is the block being + // "branched into" + // + // Here is the pseudo code for how I think the optimization should work: + // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. + // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. + // 3. 
Move the branch instruction from diamond_head into its own basic + // block (new_block). + // 4. Add an unconditional branch from diamond_head to new_block + // 5. Replace the branch instruction in branch_from with an unconditional + // branch to new_block. If branch_from has multiple predecessors, then + // we need to replace the True/False block in the branch + // instruction instead of replacing it. + // 6. Change the condition of the branch instruction in new_block from + // COND to (COND || GPR0) + // + // In order insert these MOV instruction, we will need to use the + // RegisterScavenger. Usually liveness stops being tracked during + // the late machine optimization passes, however if we implement + // bool TargetRegisterInfo::requiresRegisterScavenging( + // const MachineFunction &MF) + // and have it return true, liveness will be tracked correctly + // by generic optimization passes. We will also need to make sure that + // all of our target-specific passes that run after regalloc and before + // the CFGStructurizer track liveness and we will need to modify this pass + // to correctly track liveness. + // + // After the above changes, the new CFG should look like this: + // entry + // / | + // diamond_head branch_from + // \ / + // new_block + // / | + // diamond_false diamond_true + // \ / + // done + // + // Without this optimization, we are forced to duplicate the diamond_true + // block and we will end up with a CFG like this: + // + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true diamond_true (duplicate) + // \ / | + // done --------------------| + // + // Duplicating diamond_true can be very costly especially if it has a + // lot of instructions. + return 0; + } + + int NumNewBlk = 0; + + bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); + + //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" + MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); + + if (LandBlkHasOtherPred) { + llvm_unreachable("Extra register needed to handle CFG"); + unsigned CmpResReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + llvm_unreachable("Extra compare instruction needed to handle CFG"); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, + CmpResReg, DebugLoc()); + } + + // XXX: We are running this after RA, so creating virtual registers will + // cause an assertion failure in the PostRA scheduling pass. + unsigned InitReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, + DebugLoc()); + + if (MigrateTrue) { + migrateInstruction(TrueMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 1). 
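As a purely source-level illustration of the org/new rewrite described in the comment at the top of improveSimpleJumpintoIf; initReg, trueBlk, falseBlk and the helper functions follow the comment's naming and are not real APIs:

// Before: headBlk => if () {trueBlk} else {falseBlk} => landBlk
// After:  headBlk only sets initReg; the land block re-dispatches on it.
#include <cstdio>

static void trueBlk()  { std::puts("true body");  }
static void falseBlk() { std::puts("false body"); }
static void landBlk()  { std::puts("land"); }

static void beforeTransform(bool Cond) {
  if (Cond) trueBlk(); else falseBlk();
  landBlk();
}

static void afterTransform(bool Cond) {
  int initReg = Cond ? 1 : 0;                // set in the original branches
  if (initReg) trueBlk(); else falseBlk();   // migrated into the land block
  landBlk();
}

int main() {
  beforeTransform(true);
  afterTransform(true);
}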
+ llvm_unreachable("Extra register needed to handle CFG"); + } + insertInstrBefore(I, AMDGPU::ELSE); + + if (MigrateFalse) { + migrateInstruction(FalseMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 0) + llvm_unreachable("Extra register needed to handle CFG"); + } + + if (LandBlkHasOtherPred) { + // add endif + insertInstrBefore(I, AMDGPU::ENDIF); + + // put initReg = 2 to other predecessors of landBlk + for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), + PE = LandBlk->pred_end(); PI != PE; ++PI) { + MachineBasicBlock *MBB = *PI; + if (MBB != TrueMBB && MBB != FalseMBB) + llvm_unreachable("Extra register needed to handle CFG"); + } + } + DEBUG( + dbgs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // update landBlk + *LandMBBPtr = LandBlk; + + return NumNewBlk; +} + +void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop) { + DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() + << " header = BB" << ContMBB->getNumber() << "\n"; + dbgs() << "Trying to continue loop-depth = " + << getLoopDepth(ContLoop) + << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); + settleLoopcontBlock(ContingMBB, ContMBB); +} + +void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + DEBUG( + dbgs() << "serialPattern BB" << DstMBB->getNumber() + << " <= BB" << SrcMBB->getNumber() << "\n"; + ); + DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); + + DstMBB->removeSuccessor(SrcMBB); + cloneSuccessorList(DstMBB, SrcMBB); + + removeSuccessor(SrcMBB); + MLI->removeBlock(SrcMBB); + retireBlock(SrcMBB); +} + +void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { + assert (TrueMBB); + DEBUG( + dbgs() << "ifPattern BB" << MBB->getNumber(); + dbgs() << "{ "; + if (TrueMBB) { + dbgs() << "BB" << TrueMBB->getNumber(); + } + dbgs() << " } else "; + dbgs() << "{ "; + if (FalseMBB) { + dbgs() << "BB" << FalseMBB->getNumber(); + } + dbgs() << " }\n "; + dbgs() << "landBlock: "; + if (!LandMBB) { + dbgs() << "NULL"; + } else { + dbgs() << "BB" << LandMBB->getNumber(); + } + dbgs() << "\n"; + ); + + int OldOpcode = BranchMI->getOpcode(); + DebugLoc BranchDL = BranchMI->getDebugLoc(); + +// transform to +// if cond +// trueBlk +// else +// falseBlk +// endif +// landBlk + + MachineBasicBlock::iterator I = BranchMI; + insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), + BranchDL); + + if (TrueMBB) { + MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); + MBB->removeSuccessor(TrueMBB); + if (LandMBB && TrueMBB->succ_size()!=0) + TrueMBB->removeSuccessor(LandMBB); + retireBlock(TrueMBB); + MLI->removeBlock(TrueMBB); + } + + if (FalseMBB) { + insertInstrBefore(I, AMDGPU::ELSE); + MBB->splice(I, FalseMBB, FalseMBB->begin(), + FalseMBB->end()); + MBB->removeSuccessor(FalseMBB); + if (LandMBB && FalseMBB->succ_size() != 0) + FalseMBB->removeSuccessor(LandMBB); + retireBlock(FalseMBB); + MLI->removeBlock(FalseMBB); + } + insertInstrBefore(I, AMDGPU::ENDIF); + + BranchMI->eraseFromParent(); + + if (LandMBB && TrueMBB && FalseMBB) + MBB->addSuccessor(LandMBB); + +} + +void 
AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + + insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); + insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); + DstBlk->addSuccessor(LandMBB); + DstBlk->removeSuccessor(DstBlk); +} + + +void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); + assert(BranchMI && isCondBranch(BranchMI)); + DebugLoc DL = BranchMI->getDebugLoc(); + MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); + MachineBasicBlock::iterator I = BranchMI; + if (TrueBranch != LandMBB) + reversePredicateSetter(I); + insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); + insertInstrBefore(I, AMDGPU::BREAK); + insertInstrBefore(I, AMDGPU::ENDIF); + //now branchInst can be erase safely + BranchMI->eraseFromParent(); + //now take care of successors, retire blocks + ExitingMBB->removeSuccessor(LandMBB); +} + +void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB) { + DEBUG(dbgs() << "settleLoopcontBlock conting = BB" + << ContingMBB->getNumber() + << ", cont = BB" << ContMBB->getNumber() << "\n";); + + MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); + if (MI) { + assert(isCondBranch(MI)); + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + int OldOpcode = MI->getOpcode(); + DebugLoc DL = MI->getDebugLoc(); + + bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); + + if (!UseContinueLogical) { + int BranchOpcode = + TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : + getBranchZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); + insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); + } else { + int BranchOpcode = + TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : + getContinueZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + } + + MI->eraseFromParent(); + } else { + // if we've arrived here then we've already erased the branch instruction + // travel back up the basic block to see the last reference of our debug + // location we've just inserted that reference here so it should be + // representative insertEnd to ensure phi-moves, if exist, go before the + // continue-instr. 
+ insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, + getLastDebugLocInBB(ContingMBB)); + } +} + +int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { + int Cloned = 0; + assert(PreMBB->isSuccessor(SrcMBB)); + while (SrcMBB && SrcMBB != DstMBB) { + assert(SrcMBB->succ_size() == 1); + if (SrcMBB->pred_size() > 1) { + SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); + ++Cloned; + } + + PreMBB = SrcMBB; + SrcMBB = *SrcMBB->succ_begin(); + } + + return Cloned; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB) { + assert(PredMBB->isSuccessor(MBB) && + "succBlk is not a prececessor of curBlk"); + + MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions + replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); + //srcBlk, oldBlk, newBlk + + PredMBB->removeSuccessor(MBB); + PredMBB->addSuccessor(CloneMBB); + + // add all successor to cloneBlk + cloneSuccessorList(CloneMBB, MBB); + + numClonedInstr += MBB->size(); + + DEBUG( + dbgs() << "Cloned block: " << "BB" + << MBB->getNumber() << "size " << MBB->size() << "\n"; + ); + + SHOWNEWBLK(CloneMBB, "result of Cloned block: "); + + return CloneMBB; +} + +void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator SpliceEnd; + //look for the input branchinstr, not the AMDGPU branchinstr + MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); + if (!BranchMI) { + DEBUG( + dbgs() << "migrateInstruction don't see branch instr\n" ; + ); + SpliceEnd = SrcMBB->end(); + } else { + DEBUG( + dbgs() << "migrateInstruction see branch instr\n" ; + BranchMI->dump(); + ); + SpliceEnd = BranchMI; + } + DEBUG( + dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << "\n"; + ); + + //splice insert before insertPos + DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); + + DEBUG( + dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << "\n"; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + + if (!LoopHeader || !LoopLatch) + return nullptr; + MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); + // Is LoopRep an infinite loop ? 
+ if (!BranchMI || !isUncondBranch(BranchMI)) + return nullptr; + + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); + MachineBasicBlock::iterator I = BranchMI; + unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); + llvm_unreachable("Extra register needed to handle CFG"); + MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); + MachineInstrBuilder MIB(*FuncRep, NewMI); + MIB.addMBB(LoopHeader); + MIB.addReg(ImmReg, false); + SHOWNEWINSTR(NewMI); + BranchMI->eraseFromParent(); + LoopLatch->addSuccessor(DummyExitBlk); + + return DummyExitBlk; +} + +void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { + MachineInstr *BranchMI; + + // I saw two unconditional branch in one basic block in example + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. + while ((BranchMI = getLoopendBlockBranchInstr(MBB)) + && isUncondBranch(BranchMI)) { + DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + BranchMI->eraseFromParent(); + } +} + +void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( + MachineBasicBlock *MBB) { + if (MBB->succ_size() != 2) + return; + MachineBasicBlock *MBB1 = *MBB->succ_begin(); + MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); + if (MBB1 != MBB2) + return; + + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + assert(BranchMI && isCondBranch(BranchMI)); + DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + BranchMI->eraseFromParent(); + SHOWNEWBLK(MBB1, "Removing redundant successor"); + MBB->removeSuccessor(MBB1); +} + +void AMDGPUCFGStructurizer::addDummyExitBlock( + SmallVectorImpl<MachineBasicBlock*> &RetMBB) { + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); + + for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(), + E = RetMBB.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + MachineInstr *MI = getReturnInstr(MBB); + if (MI) + MI->eraseFromParent(); + MBB->addSuccessor(DummyExitBlk); + DEBUG( + dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() + << " successors\n"; + ); + } + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); +} + +void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { + while (MBB->succ_size()) + MBB->removeSuccessor(*MBB->succ_begin()); +} + +void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, + int SccNum) { + BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; + if (!srcBlkInfo) + srcBlkInfo = new BlockInformation(); + srcBlkInfo->SccNum = SccNum; +} + +void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { + DEBUG( + dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; + ); + + BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; + + if (!SrcBlkInfo) + SrcBlkInfo = new BlockInformation(); + + SrcBlkInfo->IsRetired = true; + assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 + && "can't retire block yet"); +} + +void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, + MachineBasicBlock *MBB) { + MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; + if (!MBB) { + MBB = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(MBB); //insert to function + SHOWNEWBLK(MBB, "DummyLandingBlock 
for loop without break: "); + } + TheEntry = MBB; + DEBUG( + dbgs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << MBB->getNumber() << "\n"; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2) { + + if (PDT->dominates(MBB1, MBB2)) + return MBB1; + if (PDT->dominates(MBB2, MBB1)) + return MBB2; + + MachineDomTreeNode *Node1 = PDT->getNode(MBB1); + MachineDomTreeNode *Node2 = PDT->getNode(MBB2); + + // Handle newly cloned node. + if (!Node1 && MBB1->succ_size() == 1) + return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); + if (!Node2 && MBB2->succ_size() == 1) + return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); + + if (!Node1 || !Node2) + return nullptr; + + Node1 = Node1->getIDom(); + while (Node1) { + if (PDT->dominates(Node1, Node2)) + return Node1->getBlock(); + Node1 = Node1->getIDom(); + } + + return nullptr; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom( + std::set<MachineBasicBlock *> &MBBs) { + MachineBasicBlock *CommonDom; + std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin(); + std::set<MachineBasicBlock *>::const_iterator E = MBBs.end(); + for (CommonDom = *It; It != E && CommonDom; ++It) { + MachineBasicBlock *MBB = *It; + if (MBB != CommonDom) + CommonDom = findNearestCommonPostDom(MBB, CommonDom); + } + + DEBUG( + dbgs() << "Common post dominator for exit blocks is "; + if (CommonDom) + dbgs() << "BB" << CommonDom->getNumber() << "\n"; + else + dbgs() << "NULL\n"; + ); + + return CommonDom; +} + +char AMDGPUCFGStructurizer::ID = 0; + +} // end anonymous namespace + + +INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) + +FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { + return new AMDGPUCFGStructurizer(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h new file mode 100644 index 0000000..eaffb85 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -0,0 +1,704 @@ +//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeT.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODET_H +#define AMDKERNELCODET_H + +#include <cstddef> +#include <cstdint> + +//---------------------------------------------------------------------------// +// AMD Kernel Code, and its dependencies // +//---------------------------------------------------------------------------// + +typedef uint8_t hsa_powertwo8_t; +typedef uint32_t hsa_ext_code_kind_t; +typedef uint8_t hsa_ext_brig_profile8_t; +typedef uint8_t hsa_ext_brig_machine_model8_t; +typedef uint64_t hsa_ext_control_directive_present64_t; +typedef uint16_t hsa_ext_exception_kind16_t; +typedef uint32_t hsa_ext_code_kind32_t; + +typedef struct hsa_dim3_s { + uint32_t x; + uint32_t y; + uint32_t z; +} hsa_dim3_t; + +/// The version of the amd_*_code_t struct. Minor versions must be +/// backward compatible. +typedef uint32_t amd_code_version32_t; +enum amd_code_version_t { + AMD_CODE_VERSION_MAJOR = 0, + AMD_CODE_VERSION_MINOR = 1 +}; + +/// The values used to define the number of bytes to use for the +/// swizzle element size. +enum amd_element_byte_size_t { + AMD_ELEMENT_2_BYTES = 0, + AMD_ELEMENT_4_BYTES = 1, + AMD_ELEMENT_8_BYTES = 2, + AMD_ELEMENT_16_BYTES = 3 +}; + +/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and +/// COMPUTE_PGM_RSRC2 registers. +typedef uint64_t amd_compute_pgm_resource_register64_t; + +/// Every amd_*_code_t has the following properties, which are composed of +/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), +/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount +/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. +/// +/// (Note that bit fields cannot be used as their layout is +/// implementation defined in the C standard and so cannot be used to +/// specify an ABI) +typedef uint32_t amd_code_property32_t; +enum amd_code_property_mask_t { + + /// Enable the setup of the SGPR user data registers + /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t + /// for initial register state. + /// + /// The total number of SGPRuser data registers requested must not + /// exceed 16. Any requests beyond 16 will be ignored. + /// + /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of + /// SGPR user data registers enabled up to 16). 
+ + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + + /// Control wave ID base counter for GDS ordered-append. Used to set + /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. 
(Not sure if + /// ORDERED_APPEND_MODE also needs to be settable) + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, + + /// The interleave (swizzle) element size in bytes required by the + /// code for private memory. This must be 2, 4, 8 or 16. This value + /// is provided to the finalizer when it is invoked and is recorded + /// here. The hardware will interleave the memory requests of each + /// lane of a wavefront by this element size to ensure each + /// work-item gets a distinct memory memory location. Therefore, the + /// finalizer ensures that all load and store operations done to + /// private memory do not exceed this size. For example, if the + /// element size is 4 (32-bits or dword) and a 64-bit value must be + /// loaded, the finalizer will generate two 32-bit loads. This + /// ensures that the interleaving will get the work-item + /// specific dword for both halves of the 64-bit value. If it just + /// did a 64-bit load then it would get one dword which belonged to + /// its own work-item, but the second dword would belong to the + /// adjacent lane work-item since the interleaving is in dwords. + /// + /// The value used must match the value that the runtime configures + /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This + /// is generally DWORD. + /// + /// Use values from the amd_element_byte_size_t enum. + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, + + /// Are global memory addresses 64 bits. Must match + /// amd_kernel_code_t.hsail_machine_model == + /// HSA_MACHINE_LARGE. Must also match + /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), + /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, + AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, + + /// Indicate if the generated ISA is using a dynamically sized call + /// stack. This can happen if calls are implemented using a call + /// stack and recursion, alloca or calls to indirect functions are + /// present. In these cases the Finalizer cannot compute the total + /// private segment size at compile time. In this case the + /// workitem_private_segment_byte_size only specifies the statically + /// know private segment size, and additional space must be added + /// for the call stack. + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, + + /// Indicate if code generated has support for debugging. + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT +}; + +/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// control directives. These control how the finalizer generates code. 
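Stepping back to amd_code_property_mask_t above, here is a small sketch of how the *_SHIFT / *_WIDTH / mask triples are meant to be used to pack and unpack a field of amd_code_property32_t; the constants are copied from the enum, while the helper functions are illustrative only and not part of the header:

#include <cassert>
#include <cstdint>

typedef uint32_t amd_code_property32_t;

enum : uint32_t {
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE =
      ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1)
      << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
};

// Insert a field value, clearing any bits previously set in that field.
static amd_code_property32_t setPrivateElementSize(amd_code_property32_t P,
                                                   uint32_t V) {
  P &= ~AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE;
  P |= (V << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT) &
       AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE;
  return P;
}

static uint32_t getPrivateElementSize(amd_code_property32_t P) {
  return (P & AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) >>
         AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT;
}

int main() {
  amd_code_property32_t Props = 0;
  Props = setPrivateElementSize(Props, 1 /* AMD_ELEMENT_4_BYTES */);
  assert(getPrivateElementSize(Props) == 1);
}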
This +/// struct is used both as an argument to hsaFinalizeKernel to specify values for +/// the control directives, and is used in HsaKernelCode to record the values of +/// the control directives that the finalize used when generating the code which +/// either came from the finalizer argument or explicit HSAIL control +/// directives. See the definition of the control directives in HSA Programmer's +/// Reference Manual which also defines how the values specified as finalizer +/// arguments have to agree with the control directives in the HSAIL code. +typedef struct hsa_ext_control_directives_s { + /// This is a bit set indicating which control directives have been + /// specified. If the value is 0 then there are no control directives specified + /// and the rest of the fields can be ignored. The bits are accessed using the + /// hsa_ext_control_directives_present_mask_t. Any control directive that is not + /// enabled in this bit set must have the value of all 0s. + hsa_ext_control_directive_present64_t enabled_control_directives; + + /// If enableBreakExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. If the kernel being finalized + /// has any enablebreakexceptions control directives, then the values specified + /// by this argument are unioned with the values in these control + /// directives. If any of the functions the kernel calls have an + /// enablebreakexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_break_exceptions; + + /// If enableDetectExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. However, an implementation + /// should endeavour to make the performance impact small. If the kernel being + /// finalized has any enabledetectexceptions control directives, then the + /// values specified by this argument are unioned with the values in these + /// control directives. If any of the functions the kernel calls have an + /// enabledetectexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_detect_exceptions; + + /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of + /// dynamic group segment can be allocated for a dispatch, otherwise the value + /// specifies the maximum number of bytes of dynamic group segment that can be + /// allocated for a dispatch. If the kernel being finalized has any + /// maxdynamicsize control directives, then the values must be the same, and + /// must be the same as this argument if it is enabled. This value can be used + /// by the finalizer to determine the maximum number of bytes of group memory + /// used by each work-group by adding this value to the group memory required + /// for all group segment variables used by the kernel and all functions it + /// calls, and group memory used to implement other HSAIL features such as + /// fbarriers and the detect exception operations. 
This can allow the finalizer + /// to determine the expected number of work-groups that can be executed by a + /// compute unit and allow more resources to be allocated to the work-items if + /// it is known that fewer work-groups can be executed due to group memory + /// limitations. + uint32_t max_dynamic_group_size; + + /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater + /// than 0. See HSA Programmer's Reference Manual description of + /// maxflatgridsize control directive. + uint32_t max_flat_grid_size; + + /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be + /// greater than 0. See HSA Programmer's Reference Manual description of + /// maxflatworkgroupsize control directive. + uint32_t max_flat_workgroup_size; + + /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the + /// finalizer is free to generate ISA that may result in any number of + /// work-groups executing on a single compute unit. Otherwise, the finalizer + /// should attempt to generate ISA that will allow the specified number of + /// work-groups to execute on a single compute unit. This is only a hint and + /// can be ignored by the finalizer. If the kernel being finalized, or any of + /// the functions it calls, has a requested control directive, then the values + /// must be the same. This can be used to determine the number of resources + /// that should be allocated to a single work-group and work-item. For example, + /// a low value may allow more resources to be allocated, resulting in higher + /// per work-item performance, as it is known there will never be more than the + /// specified number of work-groups actually executing on the compute + /// unit. Conversely, a high value may allocate fewer resources, resulting in + /// lower per work-item performance, which is offset by the fact it allows more + /// work-groups to actually execute on the compute unit. + uint32_t requested_workgroups_per_cu; + + /// If not enabled then all elements for Dim3 must be 0, otherwise every + /// element must be greater than 0. See HSA Programmer's Reference Manual + /// description of requiredgridsize control directive. + hsa_dim3_t required_grid_size; + + /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be + /// 0, and the produced code can be dispatched with any legal work-group range + /// consistent with the dispatch dimensions. Otherwise, the code produced must + /// always be dispatched with the specified work-group range. No element of the + /// specified range must be 0. It must be consistent with required_dimensions + /// and max_flat_workgroup_size. If the kernel being finalized, or any of the + /// functions it calls, has a requiredworkgroupsize control directive, then the + /// values must be the same. Specifying a value can allow the finalizer to + /// optimize work-group id operations, and if the number of work-items in the + /// work-group is less than the WAVESIZE then barrier operations can be + /// optimized to just a memory fence. + hsa_dim3_t required_workgroup_size; + + /// If requiredDim is not enabled then must be 0 and the produced kernel code + /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is + /// 1..3 and the code produced must only be dispatched with a dimension that + /// matches. Other values are illegal. If the kernel being finalized, or any of + /// the functions it calls, has a requireddimsize control directive, then the + /// values must be the same. 
This can be used to optimize the code generated to + /// compute the absolute and flat work-group and work-item id, and the dim + /// HSAIL operations. + uint8_t required_dim; + + /// Reserved. Must be 0. + uint8_t reserved[75]; +} hsa_ext_control_directives_t; + +/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel +/// Code Object to set up the hardware to execute the kernel dispatch. +/// +/// Initial Kernel Register State. +/// +/// Initial kernel register state will be set up by CP/SPI prior to the start +/// of execution of every wavefront. This is limited by the constraints of the +/// current hardware. +/// +/// The order of the SGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enable_sgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at SGPR0: the first enabled register is SGPR0, the next +/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR +/// number. +/// +/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and +/// apply to all waves of the grid. It is possible to specify more than 16 User +/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 +/// are actually initialized. These are then immediately followed by the System +/// SGPRs that are set up by ADC/SPI and can have different values for each wave +/// of the grid dispatch. +/// +/// SGPR register initial state is defined as follows: +/// +/// Private Segment Buffer (enable_sgpr_private_segment_buffer): +/// Number of User SGPR registers: 4. V# that can be used, together with +/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg +/// segments using a segment address. It must be set as follows: +/// - Base address: of the scratch memory area used by the dispatch. It +/// does not include the scratch wave offset. It will be the per process +/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for +/// example there may be a per pipe offset, or per AQL Queue offset). +/// - Stride + data_format: Element Size * Index Stride (???) +/// - Cache swizzle: ??? +/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for +/// scratch) +/// - Num records: Flat Scratch Work Item Size / Element Size (???) +/// - Dst_sel_*: ??? +/// - Num_format: ??? +/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must +/// agree with amd_kernel_code_t.privateElementSize) +/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must +/// be number of wavefront lanes for scratch, must agree with +/// amd_kernel_code_t.wavefrontSize) +/// - Add tid enable: 1 +/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, +/// - Hash_enable: ??? +/// - Heap: ??? +/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE +/// - Type: 0 (a buffer) (???) +/// +/// Dispatch Ptr (enable_sgpr_dispatch_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet +/// for kernel actually executing. +/// +/// Queue Ptr (enable_sgpr_queue_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for +/// AQL queue on which the dispatch packet was queued. +/// +/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): +/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This +/// is directly copied from the kernargPtr in the dispatch packet. Having CP +/// load it once avoids loading it at the beginning of every wavefront. 
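A hedged illustration of the dense user-SGPR numbering described above: each enabled item takes the next free SGPR numbers in the documented order, and anything past the 16 user-SGPR budget is dropped. The widths are read from the surrounding comments; the code itself is a sketch, not part of the header:

#include <cstdio>
#include <vector>

struct UserSgprItem { const char *Name; unsigned NumRegs; bool Enabled; };

int main() {
  // Order and register counts follow the comment above; the Enabled flags
  // are an arbitrary example configuration.
  std::vector<UserSgprItem> Items = {
      {"private_segment_buffer", 4, true},
      {"dispatch_ptr",           2, true},
      {"queue_ptr",              2, false},
      {"kernarg_segment_ptr",    2, true},
      {"dispatch_id",            2, false},
      {"flat_scratch_init",      2, true},
      {"private_segment_size",   1, false},
      {"grid_workgroup_count_x", 1, true},
      {"grid_workgroup_count_y", 1, true},
      {"grid_workgroup_count_z", 1, true},
  };

  unsigned NextSgpr = 0;               // dense numbering starts at SGPR0
  for (const UserSgprItem &I : Items) {
    if (!I.Enabled)
      continue;                        // disabled items get no SGPR number
    if (NextSgpr + I.NumRegs > 16) {   // requests beyond 16 are ignored
      std::printf("%s: dropped (over the 16 user-SGPR budget)\n", I.Name);
      continue;
    }
    std::printf("%s: SGPR%u..SGPR%u\n", I.Name, NextSgpr,
                NextSgpr + I.NumRegs - 1);
    NextSgpr += I.NumRegs;
  }
  std::printf("COMPUTE_PGM_RSRC2.USER_SGPR = %u\n", NextSgpr);
}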
+/// +/// Dispatch Id (enable_sgpr_dispatch_id): +/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch +/// packet being executed. +/// +/// Flat Scratch Init (enable_sgpr_flat_scratch_init): +/// Number of User SGPR registers: 2. This is 2 SGPRs. +/// +/// For CI/VI: +/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE +/// to base of memory for scratch for this dispatch. This is the same offset +/// used in computing the Scratch Segment Buffer base address. The value of +/// Scratch Wave Offset must be added by the kernel code and moved to +/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. +/// +/// The second SGPR is 32 bit byte size of a single work-item’s scratch +/// memory usage. This is directly loaded from the dispatch packet Private +/// Segment Byte Size and rounded up to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in +/// flat memory instructions. Having CP load it once avoids loading it at +/// the beginning of every wavefront. +/// +/// For PI: +/// This is the 64 bit base address of the scratch backing memory for +/// allocated by CP for this dispatch. +/// +/// Private Segment Size (enable_sgpr_private_segment_size): +/// Number of User SGPR registers: 1. The 32 bit byte size of a single +/// work-item’s scratch memory allocation. This is the value from the dispatch +/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// Having CP load it once avoids loading it at the beginning of every +/// wavefront. +/// +/// \todo [This will not be used for CI/VI since it is the same value as +/// the second SGPR of Flat Scratch Init. However, it is need for PI which +/// changes meaning of Flat Scratchg Init..] +/// +/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the X dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). +/// +/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Y dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Z dimension for the grid being executed. Computed +/// from the fields in the HsaDispatchPacket as +/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Work-Group Id X (enable_sgpr_workgroup_id_x): +/// Number of System SGPR registers: 1. 32 bit work group id in X dimension +/// of grid for wavefront. Always present. +/// +/// Work-Group Id Y (enable_sgpr_workgroup_id_y): +/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension +/// of grid for wavefront. +/// +/// Work-Group Id Z (enable_sgpr_workgroup_id_z): +/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension +/// of grid for wavefront. 
If present then Work-group Id Y will also be +/// present +/// +/// Work-Group Info (enable_sgpr_workgroup_info): +/// Number of System SGPR registers: 1. {first_wave, 14’b0000, +/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} +/// +/// Private Segment Wave Byte Offset +/// (enable_sgpr_private_segment_wave_byte_offset): +/// Number of System SGPR registers: 1. 32 bit byte offset from base of +/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg +/// segment address when using Scratch Segment Buffer. It must be added to +/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. +/// +/// +/// The order of the VGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enableVgpr* bit fields. The register numbers used for enabled registers +/// are dense starting at VGPR0: the first enabled register is VGPR0, the next +/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR +/// number. +/// +/// VGPR register initial state is defined as follows: +/// +/// Work-Item Id X (always initialized): +/// Number of registers: 1. 32 bit work item id in X dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id X (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Y dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id X (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Z dimension of work-group +/// for wavefront lane. +/// +/// +/// The setting of registers is being done by existing GPU hardware as follows: +/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data +/// registers. +/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any +/// combination including none. +/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot +/// be added into the value Flat Scratch Offset which would avoid the +/// Finalizer generated prolog having to do the add. +/// 4) The VGPRs are set by SPI which only supports specifying either (X), +/// (X, Y) or (X, Y, Z). +/// +/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so +/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and +/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register. +/// +/// The global segment can be accessed either using flat operations or buffer +/// operations. If buffer operations are used then the Global Buffer used to +/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a +/// segment address is not passed into the kernel code by CP since its base +/// address is always 0. Instead the Finalizer generates prolog code to +/// initialize 4 SGPRs with a V# that has the following properties, and then +/// uses that in the buffer instructions: +/// - base address of 0 +/// - no swizzle +/// - ATC=1 +/// - MTYPE set to support memory coherence specified in +/// amd_kernel_code_t.globalMemoryCoherence +/// +/// When the Global Buffer is used to access the Kernarg segment, must add the +/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. +/// Alternatively scalar loads can be used if the kernarg offset is uniform, as +/// the kernarg segment is constant for the duration of the kernel execution. +/// +typedef struct amd_kernel_code_s { + /// The AMD major version of the Code Object. Must be the value + /// AMD_CODE_VERSION_MAJOR. 
+typedef struct amd_kernel_code_s {
+  /// The AMD major version of the Code Object. Must be the value
+  /// AMD_CODE_VERSION_MAJOR.
+  amd_code_version32_t amd_code_version_major;
+
+  /// The AMD minor version of the Code Object. Minor versions must be
+  /// backward compatible. Must be the value
+  /// AMD_CODE_VERSION_MINOR.
+  amd_code_version32_t amd_code_version_minor;
+
+  /// The byte size of this struct. Must be set to
+  /// sizeof(amd_kernel_code_t). Used for backward
+  /// compatibility.
+  uint32_t struct_byte_size;
+
+  /// The target chip instruction set for which code has been
+  /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration
+  /// in sc/Interface/SCCommon.h.
+  uint32_t target_chip;
+
+  /// Byte offset (possibly negative) from start of amd_kernel_code_t
+  /// object to kernel's entry point instruction. The actual code for
+  /// the kernel is required to be 256 byte aligned to match hardware
+  /// requirements (SQ cache line is 16). The code must be position
+  /// independent code (PIC) for AMD devices to give runtime the
+  /// option of copying code to discrete GPU memory or APU L2
+  /// cache. The Finalizer should endeavour to allocate all kernel
+  /// machine code in contiguous memory pages so that a device
+  /// pre-fetcher will tend to only pre-fetch Kernel Code objects,
+  /// improving cache performance.
+  int64_t kernel_code_entry_byte_offset;
+
+  /// Range of bytes to consider prefetching expressed as an offset
+  /// and size. The offset is from the start (possibly negative) of
+  /// amd_kernel_code_t object. Set both to 0 if no prefetch
+  /// information is available.
+  ///
+  /// \todo ttye 11/15/2013 Is this the prefetch definition we want? Did
+  /// not make the size a uint64_t as prefetching more than 4GiB seems
+  /// excessive.
+  int64_t kernel_code_prefetch_byte_offset;
+  uint64_t kernel_code_prefetch_byte_size;
+
+  /// Number of bytes of scratch backing memory required for full
+  /// occupancy of target chip. This takes into account the number of
+  /// bytes of scratch per work-item, the wavefront size, the maximum
+  /// number of wavefronts per CU, and the number of CUs. This is an
+  /// upper limit on scratch. If the grid being dispatched is small it
+  /// may need less than this. If the kernel uses no scratch, or
+  /// the Finalizer has not computed this value, it must be 0.
+  uint64_t max_scratch_backing_memory_byte_size;
+
+  /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+  /// COMPUTE_PGM_RSRC2 registers.
+  amd_compute_pgm_resource_register64_t compute_pgm_resource_registers;
+
+  /// Code properties. See amd_code_property_mask_t for a full list of
+  /// properties.
+  amd_code_property32_t code_properties;
+
+  /// The amount of memory required for the combined private, spill
+  /// and arg segments for a work-item in bytes. If
+  /// is_dynamic_callstack is 1 then additional space must be added to
+  /// this value for the call stack.
+  uint32_t workitem_private_segment_byte_size;
+
+  /// The amount of group segment memory required by a work-group in
+  /// bytes. This does not include any dynamically allocated group
+  /// segment memory that may be added when the kernel is
+  /// dispatched.
+  uint32_t workgroup_group_segment_byte_size;
+
+  /// Number of bytes of GDS required by the kernel dispatch. Must be 0 if
+  /// not using GDS.
+  uint32_t gds_segment_byte_size;
+
+  /// The size in bytes of the kernarg segment that holds the values
+  /// of the arguments to the kernel. This could be used by CP to
+  /// prefetch the kernarg segment pointed to by the dispatch packet.
+  uint64_t kernarg_segment_byte_size;
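+
+  // As an illustrative recap (a sketch, not a formula mandated by this
+  // header) of the computation described for
+  // max_scratch_backing_memory_byte_size above:
+  //
+  //   max_scratch_backing_memory_byte_size =
+  //       bytes of scratch per work-item * work-items per wavefront
+  //       * maximum wavefronts per CU * number of CUs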
+
+  /// Number of fbarriers used in the kernel and all functions it
+  /// calls. If the implementation uses group memory to allocate the
+  /// fbarriers then that amount must already be included in the
+  /// workgroup_group_segment_byte_size total.
+  uint32_t workgroup_fbarrier_count;
+
+  /// Number of scalar registers used by a wavefront. This includes
+  /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+  /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
+  /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+  uint16_t wavefront_sgpr_count;
+
+  /// Number of vector registers used by each work-item. Used to set
+  /// COMPUTE_PGM_RSRC1.VGPRS.
+  uint16_t workitem_vgpr_count;
+
+  /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
+  /// first fixed VGPR number reserved.
+  uint16_t reserved_vgpr_first;
+
+  /// The number of consecutive VGPRs reserved by the client. If
+  /// is_debug_supported then this count includes VGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_vgpr_count;
+
+  /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
+  /// first fixed SGPR number reserved.
+  uint16_t reserved_sgpr_first;
+
+  /// The number of consecutive SGPRs reserved by the client. If
+  /// is_debug_supported then this count includes SGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_sgpr_count;
+
+  /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+  /// fixed SGPR number used to hold the wave scratch offset for the
+  /// entire kernel execution, or uint16_t(-1) if the register is not
+  /// used or not known.
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+  /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+  /// fixed SGPR number of the first of 4 SGPRs used to hold the
+  /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+  /// if the registers are not used or not known.
+  uint16_t debug_private_segment_buffer_sgpr;
+
+  /// The maximum byte alignment of variables used by the kernel in
+  /// the specified memory segment. Expressed as a power of two. Must
+  /// be at least HSA_POWERTWO_16.
+  hsa_powertwo8_t kernarg_segment_alignment;
+  hsa_powertwo8_t group_segment_alignment;
+  hsa_powertwo8_t private_segment_alignment;
+
+  uint8_t reserved3;
+
+  /// Type of code object.
+  hsa_ext_code_kind32_t code_type;
+
+  /// Reserved for code properties if any are defined in the future.
+  /// There are currently no code properties so this field must be 0.
+  uint32_t reserved4;
+
+  /// Wavefront size expressed as a power of two. Must be a power of 2
+  /// in range 1..64 inclusive. Used to support runtime query that
+  /// obtains wavefront size, which may be used by application to
+  /// allocate dynamic group memory and set the dispatch work-group
+  /// size.
+  hsa_powertwo8_t wavefront_size;
+
+  /// The optimization level specified when the kernel was
+  /// finalized.
+  uint8_t optimization_level;
+
+  /// The HSAIL profile defines which features are used. This
+  /// information is from the HSAIL version directive. If this
+  /// amd_kernel_code_t is not generated from an HSAIL compilation
+  /// unit then must be 0.
+  hsa_ext_brig_profile8_t hsail_profile;
+
+  /// The HSAIL machine model gives the address sizes used by the
+  /// code. This information is from the HSAIL version directive. If
+  /// not generated from an HSAIL compilation unit then it must still
+  /// indicate for what machine model the code is generated.
+  hsa_ext_brig_machine_model8_t hsail_machine_model;
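+
+  // A brief illustrative sketch: hsa_powertwo8_t fields such as
+  // kernarg_segment_alignment and wavefront_size above hold the exponent
+  // of a power of two, so the decoded values are
+  //
+  //   kernarg_alignment_in_bytes = 1 << kernarg_segment_alignment;
+  //   lanes_per_wavefront        = 1 << wavefront_size;  // e.g. 6 -> 64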
+
+  /// The HSAIL major version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then must be 0.
+  uint32_t hsail_version_major;
+
+  /// The HSAIL minor version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then must be 0.
+  uint32_t hsail_version_minor;
+
+  /// Reserved for HSAIL target options if any are defined in the
+  /// future. There are currently no target options so this field
+  /// must be 0.
+  uint16_t reserved5;
+
+  /// Reserved. Must be 0.
+  uint16_t reserved6;
+
+  /// The values should be the actual values used by the finalizer
+  /// in generating the code. This may be the union of values
+  /// specified as finalizer arguments and explicit HSAIL control
+  /// directives. If the finalizer chooses to ignore a control
+  /// directive, and not generate constrained code, then the control
+  /// directive should not be marked as enabled even though it was
+  /// present in the HSAIL or finalizer argument. The values are
+  /// intended to reflect the constraints that the code actually
+  /// requires to correctly execute, not the values that were
+  /// actually specified at finalize time.
+  hsa_ext_control_directives_t control_directive;
+
+  /// The code can immediately follow the amd_kernel_code_t, or can
+  /// come after subsequent amd_kernel_code_t structs when there are
+  /// multiple kernels in the compilation unit.
+
+} amd_kernel_code_t;
+
+#endif // AMDKERNELCODET_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
new file mode 100644
index 0000000..80081d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -0,0 +1,1380 @@
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { + +struct OptionalOperand; + +class AMDGPUOperand : public MCParsedAsmOperand { + enum KindTy { + Token, + Immediate, + Register, + Expression + } Kind; + + SMLoc StartLoc, EndLoc; + +public: + AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + MCContext *Ctx; + + enum ImmTy { + ImmTyNone, + ImmTyDSOffset0, + ImmTyDSOffset1, + ImmTyGDS, + ImmTyOffset, + ImmTyGLC, + ImmTySLC, + ImmTyTFE, + ImmTyClamp, + ImmTyOMod + }; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct ImmOp { + bool IsFPImm; + ImmTy Type; + int64_t Val; + }; + + struct RegOp { + unsigned RegNo; + int Modifiers; + const MCRegisterInfo *TRI; + bool IsForcedVOP3; + }; + + union { + TokOp Tok; + ImmOp Imm; + RegOp Reg; + const MCExpr *Expr; + }; + + void addImmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm(getImm())); + } + + StringRef getToken() const { + return StringRef(Tok.Data, Tok.Length); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { + if (isReg()) + addRegOperands(Inst, N); + else + addImmOperands(Inst, N); + } + + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm( + Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); + addRegOperands(Inst, N); + } + + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { + if (isImm()) + addImmOperands(Inst, N); + else { + assert(isExpr()); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + } + + bool defaultTokenHasSuffix() const { + StringRef Token(Tok.Data, Tok.Length); + + return Token.endswith("_e32") || Token.endswith("_e64"); + } + + bool isToken() const override { + return Kind == Token; + } + + bool isImm() const override { + return Kind == Immediate; + } + + bool isInlineImm() const { + float F = BitsToFloat(Imm.Val); + // TODO: Add 0.5pi for VI + return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || + (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || + F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); + } + + bool isDSOffset0() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset0; + } + + bool isDSOffset1() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset1; + } + + int64_t getImm() const { + return Imm.Val; + } + + enum ImmTy getImmTy() const { + assert(isImm()); + return Imm.Type; + } + + bool isRegKind() const { + return Kind == Register; + } + + bool isReg() const override { + return Kind == Register && Reg.Modifiers == -1; + } + + bool isRegWithInputMods() const { + return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); + } + + void setModifiers(unsigned Mods) { + assert(isReg()); + Reg.Modifiers = Mods; + } + + bool hasModifiers() const { + assert(isRegKind()); + return Reg.Modifiers != -1; + } + + unsigned getReg() const override { + return Reg.RegNo; + } + + bool isRegOrImm() const { + return isReg() || isImm(); + } + + bool isRegClass(unsigned RCID) const { + return Reg.TRI->getRegClass(RCID).contains(getReg()); + } + + bool isSCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc64() const { + return isImm() || isInlineImm() || + (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); + } + + bool isVCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVCSrc64() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isVSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVSrc64() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isMem() const override { + return false; + } + + bool isExpr() const { + return Kind == Expression; + } + + bool isSoppBrTarget() const { + return isExpr() || isImm(); + } + + SMLoc getStartLoc() const override { + return StartLoc; + } + + SMLoc getEndLoc() const override { + return EndLoc; + } + + void print(raw_ostream &OS) const override { } + + static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc, + enum ImmTy Type = ImmTyNone, + bool IsFPImm = false) { + auto Op = llvm::make_unique<AMDGPUOperand>(Immediate); + Op->Imm.Val = Val; + Op->Imm.IsFPImm = IsFPImm; + Op->Imm.Type = Type; + Op->StartLoc = Loc; + Op->EndLoc = Loc; + return Op; + } + + static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc, + bool HasExplicitEncodingSize = true) { + auto Res = llvm::make_unique<AMDGPUOperand>(Token); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + Res->StartLoc = Loc; + Res->EndLoc = Loc; + return Res; + } + + static std::unique_ptr<AMDGPUOperand> 
CreateReg(unsigned RegNo, SMLoc S, + SMLoc E, + const MCRegisterInfo *TRI, + bool ForceVOP3) { + auto Op = llvm::make_unique<AMDGPUOperand>(Register); + Op->Reg.RegNo = RegNo; + Op->Reg.TRI = TRI; + Op->Reg.Modifiers = -1; + Op->Reg.IsForcedVOP3 = ForceVOP3; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AMDGPUOperand> CreateExpr(const class MCExpr *Expr, SMLoc S) { + auto Op = llvm::make_unique<AMDGPUOperand>(Expression); + Op->Expr = Expr; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + bool isDSOffset() const; + bool isDSOffset01() const; + bool isSWaitCnt() const; + bool isMubufOffset() const; +}; + +class AMDGPUAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + const MCInstrInfo &MII; + MCAsmParser &Parser; + + unsigned ForcedEncodingSize; + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "AMDGPUGenAsmMatcher.inc" + + /// } + +public: + AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0){ + + if (STI.getFeatureBits().none()) { + // Set default features. + STI.ToggleFeature("SOUTHERN_ISLANDS"); + } + + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + unsigned getForcedEncodingSize() const { + return ForcedEncodingSize; + } + + void setForcedEncodingSize(unsigned Size) { + ForcedEncodingSize = Size; + } + + bool isForcedVOP3() const { + return ForcedEncodingSize == 64; + } + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + bool ParseDirective(AsmToken DirectiveID) override; + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default = 0); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, + OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseOptionalOps( + const ArrayRef<OptionalOperand> &OptionalOps, + OperandVector &Operands); + + + void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); + void cvtDS(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); + + bool parseCnt(int64_t &IntVal); + OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + + OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); + void cvtFlat(MCInst &Inst, const OperandVector &Operands); + + void cvtMubuf(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseOffset(OperandVector &Operands); + OperandMatchResultTy 
parseMubufOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseGLC(OperandVector &Operands); + OperandMatchResultTy parseSLC(OperandVector &Operands); + OperandMatchResultTy parseTFE(OperandVector &Operands); + + OperandMatchResultTy parseDMask(OperandVector &Operands); + OperandMatchResultTy parseUNorm(OperandVector &Operands); + OperandMatchResultTy parseR128(OperandVector &Operands); + + void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); +}; + +struct OptionalOperand { + const char *Name; + AMDGPUOperand::ImmTy Type; + bool IsBit; + int64_t Default; + bool (*ConvertResult)(int64_t&); +}; + +} // namespace + +static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { + if (IsVgpr) { + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::VGPR_32RegClassID; + case 2: return AMDGPU::VReg_64RegClassID; + case 3: return AMDGPU::VReg_96RegClassID; + case 4: return AMDGPU::VReg_128RegClassID; + case 8: return AMDGPU::VReg_256RegClassID; + case 16: return AMDGPU::VReg_512RegClassID; + } + } + + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SReg_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } +} + +static unsigned getRegForName(const StringRef &RegName) { + + return StringSwitch<unsigned>(RegName) + .Case("exec", AMDGPU::EXEC) + .Case("vcc", AMDGPU::VCC) + .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("m0", AMDGPU::M0) + .Case("scc", AMDGPU::SCC) + .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("vcc_lo", AMDGPU::VCC_LO) + .Case("vcc_hi", AMDGPU::VCC_HI) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Default(0); +} + +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + const AsmToken Tok = Parser.getTok(); + StartLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); + const StringRef &RegName = Tok.getString(); + RegNo = getRegForName(RegName); + + if (RegNo) { + Parser.Lex(); + return false; + } + + // Match vgprs and sgprs + if (RegName[0] != 's' && RegName[0] != 'v') + return true; + + bool IsVgpr = RegName[0] == 'v'; + unsigned RegWidth; + unsigned RegIndexInClass; + if (RegName.size() > 1) { + // We have a 32-bit register + RegWidth = 1; + if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) + return true; + Parser.Lex(); + } else { + // We have a register greater than 32-bits. + + int64_t RegLo, RegHi; + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegLo)) + return true; + + if (getLexer().isNot(AsmToken::Colon)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegHi)) + return true; + + if (getLexer().isNot(AsmToken::RBrac)) + return true; + + Parser.Lex(); + RegWidth = (RegHi - RegLo) + 1; + if (IsVgpr) { + // VGPR registers aren't aligned. + RegIndexInClass = RegLo; + } else { + // SGPR registers are aligned. Max alignment is 4 dwords. 
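+      // For example (illustrative): "s[4:7]" parses as RegLo = 4, RegHi = 7,
+      // so RegWidth = 4 and RegIndexInClass = 4 / min(4, 4) = 1, the second
+      // register in the 128-bit SGPR class.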
+      RegIndexInClass = RegLo / std::min(RegWidth, 4u);
+    }
+  }
+
+  const MCRegisterInfo *TRC = getContext().getRegisterInfo();
+  unsigned RC = getRegClass(IsVgpr, RegWidth);
+  if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs())
+    return true;
+  RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass);
+  return false;
+}
+
+unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+
+  uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+
+  if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) ||
+      (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)))
+    return Match_InvalidOperand;
+
+  return Match_Success;
+}
+
+bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
+  MCInst Inst;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+  default: break;
+  case Match_Success:
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  case Match_MissingFeature:
+    return Error(IDLoc, "instruction not supported on this GPU");
+
+  case Match_MnemonicFail:
+    return Error(IDLoc, "unrecognized instruction mnemonic");
+
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size()) {
+        if (isForcedVOP3()) {
+          // If 64-bit encoding has been forced we can end up with no
+          // clamp or omod operands if none of the registers have modifiers,
+          // so we need to add these to the operand list.
+          AMDGPUOperand &LastOp =
+              ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
+          if (LastOp.isRegKind() ||
+              (LastOp.isImm() &&
+               LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) {
+            SMLoc S = Parser.getTok().getLoc();
+            Operands.push_back(AMDGPUOperand::CreateImm(0, S,
+                               AMDGPUOperand::ImmTyClamp));
+            Operands.push_back(AMDGPUOperand::CreateImm(0, S,
+                               AMDGPUOperand::ImmTyOMod));
+            bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands,
+                                               Out, ErrorInfo,
+                                               MatchingInlineAsm);
+            if (!Res)
+              return Res;
+          }
+        }
+        return Error(IDLoc, "too few operands for instruction");
+      }
+
+      ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  }
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
+  return true;
+}
+
+static bool operandsHaveModifiers(const OperandVector &Operands) {
+
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
+    if (Op.isRegKind() && Op.hasModifiers())
+      return true;
+    if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod ||
+                       Op.getImmTy() == AMDGPUOperand::ImmTyClamp))
+      return true;
+  }
+  return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+  // Try to parse with a custom parser
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If we successfully parsed the operand or if there was an error parsing,
+  // we are done.
+  //
+  // If we are parsing after we reach EndOfStatement then this means we
+  // are appending default values to the Operands list. This is only done
+  // by custom parser, so we shouldn't continue on to the generic parsing.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
+      getLexer().is(AsmToken::EndOfStatement))
+    return ResTy;
+
+  bool Negate = false, Abs = false;
+  if (getLexer().getKind() == AsmToken::Minus) {
+    Parser.Lex();
+    Negate = true;
+  }
+
+  if (getLexer().getKind() == AsmToken::Pipe) {
+    Parser.Lex();
+    Abs = true;
+  }
+
+  switch (getLexer().getKind()) {
+  case AsmToken::Integer: {
+    SMLoc S = Parser.getTok().getLoc();
+    int64_t IntVal;
+    if (getParser().parseAbsoluteExpression(IntVal))
+      return MatchOperand_ParseFail;
+    APInt IntVal32(32, IntVal);
+    if (IntVal32.getSExtValue() != IntVal) {
+      Error(S, "invalid immediate: only 32-bit values are legal");
+      return MatchOperand_ParseFail;
+    }
+
+    IntVal = IntVal32.getSExtValue();
+    if (Negate)
+      IntVal *= -1;
+    Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
+    return MatchOperand_Success;
+  }
+  case AsmToken::Real: {
+    // FIXME: We should emit an error if a double-precision floating-point
+    // value is used. I'm not sure of the best way to detect this.
+    SMLoc S = Parser.getTok().getLoc();
+    int64_t IntVal;
+    if (getParser().parseAbsoluteExpression(IntVal))
+      return MatchOperand_ParseFail;
+
+    APFloat F((float)BitsToDouble(IntVal));
+    if (Negate)
+      F.changeSign();
+    Operands.push_back(
+        AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S));
+    return MatchOperand_Success;
+  }
+  case AsmToken::Identifier: {
+    SMLoc S, E;
+    unsigned RegNo;
+    if (!ParseRegister(RegNo, S, E)) {
+
+      bool HasModifiers = operandsHaveModifiers(Operands);
+      unsigned Modifiers = 0;
+
+      if (Negate)
+        Modifiers |= 0x1;
+
+      if (Abs) {
+        if (getLexer().getKind() != AsmToken::Pipe)
+          return MatchOperand_ParseFail;
+        Parser.Lex();
+        Modifiers |= 0x2;
+      }
+
+      if (Modifiers && !HasModifiers) {
+        // We are adding a modifier to src1 or src2 and previous sources
+        // don't have modifiers, so we need to go back and empty modifiers
+        // for each previous source.
+        for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1;
+             --PrevRegIdx) {
+          AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]);
+          RegOp.setModifiers(0);
+        }
+      }
+
+      Operands.push_back(AMDGPUOperand::CreateReg(
+          RegNo, S, E, getContext().getRegisterInfo(),
+          isForcedVOP3()));
+
+      if (HasModifiers || Modifiers) {
+        AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]);
+        RegOp.setModifiers(Modifiers);
+      }
+    } else {
+      Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(),
+                                                    S));
+      Parser.Lex();
+    }
+    return MatchOperand_Success;
+  }
+  default:
+    return MatchOperand_NoMatch;
+  }
+}
+
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                       StringRef Name,
+                                       SMLoc NameLoc, OperandVector &Operands) {
+
+  // Clear any forced encodings from the previous instruction.
+  setForcedEncodingSize(0);
+
+  if (Name.endswith("_e64"))
+    setForcedEncodingSize(64);
+  else if (Name.endswith("_e32"))
+    setForcedEncodingSize(32);
+
+  // Add the instruction mnemonic
+  Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
+
+  while (!getLexer().is(AsmToken::EndOfStatement)) {
+    AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name);
+
+    // Eat the comma or space if there is one.
+ if (getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + switch (Res) { + case MatchOperand_Success: break; + case MatchOperand_ParseFail: return Error(getLexer().getLoc(), + "failed parsing operand."); + case MatchOperand_NoMatch: return Error(getLexer().getLoc(), + "not a valid operand."); + } + } + + // Once we reach end of statement, continue parsing so we can add default + // values for optional arguments. + AMDGPUAsmParser::OperandMatchResultTy Res; + while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Error(getLexer().getLoc(), "failed parsing operand."); + } + return false; +} + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default) { + + // We are at the end of the statement, and this is a default argument, so + // use a default value. + if (getLexer().is(AsmToken::EndOfStatement)) { + Int = Default; + return MatchOperand_Success; + } + + switch(getLexer().getKind()) { + default: return MatchOperand_NoMatch; + case AsmToken::Identifier: { + StringRef OffsetName = Parser.getTok().getString(); + if (!OffsetName.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + if (getParser().parseAbsoluteExpression(Int)) + return MatchOperand_ParseFail; + break; + } + } + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + + SMLoc S = Parser.getTok().getLoc(); + int64_t Offset = 0; + + AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); + if (Res != MatchOperand_Success) + return Res; + + Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + int64_t Bit = 0; + SMLoc S = Parser.getTok().getLoc(); + + // We are at the end of the statement, and this is a default argument, so + // use a default value. 
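+  // For example (illustrative): with Name == "glc", the token "glc" sets
+  // Bit = 1, "noglc" sets Bit = 0, and an absent operand keeps the default
+  // Bit = 0 set above.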
+ if (getLexer().isNot(AsmToken::EndOfStatement)) { + switch(getLexer().getKind()) { + case AsmToken::Identifier: { + StringRef Tok = Parser.getTok().getString(); + if (Tok == Name) { + Bit = 1; + Parser.Lex(); + } else if (Tok.startswith("no") && Tok.endswith(Name)) { + Bit = 0; + Parser.Lex(); + } else { + return MatchOperand_NoMatch; + } + break; + } + default: + return MatchOperand_NoMatch; + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); + return MatchOperand_Success; +} + +static bool operandsHasOptionalOp(const OperandVector &Operands, + const OptionalOperand &OOp) { + for (unsigned i = 0; i < Operands.size(); i++) { + const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); + if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || + (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) + return true; + + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOptionalOps(const ArrayRef<OptionalOperand> &OptionalOps, + OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + for (const OptionalOperand &Op : OptionalOps) { + if (operandsHasOptionalOp(Operands, Op)) + continue; + AMDGPUAsmParser::OperandMatchResultTy Res; + int64_t Value; + if (Op.IsBit) { + Res = parseNamedBit(Op.Name, Operands, Op.Type); + if (Res == MatchOperand_NoMatch) + continue; + return Res; + } + + Res = parseIntWithPrefix(Op.Name, Value, Op.Default); + + if (Res == MatchOperand_NoMatch) + continue; + + if (Res != MatchOperand_Success) + return Res; + + if (Op.ConvertResult && !Op.ConvertResult(Value)) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); + return MatchOperand_Success; + } + return MatchOperand_NoMatch; +} + +//===----------------------------------------------------------------------===// +// ds +//===----------------------------------------------------------------------===// + +static const OptionalOperand DSOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +static const OptionalOperand DSOptionalOpsOff01 [] = { + {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, + {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOps, Operands); +} +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOpsOff01, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + AMDGPUAsmParser::OperandMatchResultTy Res = + parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); + if (Res == MatchOperand_NoMatch) { + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOffset)); + Res = MatchOperand_Success; + } + return Res; +} + +bool AMDGPUOperand::isDSOffset() const { + return isImm() && isUInt<16>(getImm()); +} + +bool AMDGPUOperand::isDSOffset01() const { + return isImm() && isUInt<8>(getImm()); +} + +void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, + const OperandVector &Operands) { + + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand 
&)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; + unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + + ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 + ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + +void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { + + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + bool GDSOnly = false; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + if (Op.isToken() && Op.getToken() == "gds") { + GDSOnly = true; + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset + + if (!GDSOnly) { + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + } + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + +bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { + StringRef CntName = Parser.getTok().getString(); + int64_t CntVal; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LParen)) + return true; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return true; + + if (getParser().parseAbsoluteExpression(CntVal)) + return true; + + if (getLexer().isNot(AsmToken::RParen)) + return true; + + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + int CntShift; + int CntMask; + + if (CntName == "vmcnt") { + CntMask = 0xf; + CntShift = 0; + } else if (CntName == "expcnt") { + CntMask = 0x7; + CntShift = 4; + } else if (CntName == "lgkmcnt") { + CntMask = 0x7; + CntShift = 8; + } else { + return true; + } + + IntVal &= ~(CntMask << CntShift); + IntVal |= (CntVal << CntShift); + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { + // Disable all counters by default. + // vmcnt [3:0] + // expcnt [6:4] + // lgkmcnt [10:8] + int64_t CntVal = 0x77f; + SMLoc S = Parser.getTok().getLoc(); + + switch(getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: + // The operand can be an integer value. 
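+      // (Illustrative: "s_waitcnt 0" waits on all counters. The named form,
+      // handled by the Identifier case below via parseCnt(), packs each
+      // count into place: starting from the all-disabled default 0x77f,
+      // "vmcnt(0) & lgkmcnt(0)" clears bits [3:0] and [10:8], giving 0x070.)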
+ if (getParser().parseAbsoluteExpression(CntVal)) + return MatchOperand_ParseFail; + break; + + case AsmToken::Identifier: + do { + if (parseCnt(CntVal)) + return MatchOperand_ParseFail; + } while(getLexer().isNot(AsmToken::EndOfStatement)); + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} + +//===----------------------------------------------------------------------===// +// sopp branch targets +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + switch (getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: { + int64_t Imm; + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); + return MatchOperand_Success; + } + + case AsmToken::Identifier: + Operands.push_back(AMDGPUOperand::CreateExpr( + MCSymbolRefExpr::create(getContext().getOrCreateSymbol( + Parser.getTok().getString()), getContext()), S)); + Parser.Lex(); + return MatchOperand_Success; + } +} + +//===----------------------------------------------------------------------===// +// flat +//===----------------------------------------------------------------------===// + +static const OptionalOperand FlatOptionalOps [] = { + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +static const OptionalOperand FlatAtomicOptionalOps [] = { + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatAtomicOptionalOps, Operands); +} + +void AMDGPUAsmParser::cvtFlat(MCInst &Inst, + const OperandVector &Operands) { + std::map<AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle 'glc' token which is sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) + continue; + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + + } + + // flat atomic instructions don't have a glc argument. 
+ if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + } + + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mubuf +//===----------------------------------------------------------------------===// + +static const OptionalOperand MubufOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { + return parseOptionalOps(MubufOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOffset(OperandVector &Operands) { + return parseIntWithPrefix("offset", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseGLC(OperandVector &Operands) { + return parseNamedBit("glc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSLC(OperandVector &Operands) { + return parseNamedBit("slc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseTFE(OperandVector &Operands) { + return parseNamedBit("tfe", Operands); +} + +bool AMDGPUOperand::isMubufOffset() const { + return isImm() && isUInt<12>(getImm()); +} + +void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, + const OperandVector &Operands) { + std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. 
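+    // (Illustrative: in "buffer_load_dword v1, v2, s[4:7], s1 offen", the
+    // trailing "offen" comes through as a plain token and is skipped here.)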
+ if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + assert(OptionalIdx.size() == 4); + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mimg +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDMask(OperandVector &Operands) { + return parseIntWithPrefix("dmask", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { + return parseNamedBit("unorm", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseR128(OperandVector &Operands) { + return parseNamedBit("r128", Operands); +} + +//===----------------------------------------------------------------------===// +// vop3 +//===----------------------------------------------------------------------===// + +static bool ConvertOmodMul(int64_t &Mul) { + if (Mul != 1 && Mul != 2 && Mul != 4) + return false; + + Mul >>= 1; + return true; +} + +static bool ConvertOmodDiv(int64_t &Div) { + if (Div == 1) { + Div = 0; + return true; + } + + if (Div == 2) { + Div = 3; + return true; + } + + return false; +} + +static const OptionalOperand VOP3OptionalOps [] = { + {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, + {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, + {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, +}; + +static bool isVOP3(OperandVector &Operands) { + if (operandsHaveModifiers(Operands)) + return true; + + AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); + + if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) + return true; + + if (Operands.size() >= 5) + return true; + + if (Operands.size() > 3) { + AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); + if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || + Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) + return true; + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { + + // The value returned by this function may change after parsing + // an operand so store the original value here. 
+  bool HasModifiers = operandsHaveModifiers(Operands);
+
+  bool IsVOP3 = isVOP3(Operands);
+  if (HasModifiers || IsVOP3 ||
+      getLexer().isNot(AsmToken::EndOfStatement) ||
+      getForcedEncodingSize() == 64) {
+
+    AMDGPUAsmParser::OperandMatchResultTy Res =
+        parseOptionalOps(VOP3OptionalOps, Operands);
+
+    if (!HasModifiers && Res == MatchOperand_Success) {
+      // We have added a modifier operation, so we need to make sure all
+      // previous register operands have modifiers.
+      for (unsigned i = 2, e = Operands.size(); i != e; ++i) {
+        AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
+        if (Op.isReg())
+          Op.setModifiers(0);
+      }
+    }
+    return Res;
+  }
+  return MatchOperand_NoMatch;
+}
+
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+  ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1);
+  unsigned i = 2;
+
+  std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+
+  if (operandsHaveModifiers(Operands)) {
+    for (unsigned e = Operands.size(); i != e; ++i) {
+      AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+      if (Op.isRegWithInputMods()) {
+        ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2);
+        continue;
+      }
+      OptionalIdx[Op.getImmTy()] = i;
+    }
+
+    unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp];
+    unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod];
+
+    ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1);
+    ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1);
+  } else {
+    for (unsigned e = Operands.size(); i != e; ++i)
+      ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1);
+  }
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAMDGPUAsmParser() {
+  RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
+  RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AMDGPUGenAsmMatcher.inc"
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
new file mode 100644
index 0000000..2f5fdbe
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -0,0 +1,149 @@
+//===-- CIInstructions.td - CI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===// + + +def isCIVI : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>, AssemblerPredicate<"FeatureCIInsts">; + +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isCIVI in { + +defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", + VOP_F64_F64, ftrunc +>; +defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64", + VOP_F64_F64, fceil +>; +defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", + VOP_F64_F64, ffloor +>; +defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", + VOP_F64_F64, frint +>; +defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", + VOP_F32_F32 +>; +defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", + VOP_F32_F32 +>; + +//===----------------------------------------------------------------------===// +// Flat Instructions +//===----------------------------------------------------------------------===// + +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; +def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; +def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; +def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; +def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + 0x1d, "flat_store_dwordx2", VReg_64 +>; +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + 0x1e, "flat_store_dwordx4", VReg_128 +>; +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + 0x1f, "flat_store_dwordx3", VReg_96 +>; +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; +defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < + 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; +defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; +defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm 
FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < + 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; +defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; + +} // End SubtargetPredicate = isCIVI + +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [HasFlatAddressSpace] in { + +class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt, + PatFrag flat_ld> : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr, 0, 0, 0) +>; + +def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>; + +class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr, 0, 0, 0) + >; + +def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>; +def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>; +def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; + +} // End HasFlatAddressSpace predicate + diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td new file mode 100644 index 0000000..ba4df82 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -0,0 +1,226 @@ +//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of 
Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available only on Cayman +// family GPUs. +// +//===----------------------------------------------------------------------===// + +def isCayman : Predicate<"Subtarget->hasCaymanISA()">; + +//===----------------------------------------------------------------------===// +// Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isCayman] in { + +def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU +>; +def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU +>; + +def : IMad24Pat<MULADD_INT24_cm>; + +let isVector = 1 in { + +def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; + +def MULLO_INT_cm : MULLO_INT_Common<0x8F>; +def MULHI_INT_cm : MULHI_INT_Common<0x90>; +def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; +def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; +def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; +def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; +def SIN_cm : SIN_Common<0x8D>; +def COS_cm : COS_Common<0x8E>; +} // End isVector = 1 + +def : RsqPat<RECIPSQRT_IEEE_cm, f32>; + +def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; + +defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; +defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>; + +// RECIP_UINT emulation for Cayman +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), + (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) +>; + + def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { + let ADDR = 0; + let POP_COUNT = 0; + let COUNT = 0; + } + + +def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; + +class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> : + CF_MEM_RAT_CACHELESS <0x14, 0, mask, + (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), + "STORE_DWORD $rw_gpr, $index_gpr", + [(global_store vt:$rw_gpr, i32:$index_gpr)]> { + let eop = 0; // This bit is not used on Cayman. +} + +def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; +def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>; +def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>; + +class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. 
+ let SRC_SEL_X = 0; + let SRC_SEL_Y = 0; + let STRUCTURED_READ = 0; + let LDS_REQ = 0; + let COALESCED_READ = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 + // %T2_X<def> = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// +def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End isCayman + diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td new file mode 100644 index 0000000..7adcd46 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -0,0 +1,670 @@ +//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to Evergreen and newer VLIW4/VLIW5 GPUs +// - Available only on Evergreen family GPUs. 
+// +//===----------------------------------------------------------------------===// + +def isEG : Predicate< + "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " + "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "!Subtarget->hasCaymanISA()" +>; + +def isEGorCayman : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" + "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" +>; + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman store instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins, + string name, list<dag> pattern> + : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, + "MEM_RAT_CACHELESS "#name, pattern>; + +class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, + list<dag> pattern> + : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, + "MEM_RAT "#name, pattern>; + +def RAT_MSKOR : CF_MEM_RAT <0x11, 0, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + "MSKOR $rw_gpr.XW, $index_gpr", + [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] +> { + let eop = 0; +} + +} // End let Predicates = [isEGorCayman] + +//===----------------------------------------------------------------------===// +// Evergreen Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEG] in { + +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; +defm DIV_eg : DIV_Common<RECIP_IEEE_eg>; + +def MULLO_INT_eg : MULLO_INT_Common<0x8F>; +def MULHI_INT_eg : MULHI_INT_Common<0x90>; +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +def : RsqPat<RECIPSQRT_IEEE_eg, f32>; +def SIN_eg : SIN_Common<0x8D>; +def COS_eg : COS_Common<0x8E>; + +def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; +def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; + +defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>; + +//===----------------------------------------------------------------------===// +// Memory read/write instructions +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1 in { + +// 32-bit store +def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, + (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr, $index_gpr, $eop", + [(global_store i32:$rw_gpr, i32:$index_gpr)] +>; + +// 64-bit store +def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, + (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", + [(global_store v2i32:$rw_gpr, i32:$index_gpr)] +>; + +//128-bit store +def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", + [(global_store v4i32:$rw_gpr, i32:$index_gpr)] +>; + +} // End usesCustomInserter = 1 + +class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : 
VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. + let SRC_SEL_X = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 1; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let MEGA_FETCH_COUNT = 2; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 4; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 + // %T2_X<def> = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 8; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// + +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End Predicates = [isEG] + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +// Should be predicated on FeatureFP64 +// def FMA_64 : R600_3OP < +// 0xA, "FMA_64", +// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] +// >; + +// BFE_UINT - bit_extract, an optimization for mask and shift +// Src0 = Input +// Src1 = Offset +// Src2 = Width +// +// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) +// +// Example Usage: +// (Offset, Width) +// +// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 +// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 +// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 +// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 +def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", + [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", + [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>; + +def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", + [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; +def : Pat<(i32 (sext_inreg i32:$src, i8)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; +def : Pat<(i32 (sext_inreg i32:$src, i16)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; + +defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>; + +def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], + VecALU +>; + +def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, 
i32:$src1, i32:$src2))], VecALU +>; + +def : UMad24Pat<MULADD_UINT24_eg>; + +def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; +def : ROTRPattern <BIT_ALIGN_INT_eg>; +def MULADD_eg : MULADD_Common<0x14>; +def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; +def FMA_eg : FMA_Common<0x7>; +def ASHR_eg : ASHR_Common<0x15>; +def LSHR_eg : LSHR_Common<0x16>; +def LSHL_eg : LSHL_Common<0x17>; +def CNDE_eg : CNDE_Common<0x19>; +def CNDGT_eg : CNDGT_Common<0x1A>; +def CNDGE_eg : CNDGE_Common<0x1B>; +def MUL_LIT_eg : MUL_LIT_Common<0x1F>; +def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; +def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU +>; +def DOT4_eg : DOT4_Common<0xBE>; +defm CUBE_eg : CUBE_Common<0xC0>; + +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; + +def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; +def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; + +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; + +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; +} + +def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; + +def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { + let Pattern = []; + let Itinerary = AnyALU; +} + +def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; + +def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { + let Pattern = []; +} + +def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; + +def GROUP_BARRIER : InstR600 < + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, + R600ALU_Word0, + R600ALU_Word1_OP2 <0x54> { + + let dst = 0; + let dst_rel = 0; + let src0 = 0; + let src0_rel = 0; + let src0_neg = 0; + let src0_abs = 0; + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let write = 0; + let omod = 0; + let clamp = 0; + let last = 1; + let bank_swizzle = 0; + let pred_sel = 0; + let update_exec_mask = 0; + let update_pred = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; +} + +def : Pat < + (int_AMDGPU_barrier_global), + (GROUP_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// LDS Instructions +//===----------------------------------------------------------------------===// +class R600_LDS <bits<6> op, dag outs, dag ins, string asm, + list<dag> pattern = []> : + + InstR600 <outs, ins, asm, pattern, XALU>, + R600_ALU_LDS_Word0, + R600LDS_Word1 { + + bits<6> offset = 0; + let lds_op = op; + + let Word1{27} = offset{0}; + let Word1{12} = offset{1}; + let Word1{28} = offset{2}; + let Word1{31} = offset{3}; + let Word0{12} = offset{4}; + let Word0{25} = offset{5}; + + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; + let HasNativeOperands = 1; + let UseNamedOperandTable = 1; +} + +class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS < + lds_op, + (outs R600_Reg32:$dst), + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last OQAP, $src0$src0_rel $pred_sel", + pattern + > { + + let src1 = 0; + let src1_rel = 0; + let src2 = 0; + let src2_rel = 0; + + let usesCustomInserter = 1; + let LDS_1A = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern, + 
string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", + pattern + > { + + field string BaseOp; + + let src2 = 0; + let src2_rel = 0; + let LDS_1A1D = 1; +} + +class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A1D <lds_op, (outs), name, pattern> { + let BaseOp = name; +} + +class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern, + string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), + " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", + pattern> { + + field string BaseOp; + + let LDS_1A1D = 0; + let LDS_1A2D = 1; +} + +class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs), name, pattern> { + let BaseOp = name; +} + +class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; +def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; +def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; +def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; +def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; +def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; +def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; +def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; +def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; +def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; +def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; +def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", + [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] +>; +def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", + [(truncstorei8_local i32:$src1, i32:$src0)] +>; +def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", + [(truncstorei16_local i32:$src1, i32:$src0)] +>; +def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", + [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] +>; +def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", + [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] +>; +def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", + [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] +>; +def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", + [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] +>; +def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", + [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", + [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", + 
[(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", + [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", + [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] +>; +def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", + [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] +>; +def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", + [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] +>; +def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", + [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] +>; +def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", + [(set i32:$dst, (sextloadi8_local i32:$src0))] +>; +def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET", + [(set i32:$dst, (az_extloadi8_local i32:$src0))] +>; +def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET", + [(set i32:$dst, (sextloadi16_local i32:$src0))] +>; +def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", + [(set i32:$dst, (az_extloadi16_local i32:$src0))] +>; + +// TRUNC is used for the FLT_TO_INT instructions to work around a +// perceived problem where the rounding modes are applied differently +// depending on the instruction and the slot they are in. +// See: +// https://bugs.freedesktop.org/show_bug.cgi?id=50232 +// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c +// +// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, +// which do not need to be truncated since the fp values are 0.0f or 1.0f. +// We should look into handling these cases separately. +def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; + +def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; + +// SHA-256 Patterns +def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; + +def EG_ExportSwz : ExportSwzInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : ExportPattern<EG_ExportSwz, 83>; + +def EG_ExportBuf : ExportBufInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>; + +def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "PUSH @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins 
i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; +} +def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; +} + +} // End Predicates = [isEGorCayman] diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 index 0000000..e811d5c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -0,0 +1,642 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { + OS.flush(); + printInstruction(MI, OS); + + printAnnotation(OS, Annot); +} + +void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + +void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " offen"; +} + +void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " idxen"; +} + +void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " addr64"; +} + +void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset1(const MCInst 
*MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " gds"; +} + +void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " glc"; +} + +void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " slc"; +} + +void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " tfe"; +} + +void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, + const MCRegisterInfo &MRI) { + switch (reg) { + case AMDGPU::VCC: + O << "vcc"; + return; + case AMDGPU::SCC: + O << "scc"; + return; + case AMDGPU::EXEC: + O << "exec"; + return; + case AMDGPU::M0: + O << "m0"; + return; + case AMDGPU::FLAT_SCR: + O << "flat_scratch"; + return; + case AMDGPU::VCC_LO: + O << "vcc_lo"; + return; + case AMDGPU::VCC_HI: + O << "vcc_hi"; + return; + case AMDGPU::EXEC_LO: + O << "exec_lo"; + return; + case AMDGPU::EXEC_HI: + O << "exec_hi"; + return; + case AMDGPU::FLAT_SCR_LO: + O << "flat_scratch_lo"; + return; + case AMDGPU::FLAT_SCR_HI: + O << "flat_scratch_hi"; + return; + default: + break; + } + + char Type; + unsigned NumRegs; + + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 3; + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 16; + } else { + O << getRegisterName(reg); + return; + } + + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
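+ // For example, a single SGPR with register index 3 prints as "s3", while a four-register VGPR tuple starting at index 8 prints as "v[8:11]".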
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (NumRegs == 1) { + O << Type << RegIdx; + return; + } + + O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +} + +void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else + O << "_e32 "; + + printOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == FloatToBits(0.0f)) + O << "0.0"; + else if (Imm == FloatToBits(1.0f)) + O << "1.0"; + else if (Imm == FloatToBits(-1.0f)) + O << "-1.0"; + else if (Imm == FloatToBits(0.5f)) + O << "0.5"; + else if (Imm == FloatToBits(-0.5f)) + O << "-0.5"; + else if (Imm == FloatToBits(2.0f)) + O << "2.0"; + else if (Imm == FloatToBits(-2.0f)) + O << "-2.0"; + else if (Imm == FloatToBits(4.0f)) + O << "4.0"; + else if (Imm == FloatToBits(-4.0f)) + O << "-4.0"; + else + O << formatHex(static_cast<uint64_t>(Imm)); +} + +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { + int64_t SImm = static_cast<int64_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == DoubleToBits(0.0)) + O << "0.0"; + else if (Imm == DoubleToBits(1.0)) + O << "1.0"; + else if (Imm == DoubleToBits(-1.0)) + O << "-1.0"; + else if (Imm == DoubleToBits(0.5)) + O << "0.5"; + else if (Imm == DoubleToBits(-0.5)) + O << "-0.5"; + else if (Imm == DoubleToBits(2.0)) + O << "2.0"; + else if (Imm == DoubleToBits(-2.0)) + O << "-2.0"; + else if (Imm == DoubleToBits(4.0)) + O << "4.0"; + else if (Imm == DoubleToBits(-4.0)) + O << "-4.0"; + else + llvm_unreachable("64-bit literal constants not supported"); +} + +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + switch (Op.getReg()) { + // This is the default predicate state, so we don't need to print it. + case AMDGPU::PRED_SEL_OFF: + break; + + default: + printRegOperand(Op.getReg(), O, MRI); + break; + } + } else if (Op.isImm()) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int RCID = Desc.OpInfo[OpNo].RegClass; + if (RCID != -1) { + const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); + if (ImmRC.getSize() == 4) + printImmediate32(Op.getImm(), O); + else if (ImmRC.getSize() == 8) + printImmediate64(Op.getImm(), O); + else + llvm_unreachable("Invalid register class size"); + } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { + printImmediate32(Op.getImm(), O); + } else { + // We hit this for the immediate instruction bits that don't yet have a + // custom printer. + // TODO: Eventually this should be unnecessary. + O << formatDec(Op.getImm()); + } + } else if (Op.isFPImm()) { + // We special case 0.0 because otherwise it will be printed as an integer. 
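+ // Non-zero FP immediates are printed from their IEEE bit patterns via printImmediate32/printImmediate64 below.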
+ if (Op.getFPImm() == 0.0) + O << "0.0"; + else { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); + + if (ImmRC.getSize() == 4) + printImmediate32(FloatToBits(Op.getFPImm()), O); + else if (ImmRC.getSize() == 8) + printImmediate64(DoubleToBits(Op.getFPImm()), O); + else + llvm_unreachable("Invalid register class size"); + } + } else if (Op.isExpr()) { + const MCExpr *Exp = Op.getExpr(); + Exp->print(O, &MAI); + } else { + llvm_unreachable("unknown operand type in printOperand"); + } +} + +void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & SISrcMods::NEG) + O << '-'; + if (InputModifiers & SISrcMods::ABS) + O << '|'; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & SISrcMods::ABS) + O << '|'; +} + +void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + + if (Imm == 2) { + O << "P0"; + } else if (Imm == 1) { + O << "P20"; + } else if (Imm == 0) { + O << "P10"; + } else { + llvm_unreachable("Invalid interpolation parameter slot"); + } +} + +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo + 1, O); +} + +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, StringRef Asm, + StringRef Default) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) { + O << Asm; + } else { + O << Default; + } +} + +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "|"); +} + +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "_SAT"); +} + +void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " clamp"; +} + +void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Imm = MI->getOperand(OpNo).getImm(); + if (Imm == SIOutMods::MUL2) + O << " mul:2"; + else if (Imm == SIOutMods::MUL4) + O << " mul:4"; + else if (Imm == SIOutMods::DIV2) + O << " div:2"; +} + +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int32_t Imm = MI->getOperand(OpNo).getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; +} + +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "*", " "); +} + +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "-"); +} + +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + switch (MI->getOperand(OpNo).getImm()) { + default: break; + case 1: + O << " * 2.0"; + break; + case 2: + O << " * 4.0"; + break; + case 3: + O << " / 2.0"; + break; + } +} + +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "+"); +} + +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "ExecMask,"); +} + +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "Pred,"); +} + +void AMDGPUInstPrinter::printWrite(const MCInst 
*MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() == 0) { + O << " (MASKED)"; + } +} + +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const char * chans = "XYZW"; + int sel = MI->getOperand(OpNo).getImm(); + + int chan = sel & 3; + sel >>= 2; + + if (sel >= 512) { + sel -= 512; + int cb = sel >> 12; + sel &= 4095; + O << cb << '[' << sel << ']'; + } else if (sel >= 448) { + sel -= 448; + O << sel; + } else if (sel >= 0){ + O << sel; + } + + if (sel >= 0) + O << '.' << chans[chan]; +} + +void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int BankSwizzle = MI->getOperand(OpNo).getImm(); + switch (BankSwizzle) { + case 1: + O << "BS:VEC_021/SCL_122"; + break; + case 2: + O << "BS:VEC_120/SCL_212"; + break; + case 3: + O << "BS:VEC_102/SCL_221"; + break; + case 4: + O << "BS:VEC_201"; + break; + case 5: + O << "BS:VEC_210"; + break; + default: + break; + } + return; +} + +void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Sel = MI->getOperand(OpNo).getImm(); + switch (Sel) { + case 0: + O << 'X'; + break; + case 1: + O << 'Y'; + break; + case 2: + O << 'Z'; + break; + case 3: + O << 'W'; + break; + case 4: + O << '0'; + break; + case 5: + O << '1'; + break; + case 7: + O << '_'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned CT = MI->getOperand(OpNo).getImm(); + switch (CT) { + case 0: + O << 'U'; + break; + case 1: + O << 'N'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int KCacheMode = MI->getOperand(OpNo).getImm(); + if (KCacheMode > 0) { + int KCacheBank = MI->getOperand(OpNo - 2).getImm(); + O << "CB" << KCacheBank << ':'; + int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); + int LineSize = (KCacheMode == 1) ? 16 : 32; + O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; + } +} + +void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Msg = SImm16 & 0xF; + if (Msg == 2 || Msg == 3) { + unsigned Op = (SImm16 >> 4) & 0xF; + if (Msg == 3) + O << "Gs_done("; + else + O << "Gs("; + if (Op == 0) { + O << "nop"; + } else { + unsigned Stream = (SImm16 >> 8) & 0x3; + if (Op == 1) + O << "cut"; + else if (Op == 2) + O << "emit"; + else if (Op == 3) + O << "emit-cut"; + O << " stream " << Stream; + } + O << "), [m0] "; + } else if (Msg == 1) + O << "interrupt "; + else if (Msg == 15) + O << "system "; + else + O << "unknown(" << Msg << ") "; +} + +void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs + // SIInsertWaits.cpp bits usage does not match ISA docs description but it + // works so it might be a misprint in docs. 
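+ // For example, a simm16 of 0x0770 decodes to vmcnt(0) with expcnt and lgkmcnt left at their "no wait" defaults (0x7), so only "vmcnt(0)" is printed.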
+ unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Vmcnt = SImm16 & 0xF; + unsigned Expcnt = (SImm16 >> 4) & 0xF; + unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; + + bool NeedSpace = false; + + if (Vmcnt != 0xF) { + O << "vmcnt(" << Vmcnt << ')'; + NeedSpace = true; + } + + if (Expcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "expcnt(" << Expcnt << ')'; + NeedSpace = true; + } + + if (Lgkmcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "lgkmcnt(" << Lgkmcnt << ')'; + } +} + +#include "AMDGPUGenAsmWriter.inc" diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 index 0000000..14fb511 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -0,0 +1,88 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class AMDGPUInstPrinter : public MCInstPrinter { +public: + AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + //Autogenerated by tblgen + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + static void printRegOperand(unsigned RegNo, raw_ostream &O, + const MCRegisterInfo &MRI); + +private: + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); + void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printImmediate32(uint32_t I, raw_ostream &O); + void printImmediate64(uint64_t I, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream 
&O); + static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, + StringRef Asm, StringRef Default = ""); + static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O); + static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp new file mode 100644 index 0000000..8bed2de --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -0,0 +1,144 @@ +//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +namespace { + +class AMDGPUMCObjectWriter : public MCObjectWriter { +public: + AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} + void executePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override { + //XXX: Implement if necessary. 
+ } + void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, bool &IsPCRel, + uint64_t &FixedValue) override { + assert(!"Not implemented"); + } + + void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + +}; + +class AMDGPUAsmBackend : public MCAsmBackend { +public: + AMDGPUAsmBackend(const Target &T) + : MCAsmBackend() {} + + unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { + assert(!"Not implemented"); + } + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; +}; + +} //End anonymous namespace + +void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { + Asm.writeSectionData(&*I, Layout); + } +} + +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("Unknown fixup kind"); + case AMDGPU::fixup_si_sopp_br: { + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + *Dst = (Value - 4) / 4; + break; + } + + case AMDGPU::fixup_si_rodata: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + *Dst = Value; + break; + } + + case AMDGPU::fixup_si_end_of_text: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + // The value points to the last instruction in the text section, so we + // need to add 4 bytes to get to the start of the constants. 
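+ // (This assumes the final instruction is 4 bytes wide, so Value + 4 is the first byte past the text section.)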
+ *Dst = Value + 4; + break; + } + } +} + +const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( + MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_si_rodata", 0, 32, 0 }, + { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + return Infos[Kind - FirstTargetFixupKind]; +} + +bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + OW->WriteZeros(Count); + + return true; +} + +//===----------------------------------------------------------------------===// +// ELFAMDGPUAsmBackend class +//===----------------------------------------------------------------------===// + +namespace { + +class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { +public: + ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAMDGPUELFObjectWriter(OS); + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + return new ELFAMDGPUAsmBackend(T); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp new file mode 100644 index 0000000..59f45ff --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -0,0 +1,39 @@ +//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" + +using namespace llvm; + +namespace { + +class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { +public: + AMDGPUELFObjectWriter(); +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override { + return Fixup.getKind(); + } + +}; + + +} // End anonymous namespace + +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() + : MCELFObjectTargetWriter(false, 0, 0, false) { } + +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_pwrite_stream &OS) { + MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); + return createELFObjectWriter(MOTW, OS, true); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h new file mode 100644 index 0000000..fa3b3c3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -0,0 +1,34 @@ +//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AMDGPU { +enum Fixups { + /// 16-bit PC relative fixup for SOPP branch instructions. + fixup_si_sopp_br = FirstTargetFixupKind, + + /// fixup for global addresses with constant initializers + fixup_si_rodata, + + /// fixup for offset from instruction to end of text section + fixup_si_end_of_text, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} // namespace AMDGPU +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp new file mode 100644 index 0000000..028a86d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -0,0 +1,43 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCAsmInfo.h" + +using namespace llvm; +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { + HasSingleParameterDotFile = false; + //===------------------------------------------------------------------===// + MaxInstLength = 16; + SeparatorString = "\n"; + CommentString = ";"; + PrivateLabelPrefix = ""; + InlineAsmStart = ";#ASMSTART"; + InlineAsmEnd = ";#ASMEND"; + + //===--- Data Emission Directives -------------------------------------===// + ZeroDirective = ".zero"; + AsciiDirective = ".ascii\t"; + AscizDirective = ".asciz\t"; + Data8bitsDirective = ".byte\t"; + Data16bitsDirective = ".short\t"; + Data32bitsDirective = ".long\t"; + Data64bitsDirective = ".quad\t"; + SunStyleELFSectionSwitchSyntax = true; + UsesELFSectionDirectiveForBSS = true; + + //===--- Global Variable Emission Directives --------------------------===// + HasAggressiveSymbolFolding = true; + COMMDirectiveAlignmentIsInBytes = false; + HasDotTypeDotSizeDirective = false; + HasNoDeadStrip = true; + WeakRefDirective = ".weakref\t"; + //===--- Dwarf Emission Directives -----------------------------------===// + SupportsDebugInformation = true; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h new file mode 100644 index 0000000..a5bac51 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -0,0 +1,32 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H + +#include "llvm/MC/MCAsmInfoELF.h" +namespace llvm { + +class Triple; + +// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, +// you will need to make sure your new class sets PrivateGlobalPrefix to +// a prefix that won't appear in a function name. The default value +// for PrivateGlobalPrefix is 'L', so it will consider any function starting +// with 'L' as a local symbol. +class AMDGPUMCAsmInfo : public MCAsmInfoELF { +public: + explicit AMDGPUMCAsmInfo(const Triple &TT); +}; +} // namespace llvm +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp new file mode 100644 index 0000000..521b3b3 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -0,0 +1,21 @@ +//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCCodeEmitter.h" + +using namespace llvm; + +// pin vtable to this file +void AMDGPUMCCodeEmitter::anchor() {} + diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 index 0000000..c957427 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -0,0 +1,50 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H + +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCInst; +class MCOperand; +class MCSubtargetInfo; + +class AMDGPUMCCodeEmitter : public MCCodeEmitter { + virtual void anchor(); +public: + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp new file mode 100644 index 0000000..a7d3dd1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -0,0 +1,90 @@ +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides AMDGPU specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "AMDGPUMCAsmInfo.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AMDGPUGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AMDGPUGenRegisterInfo.inc" + +static MCInstrInfo *createAMDGPUMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAMDGPUMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo * +createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + MCSubtargetInfo * X = new MCSubtargetInfo(); + InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + return new AMDGPUInstPrinter(MAI, MII, MRI); +} + +extern "C" void LLVMInitializeAMDGPUTargetMC() { + for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { + 
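+    // The R600 target (TheAMDGPUTarget) and the GCN target (TheGCNTarget)
+    // share all of the MC components registered in this loop; only the MC
+    // code emitters differ and are registered per target below.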
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); + + TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + } + + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, + createR600MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h new file mode 100644 index 0000000..ac611b8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -0,0 +1,61 @@ +//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Provides AMDGPU specific target descriptions. +// +//===----------------------------------------------------------------------===// +// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include "llvm/ADT/StringRef.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; +class Triple; +class raw_pwrite_stream; +class raw_ostream; + +extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; + +MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); + +MCObjectWriter *createAMDGPUELFObjectWriter(raw_pwrite_stream &OS); +} // namespace llvm + +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 index 0000000..e683498 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -0,0 +1,181 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// \brief The R600 code emitter produces machine code that can be executed +/// directly on the GPU device. 
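+///
+/// Each VTX or TEX fetch instruction is emitted as a 64-bit encoding word,
+/// a 32-bit word, and a 32-bit pad, while a plain ALU instruction is a
+/// single little-endian 64-bit word (see encodeInstruction below).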
+// +//===----------------------------------------------------------------------===// + +#include "R600Defines.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { + R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; + void operator=(const R600MCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + +public: + + R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) + : MCII(mcii), MRI(mri) { } + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; +private: + + void EmitByte(unsigned int byte, raw_ostream &OS) const; + + void Emit(uint32_t value, raw_ostream &OS) const; + void Emit(uint64_t value, raw_ostream &OS) const; + + unsigned getHWRegChan(unsigned reg) const; + unsigned getHWReg(unsigned regNo) const; + +}; + +} // End anonymous namespace + +enum RegElement { + ELEMENT_X = 0, + ELEMENT_Y, + ELEMENT_Z, + ELEMENT_W +}; + +enum FCInstr { + FC_IF_PREDICATE = 0, + FC_ELSE, + FC_ENDIF, + FC_BGNLOOP, + FC_ENDLOOP, + FC_BREAK_PREDICATE, + FC_CONTINUE +}; + +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new R600MCCodeEmitter(MCII, MRI); +} + +void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::FETCH_CLAUSE || + MI.getOpcode() == AMDGPU::ALU_CLAUSE || + MI.getOpcode() == AMDGPU::BUNDLE || + MI.getOpcode() == AMDGPU::KILL) { + return; + } else if (IS_VTX(Desc)) { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { + InstWord2 |= 1 << 19; // Mega-Fetch bit + } + + Emit(InstWord01, OS); + Emit(InstWord2, OS); + Emit((uint32_t) 0, OS); + } else if (IS_TEX(Desc)) { + int64_t Sampler = MI.getOperand(14).getImm(); + + int64_t SrcSelect[4] = { + MI.getOperand(2).getImm(), + MI.getOperand(3).getImm(), + MI.getOperand(4).getImm(), + MI.getOperand(5).getImm() + }; + int64_t Offsets[3] = { + MI.getOperand(6).getImm() & 0x1F, + MI.getOperand(7).getImm() & 0x1F, + MI.getOperand(8).getImm() & 0x1F + }; + + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | + SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | + SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | + Offsets[2] << 10; + + Emit(Word01, OS); + Emit(Word2, OS); + Emit((uint32_t) 0, OS); + } else { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); + if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && + 
((Desc.TSFlags & R600_InstFlag::OP1) || + Desc.TSFlags & R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst & (0x3FFULL << 39); + Inst &= ~(0x3FFULL << 39); + Inst |= ISAOpCode << 1; + } + Emit(Inst, OS); + } +} + +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { + OS.write((uint8_t) Byte & 0xff); +} + +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { + support::endian::Writer<support::little>(OS).write(Value); +} + +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { + support::endian::Writer<support::little>(OS).write(Value); +} + +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { + return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { + return MRI.getEncodingValue(RegNo) & HW_REG_MASK; +} + +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixup, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) + return MRI.getEncodingValue(MO.getReg()); + return getHWReg(MO.getReg()); + } + + assert(MO.isImm()); + return MO.getImm(); +} + +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 index 0000000..65a0eeb --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -0,0 +1,289 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The SI code emitter produces machine code that can be executed +/// directly on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { + SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; + void operator=(const SIMCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + MCContext &Ctx; + + /// \brief Can this operand also contain immediate values? + bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; + + /// \brief Encode an fp or int literal + uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; + +public: + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + MCContext &ctx) + : MCII(mcii), MRI(mri), Ctx(ctx) { } + + ~SIMCCodeEmitter() override {} + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. 
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// instructions. + unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; +}; + +} // End anonymous namespace + +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new SIMCCodeEmitter(MCII, MRI, Ctx); +} + +bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, + unsigned OpNo) const { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + + return OpType == AMDGPU::OPERAND_REG_IMM32 || + OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +// Returns the encoding value to use if the given integer is an integer inline +// immediate value, or 0 if it is not. +template <typename IntTy> +static uint32_t getIntInlineImmEncoding(IntTy Imm) { + if (Imm >= 0 && Imm <= 64) + return 128 + Imm; + + if (Imm >= -16 && Imm <= -1) + return 192 + std::abs(Imm); + + return 0; +} + +static uint32_t getLit32Encoding(uint32_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == FloatToBits(0.5f)) + return 240; + + if (Val == FloatToBits(-0.5f)) + return 241; + + if (Val == FloatToBits(1.0f)) + return 242; + + if (Val == FloatToBits(-1.0f)) + return 243; + + if (Val == FloatToBits(2.0f)) + return 244; + + if (Val == FloatToBits(-2.0f)) + return 245; + + if (Val == FloatToBits(4.0f)) + return 246; + + if (Val == FloatToBits(-4.0f)) + return 247; + + return 255; +} + +static uint32_t getLit64Encoding(uint64_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == DoubleToBits(0.5)) + return 240; + + if (Val == DoubleToBits(-0.5)) + return 241; + + if (Val == DoubleToBits(1.0)) + return 242; + + if (Val == DoubleToBits(-1.0)) + return 243; + + if (Val == DoubleToBits(2.0)) + return 244; + + if (Val == DoubleToBits(-2.0)) + return 245; + + if (Val == DoubleToBits(4.0)) + return 246; + + if (Val == DoubleToBits(-4.0)) + return 247; + + return 255; +} + +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, + unsigned OpSize) const { + if (MO.isExpr()) + return 255; + + assert(!MO.isFPImm()); + + if (!MO.isImm()) + return ~0; + + if (OpSize == 4) + return getLit32Encoding(static_cast<uint32_t>(MO.getImm())); + + assert(OpSize == 8); + + return getLit64Encoding(static_cast<uint64_t>(MO.getImm())); +} + +void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + unsigned bytes = Desc.getSize(); + + for (unsigned i = 0; i < bytes; i++) { + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + } + + if (bytes > 4) + return; + + // Check for additional literals in SRC0/1/2 (Op 1/2/3) + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + + // Check if this operand should be encoded as [SV]Src + if (!isSrcOperand(Desc, i)) + continue; + + int RCID = Desc.OpInfo[i].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + // Is this operand a literal immediate? 
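+    // For example, per getLit32Encoding above: an immediate of 5 encodes
+    // inline as 128 + 5 = 133, -7 as 192 + 7 = 199, and 1.0f as 242; an
+    // arbitrary constant such as 0x12345678 has no inline form, so the
+    // encoding is 255 and the raw 32-bit value is appended below.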
+ const MCOperand &Op = MI.getOperand(i); + if (getLitEncoding(Op, RC.getSize()) != 255) + continue; + + // Yes! Encode it + int64_t Imm = 0; + + if (Op.isImm()) + Imm = Op.getImm(); + else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. + llvm_unreachable("Must be immediate or expr"); + + for (unsigned j = 0; j < 4; j++) { + OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); + } + + // Only one literal value allowed + break; + } +} + +unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + return getMachineOpValue(MI, MO, Fixups, STI); +} + +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return MRI.getEncodingValue(MO.getReg()); + + if (MO.isExpr()) { + const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); + MCFixupKind Kind; + const MCSymbol *Sym = + Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + + if (&Expr->getSymbol() == Sym) { + // Add the offset to the beginning of the constant values. + Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; + } else { + // This is used for constant data stored in .rodata. + Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; + } + Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); + } + + // Figure out the operand number, needed for isSrcOperand check + unsigned OpNo = 0; + for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { + if (&MO == &MI.getOperand(OpNo)) + break; + } + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (isSrcOperand(Desc, OpNo)) { + int RCID = Desc.OpInfo[OpNo].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + uint32_t Enc = getLitEncoding(MO, RC.getSize()); + if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + return Enc; + + } else if (MO.isImm()) + return MO.getImm(); + + llvm_unreachable("Encoding of this operand type is not supported yet."); + return 0; +} + diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td new file mode 100644 index 0000000..c0ffede --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -0,0 +1,137 @@ +//===-- Processors.td - R600 Processor definitions ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> +: Processor<Name, itin, Features>; + +//===----------------------------------------------------------------------===// +// R600 +//===----------------------------------------------------------------------===// +def : Proc<"", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache]>; + +def : Proc<"r600", R600_VLIW5_Itin, + [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"r630", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rs880", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize16]>; + +def : Proc<"rv670", R600_VLIW5_Itin, + [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// R700 +//===----------------------------------------------------------------------===// + +def : Proc<"rv710", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv730", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv770", R600_VLIW5_Itin, + [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Evergreen +//===----------------------------------------------------------------------===// + +def : Proc<"cedar", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, + FeatureCFALUBug]>; + +def : Proc<"redwood", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, + FeatureCFALUBug]>; + +def : Proc<"sumo", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; + +def : Proc<"juniper", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"cypress", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureFP64, FeatureVertexCache, + FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Northern Islands +//===----------------------------------------------------------------------===// + +def : Proc<"barts", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"turks", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"caicos", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureCFALUBug]>; + +def : Proc<"cayman", R600_VLIW4_Itin, + [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; + +//===----------------------------------------------------------------------===// +// Southern Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"SI", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +//===----------------------------------------------------------------------===// +// Sea Islands 
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount32]
+>;
+
+def : ProcessorModel<"kabini", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount16]
+>;
+
+def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount32]
+>;
+
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
+  [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32]
+>;
+
+def : ProcessorModel<"mullins", SIQuarterSpeedModel,
+  [FeatureSeaIslands, FeatureLDSBankCount16]>;
+
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureSGPRInitBug]
+>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureSGPRInitBug]
+>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
new file mode 100644
index 0000000..3cb9021
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -0,0 +1,206 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The R600EmitClauseMarkers pass emits CFAlu instructions in a conservative
+/// manner. This pass merges consecutive CFAlus where applicable.
+/// It needs to be called after IfCvt for best results.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600mergeclause"
+
+namespace {
+
+static bool isCFAlu(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::CF_ALU:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned getCFAluSize(const MachineInstr *MI) const;
+  bool isCFAluEnabled(const MachineInstr *MI) const;
+
+  /// The IfCvt pass can generate "disabled" ALU clause markers that need to
+  /// be removed, with their contents merged into the previous ALU clause.
+  /// This function parses the instructions following CFAlu until it finds a
+  /// disabled CFAlu, whose contents it merges, or an enabled CFAlu.
+  void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
+
+  /// Check whether LatrCFAlu can be merged into RootCFAlu and, if so,
+  /// perform the merge.
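+  /// Merging succeeds only if the combined instruction count fits within a
+  /// single clause and the two markers' KCache bank/line usage is
+  /// compatible.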
+ bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) + const; + +public: + R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override; +}; + +char R600ClauseMergePass::ID = 0; + +unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); +} + +bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); +} + +void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) + const { + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); + I++; + do { + while (I!= E && !isCFAlu(I)) + I++; + if (I == E) + return; + MachineInstr *MI = I++; + if (isCFAluEnabled(MI)) + break; + CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); + MI->eraseFromParent(); + } while (I != E); +} + +bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, + const MachineInstr *LatrCFAlu) const { + assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + unsigned RootInstCount = getCFAluSize(RootCFAlu), + LaterInstCount = getCFAluSize(LatrCFAlu); + unsigned CumuledInsts = RootInstCount + LaterInstCount; + if (CumuledInsts >= TII->getMaxAlusPerClause()) { + DEBUG(dbgs() << "Excess inst counts\n"); + return false; + } + if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return false; + // Is KCache Bank 0 compatible ? + int Mode0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + int KBank0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + int KBank0LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + if (LatrCFAlu->getOperand(Mode0Idx).getImm() && + RootCFAlu->getOperand(Mode0Idx).getImm() && + (LatrCFAlu->getOperand(KBank0Idx).getImm() != + RootCFAlu->getOperand(KBank0Idx).getImm() || + LatrCFAlu->getOperand(KBank0LineIdx).getImm() != + RootCFAlu->getOperand(KBank0LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + // Is KCache Bank 1 compatible ? 
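+  // Same rule as for bank 0: if both clauses lock KCache bank 1, their
+  // bank and line addresses must match, otherwise merging would clobber
+  // constants that one of the clauses reads.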
+  int Mode1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+  int KBank1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+  int KBank1LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
+      RootCFAlu->getOperand(Mode1Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
+       RootCFAlu->getOperand(KBank1Idx).getImm() ||
+       LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
+       RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC1\n");
+    return false;
+  }
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
+    RootCFAlu->getOperand(Mode0Idx).setImm(
+        LatrCFAlu->getOperand(Mode0Idx).getImm());
+    RootCFAlu->getOperand(KBank0Idx).setImm(
+        LatrCFAlu->getOperand(KBank0Idx).getImm());
+    RootCFAlu->getOperand(KBank0LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+  }
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
+    RootCFAlu->getOperand(Mode1Idx).setImm(
+        LatrCFAlu->getOperand(Mode1Idx).getImm());
+    RootCFAlu->getOperand(KBank1Idx).setImm(
+        LatrCFAlu->getOperand(KBank1Idx).getImm());
+    RootCFAlu->getOperand(KBank1LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+  }
+  RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
+  RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+  return true;
+}
+
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+       BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+    MachineBasicBlock::iterator LatestCFAlu = E;
+    while (I != E) {
+      MachineInstr *MI = I++;
+      if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+          TII->mustBeLastInClause(MI->getOpcode()))
+        LatestCFAlu = E;
+      if (!isCFAlu(MI))
+        continue;
+      cleanPotentialDisabledCFAlu(MI);
+
+      if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
+        MI->eraseFromParent();
+      } else {
+        assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+        LatestCFAlu = MI;
+      }
+    }
+  }
+  return false;
+}
+
+const char *R600ClauseMergePass::getPassName() const {
+  return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+  return new R600ClauseMergePass(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
new file mode 100644
index 0000000..c8f37f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -0,0 +1,679 @@
+//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass turns all control flow pseudo instructions into native ones,
+/// computing their addresses on the fly; it also sets the STACK_SIZE info.
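+///
+/// The required stack depth is tracked by the CFStack helper below: full
+/// entries plus one entry per four sub-entries, rounded up (see
+/// CFStack::updateMaxStackSize()).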
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600cf"
+
+namespace {
+
+struct CFStack {
+
+  enum StackItem {
+    ENTRY = 0,
+    SUB_ENTRY = 1,
+    FIRST_NON_WQM_PUSH = 2,
+    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+  };
+
+  const AMDGPUSubtarget *ST;
+  std::vector<StackItem> BranchStack;
+  std::vector<StackItem> LoopStack;
+  unsigned MaxStackSize;
+  unsigned CurrentEntries;
+  unsigned CurrentSubEntries;
+
+  CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
+      // We need to reserve a stack entry for CALL_FS in vertex shaders.
+      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+      CurrentEntries(0), CurrentSubEntries(0) { }
+
+  unsigned getLoopDepth();
+  bool branchStackContains(CFStack::StackItem);
+  bool requiresWorkAroundForInst(unsigned Opcode);
+  unsigned getSubEntrySize(CFStack::StackItem Item);
+  void updateMaxStackSize();
+  void pushBranch(unsigned Opcode, bool isWQM = false);
+  void pushLoop();
+  void popBranch();
+  void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+  return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+       E = BranchStack.end(); I != E; ++I) {
+    if (*I == Item)
+      return true;
+  }
+  return false;
+}
+
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
+      getLoopDepth() > 1)
+    return true;
+
+  if (!ST->hasCFAluBug())
+    return false;
+
+  switch (Opcode) {
+  default: return false;
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+  case AMDGPU::CF_ALU_ELSE_AFTER:
+  case AMDGPU::CF_ALU_BREAK:
+  case AMDGPU::CF_ALU_CONTINUE:
+    if (CurrentSubEntries == 0)
+      return false;
+    if (ST->getWavefrontSize() == 64) {
+      // We are being conservative here. We only require this work-around if
+      // CurrentSubEntries > 3 &&
+      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+      //
+      // We have to be conservative, because we don't know for certain that
+      // our stack allocation algorithm for Evergreen/NI is correct. Applying this
+      // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
+      // resources without any problems.
+      return CurrentSubEntries > 3;
+    } else {
+      assert(ST->getWavefrontSize() == 32);
+      // We are being conservative here. We only require the work-around if
+      // CurrentSubEntries > 7 &&
+      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+      // See the comment on the wavefront size == 64 case for why we are
+      // being conservative.
+      return CurrentSubEntries > 7;
+    }
+  }
+}
+
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+  switch (Item) {
+  default:
+    return 0;
+  case CFStack::FIRST_NON_WQM_PUSH:
+    assert(!ST->hasCaymanISA());
+    if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
+      // +1 For the push operation.
+      // +2 Extra space required.
+      return 3;
+    } else {
+      // Some documentation says that this is not necessary on Evergreen,
+      // but experimentation has shown that we need to allocate 1 extra
+      // sub-entry for the first non-WQM push.
+ // +1 For the push operation. + // +1 Extra space required. + return 2; + } + case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + // +1 For the push operation. + // +1 Extra space required. + return 2; + case CFStack::SUB_ENTRY: + return 1; + } +} + +void CFStack::updateMaxStackSize() { + unsigned CurrentStackSize = CurrentEntries + + (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + MaxStackSize = std::max(CurrentStackSize, MaxStackSize); +} + +void CFStack::pushBranch(unsigned Opcode, bool isWQM) { + CFStack::StackItem Item = CFStack::ENTRY; + switch(Opcode) { + case AMDGPU::CF_PUSH_EG: + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (!isWQM) { + if (!ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI + // See comment in + // CFStack::getSubEntrySize() + else if (CurrentEntries > 0 && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) + Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; + else + Item = CFStack::SUB_ENTRY; + } else + Item = CFStack::ENTRY; + break; + } + BranchStack.push_back(Item); + if (Item == CFStack::ENTRY) + CurrentEntries++; + else + CurrentSubEntries += getSubEntrySize(Item); + updateMaxStackSize(); +} + +void CFStack::pushLoop() { + LoopStack.push_back(CFStack::ENTRY); + CurrentEntries++; + updateMaxStackSize(); +} + +void CFStack::popBranch() { + CFStack::StackItem Top = BranchStack.back(); + if (Top == CFStack::ENTRY) + CurrentEntries--; + else + CurrentSubEntries-= getSubEntrySize(Top); + BranchStack.pop_back(); +} + +void CFStack::popLoop() { + CurrentEntries--; + LoopStack.pop_back(); +} + +class R600ControlFlowFinalizer : public MachineFunctionPass { + +private: + typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; + + enum ControlFlowInstruction { + CF_TC, + CF_VC, + CF_CALL_FS, + CF_WHILE_LOOP, + CF_END_LOOP, + CF_LOOP_BREAK, + CF_LOOP_CONTINUE, + CF_JUMP, + CF_ELSE, + CF_POP, + CF_END + }; + + static char ID; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + unsigned MaxFetchInst; + const AMDGPUSubtarget *ST; + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + return true; + default: + return false; + } + } + + const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { + unsigned Opcode = 0; + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + switch (CFI) { + case CF_TC: + Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + break; + case CF_VC: + Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + break; + case CF_CALL_FS: + Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + break; + case CF_WHILE_LOOP: + Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + break; + case CF_END_LOOP: + Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + break; + case CF_LOOP_BREAK: + Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + break; + case CF_LOOP_CONTINUE: + Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + break; + case CF_JUMP: + Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + break; + case CF_ELSE: + Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + break; + case CF_POP: + Opcode = isEg ? 
AMDGPU::POP_EG : AMDGPU::POP_R600; + break; + case CF_END: + if (ST->hasCaymanISA()) { + Opcode = AMDGPU::CF_END_CM; + break; + } + Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + break; + } + assert (Opcode && "No opcode selected"); + return TII->get(Opcode); + } + + bool isCompatibleWithClause(const MachineInstr *MI, + std::set<unsigned> &DstRegs) const { + unsigned DstMI, SrcMI; + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + const MachineOperand &MO = *I; + if (!MO.isReg()) + continue; + if (MO.isDef()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + DstMI = Reg; + else + DstMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + if (MO.isUse()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + SrcMI = Reg; + else + SrcMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + } + if ((DstRegs.find(SrcMI) == DstRegs.end())) { + DstRegs.insert(DstMI); + return true; + } else + return false; + } + + ClauseFile + MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<MachineInstr *> ClauseContent; + unsigned AluInstCount = 0; + bool IsTex = TII->usesTextureCache(ClauseHead); + std::set<unsigned> DstRegs; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (AluInstCount >= MaxFetchInst) + break; + if ((IsTex && !TII->usesTextureCache(I)) || + (!IsTex && !TII->usesVertexCache(I))) + break; + if (!isCompatibleWithClause(I, DstRegs)) + break; + AluInstCount ++; + ClauseContent.push_back(I); + } + MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), + getHWInstrDesc(IsTex?CF_TC:CF_VC)) + .addImm(0) // ADDR + .addImm(AluInstCount - 1); // COUNT + return ClauseFile(MIb, std::move(ClauseContent)); + } + + void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const { + static const unsigned LiteralRegs[] = { + AMDGPU::ALU_LITERAL_X, + AMDGPU::ALU_LITERAL_Y, + AMDGPU::ALU_LITERAL_Z, + AMDGPU::ALU_LITERAL_W + }; + const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs = + TII->getSrcs(MI); + for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { + if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) + continue; + int64_t Imm = Srcs[i].second; + std::vector<int64_t>::iterator It = + std::find(Lits.begin(), Lits.end(), Imm); + if (It != Lits.end()) { + unsigned Index = It - Lits.begin(); + Srcs[i].first->setReg(LiteralRegs[Index]); + } else { + assert(Lits.size() < 4 && "Too many literals in Instruction Group"); + Srcs[i].first->setReg(LiteralRegs[Lits.size()]); + Lits.push_back(Imm); + } + } + } + + MachineBasicBlock::iterator insertLiterals( + MachineBasicBlock::iterator InsertPos, + const std::vector<unsigned> &Literals) const { + MachineBasicBlock *MBB = InsertPos->getParent(); + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned LiteralPair0 = Literals[i]; + unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; + InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(LiteralPair0) + .addImm(LiteralPair1); + } + return InsertPos; + } + + ClauseFile + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + 
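+    // Starting from the CF_ALU marker at ClauseHead, collect the ALU
+    // instructions of the clause: bundles are unbundled, literal operands
+    // are assigned to the ALU_LITERAL_{X,Y,Z,W} slots via getLiteral(),
+    // and LITERALS pseudo instructions carrying the packed immediate pairs
+    // are appended after each group.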
std::vector<MachineInstr *> ClauseContent; + I++; + for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { + if (IsTrivialInst(I)) { + ++I; + continue; + } + if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) + break; + std::vector<int64_t> Literals; + if (I->isBundle()) { + MachineInstr *DeleteMI = I; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + while (++BI != E && BI->isBundledWithPred()) { + BI->unbundleFromPred(); + for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = BI->getOperand(i); + if (MO.isReg() && MO.isInternalRead()) + MO.setIsInternalRead(false); + } + getLiteral(BI, Literals); + ClauseContent.push_back(BI); + } + I = BI; + DeleteMI->eraseFromParent(); + } else { + getLiteral(I, Literals); + ClauseContent.push_back(I); + I++; + } + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned literal0 = Literals[i]; + unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; + MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(literal0) + .addImm(literal2); + ClauseContent.push_back(MILit); + } + } + assert(ClauseContent.size() < 128 && "ALU clause is too big"); + ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(ClauseHead, std::move(ClauseContent)); + } + + void + EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += 2 * Clause.second.size(); + } + + void + EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + Clause.first->getOperand(0).setImm(0); + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += Clause.second.size(); + } + + void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { + MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); + } + void CounterPropagateAddr(const std::set<MachineInstr *> &MIs, + unsigned Addr) const { + for (MachineInstr *MI : MIs) { + CounterPropagateAddr(MI, Addr); + } + } + +public: + R600ControlFlowFinalizer(TargetMachine &tm) + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + ST = &MF.getSubtarget<AMDGPUSubtarget>(); + MaxFetchInst = ST->getTexVTXClauseSize(); + TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo()); + TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + CFStack CFStack(ST, MFI->getShaderType()); + for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; + ++MB) { + MachineBasicBlock &MBB = *MB; + unsigned CfCount = 0; + std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; + std::vector<MachineInstr * > IfThenElseStack; + if (MFI->getShaderType() == ShaderType::VERTEX) { + BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), + getHWInstrDesc(CF_CALL_FS)); + CfCount++; + } + 
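+      // Vertex shaders start with a CF_CALL_FS, which is why the CFStack
+      // constructor reserves one stack entry for them; CfCount tracks the
+      // running control flow address used to patch jump targets below.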
std::vector<ClauseFile> FetchClauses, AluClauses; + std::vector<MachineInstr *> LastAlu(1); + std::vector<MachineInstr *> ToPopAfter; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E;) { + if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { + DEBUG(dbgs() << CfCount << ":"; I->dump();); + FetchClauses.push_back(MakeFetchClause(MBB, I)); + CfCount++; + LastAlu.back() = nullptr; + continue; + } + + MachineBasicBlock::iterator MI = I; + if (MI->getOpcode() != AMDGPU::ENDIF) + LastAlu.back() = nullptr; + if (MI->getOpcode() == AMDGPU::CF_ALU) + LastAlu.back() = MI; + I++; + bool RequiresWorkAround = + CFStack.requiresWorkAroundForInst(MI->getOpcode()); + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (RequiresWorkAround) { + DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) + .addImm(CfCount + 1) + .addImm(1); + MI->setDesc(TII->get(AMDGPU::CF_ALU)); + CfCount++; + CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + } else + CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + + case AMDGPU::CF_ALU: + I = MI; + AluClauses.push_back(MakeALUClause(MBB, I)); + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + break; + case AMDGPU::WHILELOOP: { + CFStack.pushLoop(); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_WHILE_LOOP)) + .addImm(1); + std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, + std::set<MachineInstr *>()); + Pair.second.insert(MIb); + LoopStack.push_back(std::move(Pair)); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDLOOP: { + CFStack.popLoop(); + std::pair<unsigned, std::set<MachineInstr *> > Pair = + std::move(LoopStack.back()); + LoopStack.pop_back(); + CounterPropagateAddr(Pair.second, CfCount); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) + .addImm(Pair.first + 1); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::IF_PREDICATE_SET: { + LastAlu.push_back(nullptr); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_JUMP)) + .addImm(0) + .addImm(0); + IfThenElseStack.push_back(MIb); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ELSE: { + MachineInstr * JumpInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(JumpInst, CfCount); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_ELSE)) + .addImm(0) + .addImm(0); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + IfThenElseStack.push_back(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDIF: { + CFStack.popBranch(); + if (LastAlu.back()) { + ToPopAfter.push_back(LastAlu.back()); + } else { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_POP)) + .addImm(CfCount + 1) + .addImm(1); + (void)MIb; + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + CfCount++; + } + + MachineInstr *IfOrElseInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(IfOrElseInst, CfCount); + IfOrElseInst->getOperand(1).setImm(1); + LastAlu.pop_back(); + MI->eraseFromParent(); + break; + } + case AMDGPU::BREAK: { + CfCount ++; + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_BREAK)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + break; + } + case AMDGPU::CONTINUE: { + MachineInstr *MIb = BuildMI(MBB, MI, 
MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_CONTINUE)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::RETURN: { + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); + CfCount++; + MI->eraseFromParent(); + if (CfCount % 2) { + BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); + CfCount++; + } + for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) + EmitFetchClause(I, FetchClauses[i], CfCount); + for (unsigned i = 0, e = AluClauses.size(); i < e; i++) + EmitALUClause(I, AluClauses[i], CfCount); + } + default: + if (TII->isExport(MI->getOpcode())) { + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + } + break; + } + } + for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { + MachineInstr *Alu = ToPopAfter[i]; + BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), + TII->get(AMDGPU::CF_ALU_POP_AFTER)) + .addImm(Alu->getOperand(0).getImm()) + .addImm(Alu->getOperand(1).getImm()) + .addImm(Alu->getOperand(2).getImm()) + .addImm(Alu->getOperand(3).getImm()) + .addImm(Alu->getOperand(4).getImm()) + .addImm(Alu->getOperand(5).getImm()) + .addImm(Alu->getOperand(6).getImm()) + .addImm(Alu->getOperand(7).getImm()) + .addImm(Alu->getOperand(8).getImm()); + Alu->eraseFromParent(); + } + MFI->StackSize = CFStack.MaxStackSize; + } + + return false; + } + + const char *getPassName() const override { + return "R600 Control Flow Finalizer Pass"; + } +}; + +char R600ControlFlowFinalizer::ID = 0; + +} // end anonymous namespace + + +llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { + return new R600ControlFlowFinalizer(TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Defines.h b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h new file mode 100644 index 0000000..6ff0a22 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h @@ -0,0 +1,171 @@ +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H +#define LLVM_LIB_TARGET_R600_R600DEFINES_H + +#include "llvm/MC/MCRegisterInfo.h" + +// Operand Flags +#define MO_FLAG_CLAMP (1 << 0) +#define MO_FLAG_NEG (1 << 1) +#define MO_FLAG_ABS (1 << 2) +#define MO_FLAG_MASK (1 << 3) +#define MO_FLAG_PUSH (1 << 4) +#define MO_FLAG_NOT_LAST (1 << 5) +#define MO_FLAG_LAST (1 << 6) +#define NUM_MO_FLAGS 7 + +/// \brief Helper for getting the operand index for the instruction flags +/// operand. 
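+///
+/// The index is encoded in bits 7 and 8 of the instruction's TSFlags (see
+/// the "FlagOperand bits 7, 8" note in R600_InstFlag below).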
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) + +namespace R600_InstFlag { + enum TIF { + TRANS_ONLY = (1 << 0), + TEX = (1 << 1), + REDUCTION = (1 << 2), + FC = (1 << 3), + TRIG = (1 << 4), + OP3 = (1 << 5), + VECTOR = (1 << 6), + //FlagOperand bits 7, 8 + NATIVE_OPERANDS = (1 << 9), + OP1 = (1 << 10), + OP2 = (1 << 11), + VTX_INST = (1 << 12), + TEX_INST = (1 << 13), + ALU_INST = (1 << 14), + LDS_1A = (1 << 15), + LDS_1A1D = (1 << 16), + IS_EXPORT = (1 << 17), + LDS_1A2D = (1 << 18) + }; +} // namespace R600_InstFlag + +#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) + +/// \brief Defines for extracting register information from register encoding +#define HW_REG_MASK 0x1ff +#define HW_CHAN_SHIFT 9 + +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + +#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) +#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) + +namespace OpName { + + enum VecOps { + UPDATE_EXEC_MASK_X, + UPDATE_PREDICATE_X, + WRITE_X, + OMOD_X, + DST_REL_X, + CLAMP_X, + SRC0_X, + SRC0_NEG_X, + SRC0_REL_X, + SRC0_ABS_X, + SRC0_SEL_X, + SRC1_X, + SRC1_NEG_X, + SRC1_REL_X, + SRC1_ABS_X, + SRC1_SEL_X, + PRED_SEL_X, + UPDATE_EXEC_MASK_Y, + UPDATE_PREDICATE_Y, + WRITE_Y, + OMOD_Y, + DST_REL_Y, + CLAMP_Y, + SRC0_Y, + SRC0_NEG_Y, + SRC0_REL_Y, + SRC0_ABS_Y, + SRC0_SEL_Y, + SRC1_Y, + SRC1_NEG_Y, + SRC1_REL_Y, + SRC1_ABS_Y, + SRC1_SEL_Y, + PRED_SEL_Y, + UPDATE_EXEC_MASK_Z, + UPDATE_PREDICATE_Z, + WRITE_Z, + OMOD_Z, + DST_REL_Z, + CLAMP_Z, + SRC0_Z, + SRC0_NEG_Z, + SRC0_REL_Z, + SRC0_ABS_Z, + SRC0_SEL_Z, + SRC1_Z, + SRC1_NEG_Z, + SRC1_REL_Z, + SRC1_ABS_Z, + SRC1_SEL_Z, + PRED_SEL_Z, + UPDATE_EXEC_MASK_W, + UPDATE_PREDICATE_W, + WRITE_W, + OMOD_W, + DST_REL_W, + CLAMP_W, + SRC0_W, + SRC0_NEG_W, + SRC0_REL_W, + SRC0_ABS_W, + SRC0_SEL_W, + SRC1_W, + SRC1_NEG_W, + SRC1_REL_W, + SRC1_ABS_W, + SRC1_SEL_W, + PRED_SEL_W, + IMM_0, + IMM_1, + VEC_COUNT + }; + +} // namespace OpName + +//===----------------------------------------------------------------------===// +// Config register definitions +//===----------------------------------------------------------------------===// + +#define R_02880C_DB_SHADER_CONTROL 0x02880C +#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) + +// These fields are the same for all shader types and families. 
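+//
+// For example, a shader using 12 GPRs and a stack of 2 entries would be
+// programmed with S_NUM_GPRS(12) | S_STACK_SIZE(2), i.e.
+// (12 << 0) | (2 << 8) = 0x20C.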
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0)
+#define S_STACK_SIZE(x) (((x) & 0xFF) << 8)
+//===----------------------------------------------------------------------===//
+// R600, R700 Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028850_SQ_PGM_RESOURCES_PS 0x028850
+#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
+
+//===----------------------------------------------------------------------===//
+// Evergreen, Northern Islands Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028844_SQ_PGM_RESOURCES_PS 0x028844
+#define R_028860_SQ_PGM_RESOURCES_VS 0x028860
+#define R_028878_SQ_PGM_RESOURCES_GS 0x028878
+#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
+
+#define R_0288E8_SQ_LDS_ALLOC 0x0288E8
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
new file mode 100644
index 0000000..fdc2030
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -0,0 +1,336 @@
+//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Add CF_ALU. R600 ALU instructions are grouped in clauses, each of which
+/// can hold up to 128 ALU instructions; these instructions can access up to
+/// 4 prefetched lines of 16 registers each from the constant buffers. Such
+/// ALU clauses are initiated by CF_ALU instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeR600EmitClauseMarkersPass(PassRegistry&);
+}
+
+namespace {
+
+class R600EmitClauseMarkers : public MachineFunctionPass {
+
+private:
+  const R600InstrInfo *TII;
+  int Address;
+
+  unsigned OccupiedDwords(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT_4:
+      return 4;
+    case AMDGPU::KILL:
+      return 0;
+    default:
+      break;
+    }
+
+    // These will be expanded to two ALU instructions in the
+    // ExpandSpecialInstructions pass.
+ if (TII->isLDSRetInstr(MI->getOpcode())) + return 2; + + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode())) + return 4; + + unsigned NumLiteral = 0; + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++NumLiteral; + } + return 1 + NumLiteral; + } + + bool isALU(const MachineInstr *MI) const { + if (TII->isALUInstr(MI->getOpcode())) + return true; + if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } + } + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + case AMDGPU::IMPLICIT_DEF: + return true; + default: + return false; + } + } + + std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const { + // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 + // (See also R600ISelLowering.cpp) + // ConstIndex value is in [0, 4095]; + return std::pair<unsigned, unsigned>( + ((Sel >> 2) - 512) >> 12, // KC_BANK + // Line Number of ConstIndex + // A line contains 16 constant registers however KCX bank can lock + // two line at the same time ; thus we want to get an even line number. + // Line number can be retrieved with (>>4), using (>>5) <<1 generates + // an even number. + ((((Sel >> 2) - 512) & 4095) >> 5) << 1); + } + + bool SubstituteKCacheBank(MachineInstr *MI, + std::vector<std::pair<unsigned, unsigned> > &CachedConsts, + bool UpdateInstr = true) const { + std::vector<std::pair<unsigned, unsigned> > UsedKCache; + + if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + return true; + + const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = + TII->getSrcs(MI); + assert((TII->isALUInstr(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + unsigned Sel = Consts[i].second; + unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; + unsigned KCacheIndex = Index * 4 + Chan; + const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel); + if (CachedConsts.empty()) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); + continue; + } + if (CachedConsts[0] == BankLine) { + UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); + continue; + } + if (CachedConsts.size() == 1) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); + continue; + } + if (CachedConsts[1] == BankLine) { + UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); + continue; + } + return false; + } + + if (!UpdateInstr) + return true; + + for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + switch(UsedKCache[j].first) { + case 0: + Consts[i].first->setReg( + AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + break; + case 1: + Consts[i].first->setReg( + AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + break; + default: + llvm_unreachable("Wrong Cache Line"); + } + j++; + } + 
return true; + } + + bool canClauseLocalKillFitInClause( + unsigned AluInstCount, + std::vector<std::pair<unsigned, unsigned> > KCacheBanks, + MachineBasicBlock::iterator Def, + MachineBasicBlock::iterator BBEnd) { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + for (MachineInstr::const_mop_iterator + MOI = Def->operands_begin(), + MOE = Def->operands_end(); MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef() || + TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) + continue; + + // Def defines a clause local register, so check that its use will fit + // in the clause. + unsigned LastUseCount = 0; + for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { + AluInstCount += OccupiedDwords(UseI); + // Make sure we won't need to end the clause due to KCache limitations. + if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) + return false; + + // We have reached the maximum instruction limit before finding the + // use that kills this register, so we cannot use this def in the + // current clause. + if (AluInstCount >= TII->getMaxAlusPerClause()) + return false; + + // Register kill flags have been cleared by the time we get to this + // pass, but it is safe to assume that all uses of this register + // occur in the same basic block as its definition, because + // it is illegal for the scheduler to schedule them in + // different blocks. + if (UseI->findRegisterUseOperandIdx(MOI->getReg())) + LastUseCount = AluInstCount; + + if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) + break; + } + if (LastUseCount) + return LastUseCount <= TII->getMaxAlusPerClause(); + llvm_unreachable("Clause local register live at end of clause."); + } + return true; + } + + MachineBasicBlock::iterator + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<std::pair<unsigned, unsigned> > KCacheBanks; + bool PushBeforeModifier = false; + unsigned AluInstCount = 0; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (!isALU(I)) + break; + if (AluInstCount > TII->getMaxAlusPerClause()) + break; + if (I->getOpcode() == AMDGPU::PRED_X) { + // We put PRED_X in its own clause to ensure that ifcvt won't create + // clauses with more than 128 insts. + // IfCvt is indeed checking that "then" and "else" branches of an if + // statement have less than ~60 insts thus converted clauses can't be + // bigger than ~121 insts (predicate setter needs to be in the same + // clause as predicated alus). + if (AluInstCount > 0) + break; + if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) + PushBeforeModifier = true; + AluInstCount ++; + continue; + } + // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: + // + // * KILL or INTERP instructions + // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits + // * Uses waterfalling (i.e. INDEX_MODE = AR.X) + // + // XXX: These checks have not been implemented yet. + if (TII->mustBeLastInClause(I->getOpcode())) { + I++; + break; + } + + // If this instruction defines a clause local register, make sure + // its use can fit in this clause. + if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) + break; + + if (!SubstituteKCacheBank(I, KCacheBanks)) + break; + AluInstCount += OccupiedDwords(I); + } + unsigned Opcode = PushBeforeModifier ? 
+ AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; + BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) + // We don't use the ADDR field until R600ControlFlowFinalizer pass, where + // it is safe to assume it is 0. However if we always put 0 here, the ifcvt + // pass may assume that identical ALU clause starter at the beginning of a + // true and false branch can be factorized which is not the case. + .addImm(Address++) // ADDR + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 + .addImm(KCacheBanks.empty()?0:2) // KM0 + .addImm((KCacheBanks.size() < 2)?0:2) // KM1 + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 + .addImm(AluInstCount) // COUNT + .addImm(1); // Enabled + return I; + } + +public: + static char ID; + R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { + + initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override { + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + if (I->getOpcode() == AMDGPU::CF_ALU) + continue; // BB was already parsed + for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { + if (isALU(I)) + I = MakeALUClause(MBB, I); + else + ++I; + } + } + return false; + } + + const char *getPassName() const override { + return "R600 Emit Clause Markers Pass"; + } +}; + +char R600EmitClauseMarkers::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", + "R600 Emit Clause Markters", false, false) +INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", + "R600 Emit Clause Markters", false, false) + +llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { + return new R600EmitClauseMarkers(); +} + diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp new file mode 100644 index 0000000..211d392e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -0,0 +1,349 @@ +//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Vector, Reduction, and Cube instructions need to fill the entire instruction +/// group to work correctly. This pass expands these individual instructions +/// into several instructions that will completely fill the instruction group. 
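The kcache fields emitted by the CF_ALU builder above (KB0/KB1 and KLINE0/KLINE1) come from getAccessedBankLine, which decodes constant selectors of the form (512 + (kc_bank << 12) + ConstIndex) << 2. A self-contained sketch of that decoding, using a made-up selector value:

  #include <cassert>

  int main() {
    // Selector encoding from R600EmitClauseMarkers.cpp:
    //   Sel = (512 + (kc_bank << 12) + ConstIndex) << 2
    // Pick bank 1, constant index 37 as an example.
    unsigned Bank = 1, ConstIndex = 37;
    unsigned Sel = (512 + (Bank << 12) + ConstIndex) << 2;

    // getAccessedBankLine(Sel):
    unsigned KCBank = ((Sel >> 2) - 512) >> 12;
    // A cache line holds 16 constants, but a bank locks two lines at a time,
    // so the line number is rounded down to an even value.
    unsigned KCLine = ((((Sel >> 2) - 512) & 4095) >> 5) << 1;

    // SubstituteKCacheBank(): position of the constant inside the locked pair.
    unsigned Chan = Sel & 3;
    unsigned Index = ((Sel >> 2) - 512) & 31;
    unsigned KCacheIndex = Index * 4 + Chan;

    assert(KCBank == 1);
    assert(KCLine == 2);   // 37 / 16 rounds down to 2, which is already even
    assert(Index == 5 && Chan == 0 && KCacheIndex == 20);
    return 0;
  }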
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class R600ExpandSpecialInstrsPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, + unsigned Op); + +public: + R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "R600 Expand special instructions pass"; + } +}; + +} // End anonymous namespace + +char R600ExpandSpecialInstrsPass::ID = 0; + +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { + return new R600ExpandSpecialInstrsPass(TM); +} + +void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, + const MachineInstr *OldMI, unsigned Op) { + int OpIdx = TII->getOperandIdx(*OldMI, Op); + if (OpIdx > -1) { + uint64_t Val = OldMI->getOperand(OpIdx).getImm(); + TII->setImmOperand(NewMI, Op, Val); + } +} + +bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + while (I != MBB.end()) { + MachineInstr &MI = *I; + I = std::next(I); + + // Expand LDS_*_RET instructions + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineOperand &DstOp = MI.getOperand(DstIdx); + MachineInstr *Mov = TII->buildMovInstr(&MBB, I, + DstOp.getReg(), AMDGPU::OQAP); + DstOp.setReg(AMDGPU::OQAP); + int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), + AMDGPU::OpName::pred_sel); + int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), + AMDGPU::OpName::pred_sel); + // Copy the pred_sel bit + Mov->getOperand(MovPredSelIdx).setReg( + MI.getOperand(LDSPredSelIdx).getReg()); + } + + switch (MI.getOpcode()) { + default: break; + // Expand PRED_X to one of the PRED_SET instructions. + case AMDGPU::PRED_X: { + uint64_t Flags = MI.getOperand(3).getImm(); + // The native opcode used by PRED_X is stored as an immediate in the + // third operand. + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + MI.getOperand(2).getImm(), // opcode + MI.getOperand(0).getReg(), // dst + MI.getOperand(1).getReg(), // src0 + AMDGPU::ZERO); // src1 + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + if (Flags & MO_FLAG_PUSH) { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); + } else { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); + } + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_XY: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = MI.getOperand(Chan).getReg(); + else + DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan >= 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_ZW: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; + else + DstReg = MI.getOperand(Chan-2).getReg(); + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan < 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_VEC_LOAD: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(1).getImm()); + unsigned DstReg = MI.getOperand(0).getReg(); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, + TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + case AMDGPU::DOT_4: { + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + MachineInstr *BMI = + TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Mask) { + TII->addFlag(BMI, 0, MO_FLAG_MASK); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + unsigned Opcode = BMI->getOpcode(); + // While not strictly necessary from hw point of view, we force + // all src operands of a dot4 inst to belong to the same slot. 
+ unsigned Src0 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + .getReg(); + unsigned Src1 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + .getReg(); + (void) Src0; + (void) Src1; + if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && + (TRI.getEncodingValue(Src1) & 0xff) < 127) + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); + } + MI.eraseFromParent(); + continue; + } + } + + bool IsReduction = TII->isReductionOp(MI.getOpcode()); + bool IsVector = TII->isVector(MI); + bool IsCube = TII->isCubeOp(MI.getOpcode()); + if (!IsReduction && !IsVector && !IsCube) { + continue; + } + + // Expand the instruction + // + // Reduction instructions: + // T0_X = DP4 T1_XYZW, T2_XYZW + // becomes: + // TO_X = DP4 T1_X, T2_X + // TO_Y (write masked) = DP4 T1_Y, T2_Y + // TO_Z (write masked) = DP4 T1_Z, T2_Z + // TO_W (write masked) = DP4 T1_W, T2_W + // + // Vector instructions: + // T0_X = MULLO_INT T1_X, T2_X + // becomes: + // T0_X = MULLO_INT T1_X, T2_X + // T0_Y (write masked) = MULLO_INT T1_X, T2_X + // T0_Z (write masked) = MULLO_INT T1_X, T2_X + // T0_W (write masked) = MULLO_INT T1_X, T2_X + // + // Cube instructions: + // T0_XYZW = CUBE T1_XYZW + // becomes: + // TO_X = CUBE T1_Z, T1_Y + // T0_Y = CUBE T1_Z, T1_X + // T0_Z = CUBE T1_X, T1_Z + // T0_W = CUBE T1_Y, T1_Z + for (unsigned Chan = 0; Chan < 4; Chan++) { + unsigned DstReg = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); + unsigned Src0 = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); + unsigned Src1 = 0; + + // Determine the correct source registers + if (!IsCube) { + int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); + if (Src1Idx != -1) { + Src1 = MI.getOperand(Src1Idx).getReg(); + } + } + if (IsReduction) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + Src0 = TRI.getSubReg(Src0, SubRegIndex); + Src1 = TRI.getSubReg(Src1, SubRegIndex); + } else if (IsCube) { + static const int CubeSrcSwz[] = {2, 2, 0, 1}; + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + Src1 = TRI.getSubReg(Src0, SubRegIndex1); + Src0 = TRI.getSubReg(Src0, SubRegIndex0); + } + + // Determine the correct destination registers; + bool Mask = false; + bool NotLast = true; + if (IsCube) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + DstReg = TRI.getSubReg(DstReg, SubRegIndex); + } else { + // Mask the write if the original instruction does not write to + // the current Channel. 
+ Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + } + + // Set the IsLast bit + NotLast = (Chan != 3 ); + + // Add the new instruction + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::CUBE_r600_pseudo: + Opcode = AMDGPU::CUBE_r600_real; + break; + case AMDGPU::CUBE_eg_pseudo: + Opcode = AMDGPU::CUBE_eg_real; + break; + default: + break; + } + + MachineInstr *NewMI = + TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); + + if (Chan != 0) + NewMI->bundleWithPred(); + if (Mask) { + TII->addFlag(NewMI, 0, MO_FLAG_MASK); + } + if (NotLast) { + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + } + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); + } + MI.eraseFromParent(); + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp new file mode 100644 index 0000000..8357b6d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -0,0 +1,2286 @@ +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for R600 +// +//===----------------------------------------------------------------------===// + +#include "R600ISelLowering.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +R600TargetLowering::R600TargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + // Set condition code actions + setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setCondCodeAction(ISD::SETLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, 
MVT::f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + + setCondCodeAction(ISD::SETLE, MVT::i32, Expand); + setCondCodeAction(ISD::SETLT, MVT::i32, Expand); + setCondCodeAction(ISD::SETULE, MVT::i32, Expand); + setCondCodeAction(ISD::SETULT, MVT::i32, Expand); + + setOperationAction(ISD::FCOS, MVT::f32, Custom); + setOperationAction(ISD::FSIN, MVT::f32, Custom); + + setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + setOperationAction(ISD::SETCC, MVT::v2i32, Expand); + + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + setOperationAction(ISD::FSUB, MVT::f32, Expand); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + + // ADD, SUB overflow. + // TODO: turn these into Legal? + if (Subtarget->hasCARRY()) + setOperationAction(ISD::UADDO, MVT::i32, Custom); + + if (Subtarget->hasBORROW()) + setOperationAction(ISD::USUBO, MVT::i32, Custom); + + // Expand sign extension of vectors + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + + + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address + // spaces, so it is custom lowered to handle those where it isn't. 
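The loop below registers those actions for every integer result type. As a small reminder of why treating an any-extending load like a zero-extending one is sound, consider this standalone sketch; the byte value is arbitrary:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint8_t Byte = 0x80;                                               // value loaded from memory
    uint32_t ZExt = static_cast<uint32_t>(Byte);                       // 0x00000080
    uint32_t SExt = static_cast<uint32_t>(static_cast<int8_t>(Byte));  // 0xFFFFFF80
    assert(ZExt == 0x00000080u && SExt == 0xFFFFFF80u);
    // An EXTLOAD (any-extend) leaves bits [31:8] unspecified, so either of the
    // values above is an acceptable result; lowering it like ZEXTLOAD is legal.
    return 0;
  }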
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } + + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); + + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + + // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 + // to be Legal/Custom in order to avoid library calls. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + } + + setSchedulingPreference(Sched::Source); +} + +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + MachineFunction * MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock::iterator I = *MI; + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); + + switch (MI->getOpcode()) { + default: + // Replace LDS_*_RET instruction that don't have any uses with the + // equivalent LDS_*_NORET instruction. + if (TII->isLDSRetInstr(MI->getOpcode())) { + int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineInstrBuilder NewMI; + // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add + // LDS_1A2D support and remove this special case. 
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || + MI->getOpcode() == AMDGPU::LDS_CMPST_RET) + return BB; + + NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); + for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI->getOperand(i)); + } + } else { + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + } + break; + case AMDGPU::CLAMP_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + break; + } + + case AMDGPU::FABS_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_ABS); + break; + } + + case AMDGPU::FNEG_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_NEG); + break; + } + + case AMDGPU::MASK_WRITE: { + unsigned maskedRegister = MI->getOperand(0).getReg(); + assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); + TII->addFlag(defInstr, 0, MO_FLAG_MASK); + break; + } + + case AMDGPU::MOV_IMM_F32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getFPImm()->getValueAPF() + .bitcastToAPInt().getZExtValue()); + break; + case AMDGPU::MOV_IMM_I32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getImm()); + break; + case AMDGPU::CONST_COPY: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, + MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, + MI->getOperand(1).getImm()); + break; + } + + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_64_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(EOP); // Set End of program bit + break; + } + + case AMDGPU::TXD: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::TXD_SHADOW: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 
+ .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)); + break; + + case AMDGPU::BRANCH_COND_f32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::BRANCH_COND_i32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO_INT) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportSwz: { + // Instruction is left unmodified if its not the last one of its type + bool isLastInstructionOfItsType = true; + unsigned InstExportType = MI->getOperand(1).getImm(); + for (MachineBasicBlock::iterator NextExportInst = std::next(I), + EndBlock = BB->end(); NextExportInst != EndBlock; + NextExportInst = std::next(NextExportInst)) { + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + unsigned CurrentInstExportType = NextExportInst->getOperand(1) + .getImm(); + if (CurrentInstExportType == InstExportType) { + isLastInstructionOfItsType = false; + break; + } + } + } + bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + if (!EOP && !isLastInstructionOfItsType) + return BB; + unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addImm(CfInst) + .addImm(EOP); + break; + } + case AMDGPU::RETURN: { + // RETURN instructions must have the live-out registers as implicit uses, + // otherwise they appear dead. 
+ R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + MachineInstrBuilder MIB(*MF, MI); + for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) + MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); + return BB; + } + } + + MI->eraseFromParent(); + return BB; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); + case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); + case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); + case ISD::FCOS: + case ISD::FSIN: return LowerTrig(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::INTRINSIC_VOID: { + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntrinsicID) { + case AMDGPUIntrinsic::AMDGPU_store_output: { + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MFI->LiveOuts.push_back(Reg); + return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); + } + case AMDGPUIntrinsic::R600_store_swizzle: { + SDLoc DL(Op); + const SDValue Args[8] = { + Chain, + Op.getOperand(2), // Export Value + Op.getOperand(3), // ArrayBase + Op.getOperand(4), // Type + DAG.getConstant(0, DL, MVT::i32), // SWZ_X + DAG.getConstant(1, DL, MVT::i32), // SWZ_Y + DAG.getConstant(2, DL, MVT::i32), // SWZ_Z + DAG.getConstant(3, DL, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); + } + + // default for switch(IntrinsicID) + default: break; + } + // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + switch(IntrinsicID) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case AMDGPUIntrinsic::R600_load_input: { + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(Reg); + return DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), Reg, VT); + } + + case AMDGPUIntrinsic::R600_interp_input: { + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + int ijb = 
cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); + MachineSDNode *interp; + if (ijb < 0) { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); + interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, + MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); + return DAG.getTargetExtractSubreg( + TII->getRegisterInfo().getSubRegFromChannel(slot % 4), + DL, MVT::f32, SDValue(interp, 0)); + } + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); + unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); + MRI.addLiveIn(RegisterI); + MRI.addLiveIn(RegisterJ); + SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); + SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); + + if (slot % 4 < 2) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + return SDValue(interp, slot % 2); + } + case AMDGPUIntrinsic::R600_interp_xy: + case AMDGPUIntrinsic::R600_interp_zw: { + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + MachineSDNode *interp; + SDValue RegisterINode = Op.getOperand(2); + SDValue RegisterJNode = Op.getOperand(3); + + if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, + SDValue(interp, 0), SDValue(interp, 1)); + } + case AMDGPUIntrinsic::R600_tex: + case AMDGPUIntrinsic::R600_texc: + case AMDGPUIntrinsic::R600_txl: + case AMDGPUIntrinsic::R600_txlc: + case AMDGPUIntrinsic::R600_txb: + case AMDGPUIntrinsic::R600_txbc: + case AMDGPUIntrinsic::R600_txf: + case AMDGPUIntrinsic::R600_txq: + case AMDGPUIntrinsic::R600_ddx: + case AMDGPUIntrinsic::R600_ddy: + case AMDGPUIntrinsic::R600_ldptr: { + unsigned TextureOp; + switch (IntrinsicID) { + case AMDGPUIntrinsic::R600_tex: + TextureOp = 0; + break; + case AMDGPUIntrinsic::R600_texc: + TextureOp = 1; + break; + case AMDGPUIntrinsic::R600_txl: + TextureOp = 2; + break; + case AMDGPUIntrinsic::R600_txlc: + TextureOp = 3; + break; + case AMDGPUIntrinsic::R600_txb: + TextureOp = 4; + break; + case AMDGPUIntrinsic::R600_txbc: + TextureOp = 5; + break; + case AMDGPUIntrinsic::R600_txf: + TextureOp = 6; + break; + case AMDGPUIntrinsic::R600_txq: + TextureOp = 7; + break; + case AMDGPUIntrinsic::R600_ddx: + TextureOp = 8; + break; + case AMDGPUIntrinsic::R600_ddy: + TextureOp = 9; + break; + case AMDGPUIntrinsic::R600_ldptr: + TextureOp = 10; + break; + default: + llvm_unreachable("Unknow Texture Operation"); + } + + SDValue TexArgs[19] = { + DAG.getConstant(TextureOp, DL, MVT::i32), + Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + Op.getOperand(2), + Op.getOperand(3), + 
Op.getOperand(4), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10) + }; + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); + } + case AMDGPUIntrinsic::AMDGPU_dp4: { + SDValue Args[8] = { + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(3, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(3, DL, MVT::i32)) + }; + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); + } + + case Intrinsic::r600_read_ngroups_x: + return LowerImplicitParameter(DAG, VT, DL, 0); + case Intrinsic::r600_read_ngroups_y: + return LowerImplicitParameter(DAG, VT, DL, 1); + case Intrinsic::r600_read_ngroups_z: + return LowerImplicitParameter(DAG, VT, DL, 2); + case Intrinsic::r600_read_global_size_x: + return LowerImplicitParameter(DAG, VT, DL, 3); + case Intrinsic::r600_read_global_size_y: + return LowerImplicitParameter(DAG, VT, DL, 4); + case Intrinsic::r600_read_global_size_z: + return LowerImplicitParameter(DAG, VT, DL, 5); + case Intrinsic::r600_read_local_size_x: + return LowerImplicitParameter(DAG, VT, DL, 6); + case Intrinsic::r600_read_local_size_y: + return LowerImplicitParameter(DAG, VT, DL, 7); + case Intrinsic::r600_read_local_size_z: + return LowerImplicitParameter(DAG, VT, DL, 8); + + case Intrinsic::AMDGPU_read_workdim: + return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) + break; + } + } // end switch(Op.getOpcode()) + return SDValue(); +} + +void R600TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); + return; + case ISD::FP_TO_UINT: + if (N->getValueType(0) == MVT::i1) { + Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + } + // Fall-through. Since we don't care about out of bounds values + // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint + // considers some extra cases which are not necessary here. + case ISD::FP_TO_SINT: { + SDValue Result; + if (expandFP_TO_SINT(N, Result, DAG)) + Results.push_back(Result); + return; + } + case ISD::SDIVREM: { + SDValue Op = SDValue(N, 1); + SDValue RES = LowerSDIVREM(Op, DAG); + Results.push_back(RES); + Results.push_back(RES.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + LowerUDIVREM64(Op, DAG, Results); + break; + } + } +} + +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector<SDValue, 8> Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, + DAG.getConstant(i, DL, getVectorIdxTy()))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); +} + +SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + // On hw >= R700, COS/SIN input must be between -1. and 1. 
+ // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDLoc DL(Op); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.15915494309, DL, MVT::f32)), + DAG.getConstantFP(0.5, DL, MVT::f32))); + unsigned TrigNode; + switch (Op.getOpcode()) { + case ISD::FCOS: + TrigNode = AMDGPUISD::COS_HW; + break; + case ISD::FSIN: + TrigNode = AMDGPUISD::SIN_HW; + break; + default: + llvm_unreachable("Wrong trig opcode"); + } + SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, FractPart, + DAG.getConstantFP(-0.5, DL, MVT::f32))); + if (Gen >= AMDGPUSubtarget::R700) + return TrigVal; + // On R600 hw, COS/SIN input must be between -Pi and Pi. + return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, + DAG.getConstantFP(3.14159265359, DL, MVT::f32)); +} + +SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); + Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); + HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); + SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); + + SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); + SDValue LoBig = Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); + Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); + SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); + LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); + + SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); + SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + + SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); + // Extend sign. + OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, + DAG.getValueType(MVT::i1)); + + SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); +} + +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(0.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETNE) + ); +} + +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, + unsigned DwordOffset) const { + unsigned ByteOffset = DwordOffset * 4; + PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_BUFFER_0); + + // We shouldn't be using an offset wider than 16-bits for implicit parameters. + assert(isInt<16>(ByteOffset)); + + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR + MachinePointerInfo(ConstantPointerNull::get(PtrType)), + false, false, false, 0); +} + +bool R600TargetLowering::isZero(SDValue Op) const { + if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { + return Cst->isNullValue(); + } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ + return CstFP->isZero(); + } else { + return false; + } +} + +SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + SDValue Temp; + + if (VT == MVT::f32) { + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + if (MinMax) + return MinMax; + } + + // LHS and RHS are guaranteed to be the same value type + EVT CompareVT = LHS.getValueType(); + + // Check if we can lower this to a native operation. + + // Try to lower to a SET* instruction: + // + // SET* can match the following patterns: + // + // select_cc f32, f32, -1, 0, cc_supported + // select_cc f32, f32, 1.0f, 0.0f, cc_supported + // select_cc i32, i32, -1, 0, cc_supported + // + + // Move hardware True/False values to the correct operand. 
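The rewrites performed below rely on two identities: inverting the condition lets the true/false operands be swapped, and swapping the compared operands lets the condition be replaced by its operand-swapped form. A standalone sketch with plain integers; the selectCC_* helpers are illustrative stand-ins for the i32 select_cc pattern, and the input values are arbitrary:

  #include <cassert>

  static int selectCC_lt(int A, int B, int T, int F) { return (A < B)  ? T : F; }
  static int selectCC_ge(int A, int B, int T, int F) { return (A >= B) ? T : F; }
  static int selectCC_gt(int A, int B, int T, int F) { return (A > B)  ? T : F; }

  int main() {
    int A = 3, B = 7, T = -1, F = 0;   // -1/0 are the hardware true/false values
    // Inverting the condition swaps which operand is selected:
    //   select_cc(a, b, T, F, lt) == select_cc(a, b, F, T, ge)
    assert(selectCC_lt(A, B, T, F) == selectCC_ge(A, B, F, T));
    // Swapping the compared operands replaces lt by gt:
    //   select_cc(a, b, T, F, lt) == select_cc(b, a, T, F, gt)
    assert(selectCC_lt(A, B, T, F) == selectCC_gt(B, A, T, F));
    return 0;
  }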
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  ISD::CondCode InverseCC =
+     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
+      std::swap(False, True);
+      CC = DAG.getCondCode(InverseCC);
+    } else {
+      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
+      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
+        std::swap(False, True);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(SwapInvCC);
+      }
+    }
+  }
+
+  if (isHWTrueValue(True) && isHWFalseValue(False) &&
+      (CompareVT == VT || VT == MVT::i32)) {
+    // This can be matched by a SET* instruction.
+    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+  }
+
+  // Try to lower to a CND* instruction:
+  //
+  // CND* can match the following patterns:
+  //
+  // select_cc f32, 0.0, f32, f32, cc_supported
+  // select_cc f32, 0.0, i32, i32, cc_supported
+  // select_cc i32, 0, f32, f32, cc_supported
+  // select_cc i32, 0, i32, i32, cc_supported
+  //
+
+  // Try to move the zero value to the RHS
+  if (isZero(LHS)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    // Try swapping the operands
+    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
+    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(CCSwapped);
+    } else {
+      // Try inverting the condition and then swapping the operands
+      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
+      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
+      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+        std::swap(True, False);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(CCSwapped);
+      }
+    }
+  }
+  if (isZero(RHS)) {
+    SDValue Cond = LHS;
+    SDValue Zero = RHS;
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    if (CompareVT != VT) {
+      // Bitcast True / False to the correct types. This will end up being
+      // a nop, but it allows us to define only a single pattern in the
+      // .TD files for each CND* instruction rather than having to have
+      // one pattern for integer True/False and one for fp True/False
+      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
+      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
+    }
+
+    switch (CCOpcode) {
+    case ISD::SETONE:
+    case ISD::SETUNE:
+    case ISD::SETNE:
+      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+      Temp = True;
+      True = False;
+      False = Temp;
+      break;
+    default:
+      break;
+    }
+    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+        Cond, Zero,
+        True, False,
+        DAG.getCondCode(CCOpcode));
+    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
+  }
+
+  // If we make it this far, it means we have no native instructions to handle
+  // this SELECT_CC, so we must lower it.
+  SDValue HWTrue, HWFalse;
+
+  if (CompareVT == MVT::f32) {
+    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
+    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
+  } else if (CompareVT == MVT::i32) {
+    HWTrue = DAG.getConstant(-1, DL, CompareVT);
+    HWFalse = DAG.getConstant(0, DL, CompareVT);
+  }
+  else {
+    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
+  }
+
+  // Lower this unsupported SELECT_CC into a combination of two supported
+  // SELECT_CC operations.
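+  // Roughly, the original compare is first materialized as a hardware boolean,
+  //   Cond   = select_cc LHS, RHS, HWTrue, HWFalse, CC
+  // and the final result then selects on that boolean being non-zero,
+  //   Result = select_cc Cond, HWFalse, True, False, setne
+  // so the first piece matches the SET* patterns and the second matches the
+  // CND* patterns described above.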
+  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
+
+  return DAG.getNode(ISD::SELECT_CC, DL, VT,
+                     Cond, HWFalse,
+                     True, False,
+                     DAG.getCondCode(ISD::SETNE));
+}
+
+/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
+/// convert these pointers to a register index. Each register holds
+/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
+/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
+/// for indirect addressing.
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+                                               unsigned StackWidth,
+                                               SelectionDAG &DAG) const {
+  unsigned SRLPad;
+  switch(StackWidth) {
+  case 1:
+    SRLPad = 2;
+    break;
+  case 2:
+    SRLPad = 3;
+    break;
+  case 4:
+    SRLPad = 4;
+    break;
+  default: llvm_unreachable("Invalid stack width");
+  }
+
+  SDLoc DL(Ptr);
+  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
+                     DAG.getConstant(SRLPad, DL, MVT::i32));
+}
+
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+                                         unsigned ElemIdx,
+                                         unsigned &Channel,
+                                         unsigned &PtrIncr) const {
+  switch (StackWidth) {
+  default:
+  case 1:
+    Channel = 0;
+    if (ElemIdx > 0) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 2:
+    Channel = ElemIdx % 2;
+    if (ElemIdx == 2) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 4:
+    Channel = ElemIdx;
+    PtrIncr = 0;
+    break;
+  }
+}
+
+SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Value = Op.getOperand(1);
+  SDValue Ptr = Op.getOperand(2);
+
+  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
+  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+    if (StoreNode->isTruncatingStore()) {
+      EVT VT = Value.getValueType();
+      assert(VT.bitsLE(MVT::i32));
+      EVT MemVT = StoreNode->getMemoryVT();
+      SDValue MaskConstant;
+      if (MemVT == MVT::i8) {
+        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
+      } else {
+        assert(MemVT == MVT::i16);
+        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
+      }
+      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
+                                      DAG.getConstant(2, DL, MVT::i32));
+      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
+                                      DAG.getConstant(0x00000003, DL, VT));
+      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
+      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+                                  DAG.getConstant(3, DL, VT));
+      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
+      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
+      // vector instead.
+      SDValue Src[4] = {
+        ShiftedValue,
+        DAG.getConstant(0, DL, MVT::i32),
+        DAG.getConstant(0, DL, MVT::i32),
+        Mask
+      };
+      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
+      SDValue Args[3] = { Chain, Input, DWordAddr };
+      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
+                                     Op->getVTList(), Args, MemVT,
+                                     StoreNode->getMemOperand());
+    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
+               Value.getValueType().bitsGE(MVT::i32)) {
+      // Convert pointer from byte address to dword address.
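+      // That is, wrap (ptr >> 2) in a DWORDADDR node: e.g. a byte address of
+      // 20 becomes dword (register) address 5.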
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), + Ptr, DAG.getConstant(2, DL, MVT::i32))); + + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { + llvm_unreachable("Truncated and indexed stores not supported yet"); + } else { + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + } + return Chain; + } + } + + EVT ValueVT = Value.getValueType(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) { + return Ret; + } + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (ValueVT.isVector()) { + unsigned NumElemVT = ValueVT.getVectorNumElements(); + EVT ElemVT = ValueVT.getVectorElementType(); + SmallVector<SDValue, 4> Stores(NumElemVT); + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, DL, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32)); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); + } else { + if (ValueVT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); // Channel + } + + return Chain; +} + +// return (512 + (kc_bank << 12) +static int +ConstantAddressBlock(unsigned AddressSpace) { + switch (AddressSpace) { + case AMDGPUAS::CONSTANT_BUFFER_0: + return 512; + case AMDGPUAS::CONSTANT_BUFFER_1: + return 512 + 4096; + case AMDGPUAS::CONSTANT_BUFFER_2: + return 512 + 4096 * 2; + case AMDGPUAS::CONSTANT_BUFFER_3: + return 512 + 4096 * 3; + case AMDGPUAS::CONSTANT_BUFFER_4: + return 512 + 4096 * 4; + case AMDGPUAS::CONSTANT_BUFFER_5: + return 512 + 4096 * 5; + case AMDGPUAS::CONSTANT_BUFFER_6: + return 512 + 4096 * 6; + case AMDGPUAS::CONSTANT_BUFFER_7: + return 512 + 4096 * 7; + case AMDGPUAS::CONSTANT_BUFFER_8: + return 512 + 4096 * 8; + case AMDGPUAS::CONSTANT_BUFFER_9: + return 512 + 4096 * 9; + case AMDGPUAS::CONSTANT_BUFFER_10: + return 512 + 4096 * 10; + case AMDGPUAS::CONSTANT_BUFFER_11: + return 512 + 4096 * 11; + case AMDGPUAS::CONSTANT_BUFFER_12: + return 512 + 4096 * 12; + case AMDGPUAS::CONSTANT_BUFFER_13: + return 512 + 4096 * 13; + case AMDGPUAS::CONSTANT_BUFFER_14: + return 512 + 4096 * 14; + case AMDGPUAS::CONSTANT_BUFFER_15: + return 512 + 4096 * 15; + default: + return -1; + } +} + +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + SDLoc DL(Op); + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); + if (Ret.getNode()) { + SDValue Ops[2] = { + Ret, + Chain + }; + 
return DAG.getMergeValues(Ops, DL);
+  }
+
+  // Lower loads of global variables in the constant address space.
+  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+      isa<GlobalVariable>(GetUnderlyingObject(
+          LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
+
+    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
+        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+        DAG.getConstant(2, DL, MVT::i32));
+    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
+                       LoadNode->getChain(), Ptr,
+                       DAG.getTargetConstant(0, DL, MVT::i32),
+                       Op.getOperand(2));
+  }
+
+  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
+    SDValue MergedValues[2] = {
+      ScalarizeVectorLoad(Op, DAG),
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+  if (ConstantBlock > -1 &&
+      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
+    SDValue Result;
+    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
+        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+        isa<ConstantSDNode>(Ptr)) {
+      SDValue Slots[4];
+      for (unsigned i = 0; i < 4; i++) {
+        // We want the Const position encoded with the following formula:
+        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+        // const_index is Ptr computed by llvm using an alignment of 16.
+        // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
+        // then divide by 4 at the ISel step.
+        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
+        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+      }
+      EVT NewVT = MVT::v4i32;
+      unsigned NumElements = 4;
+      if (VT.isVector()) {
+        NewVT = VT;
+        NumElements = VT.getVectorNumElements();
+      }
+      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
+                           makeArrayRef(Slots, NumElements));
+    } else {
+      // A non-constant pointer can't be folded, so keep it as a v4f32 load.
+      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+                      DAG.getConstant(4, DL, MVT::i32)),
+          DAG.getConstant(LoadNode->getAddressSpace() -
+                          AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
+          );
+    }
+
+    if (!VT.isVector()) {
+      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+                           DAG.getConstant(0, DL, MVT::i32));
+    }
+
+    SDValue MergedValues[2] = {
+      Result,
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  // For most operations returning SDValue() will result in the node being
+  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+  // need to manually expand loads that may be legal in some address spaces and
+  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+  // compute shaders, since the data is sign extended when it is uploaded to the
+  // buffer. However SEXT loads from other address spaces are not supported, so
+  // we need to expand them here.
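+  // The expansion below is roughly
+  //   (sextload i8/i16) -> (sign_extend_inreg (extload i8/i16), MemVT)
+  // i.e. an any-extending load of the small memory type followed by an
+  // explicit in-register sign extension.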
+ if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { + EVT MemVT = LoadNode->getMemoryVT(); + assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); + SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, + LoadNode->getPointerInfo(), MemVT, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + LoadNode->getAlignment()); + SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, + DAG.getValueType(MemVT)); + + SDValue MergedValues[2] = { Res, Chain }; + return DAG.getMergeValues(MergedValues, DL); + } + + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2] = { + LoweredLoad, + Chain + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + + return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), + Chain, Jump, Cond); +} + +/// XXX Only kernel functions are supported, so we can assume for now that +/// every function is a kernel function, but in the future we should use +/// separate calling conventions for kernel and non-kernel functions. +SDValue R600TargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + SmallVector<ISD::InputArg, 8> LocalIns; + + getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); + + AnalyzeFormalArguments(CCInfo, LocalIns); + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + CCValAssign &VA = ArgLocs[i]; + const ISD::InputArg &In = Ins[i]; + EVT VT = In.VT; + EVT MemVT = VA.getLocVT(); + if (!VT.isVector() && MemVT.isVector()) { + // Get load source type if scalarized. 
+      MemVT = MemVT.getVectorElementType();
+    }
+
+    if (MFI->getShaderType() != ShaderType::COMPUTE) {
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Register);
+      continue;
+    }
+
+    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                          AMDGPUAS::CONSTANT_BUFFER_0);
+
+    // i64 isn't a legal type, so the register type used ends up as i32, which
+    // isn't expected here. It attempts to create this sextload, but it ends up
+    // being invalid. Somehow this seems to work with i64 arguments, but breaks
+    // for <1 x i64>.
+
+    // The first 36 bytes of the input buffer contain information about
+    // thread group and global sizes.
+    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
+    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
+      // FIXME: This should really check the extload type, but the handling of
+      // extload vector parameters seems to be broken.
+
+      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+      Ext = ISD::SEXTLOAD;
+    }
+
+    // Compute the offset from the value.
+    // XXX - I think PartOffset should give you this, but it seems to give the
+    // size of the register which isn't useful.
+
+    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
+    unsigned PartOffset = VA.getLocMemOffset();
+    unsigned Offset = 36 + VA.getLocMemOffset();
+
+    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
+    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
+                              DAG.getConstant(Offset, DL, MVT::i32),
+                              DAG.getUNDEF(MVT::i32),
+                              PtrInfo,
+                              MemVT, false, true, true, 4);
+
+    // 4 is the preferred alignment for the CONSTANT memory space.
+    InVals.push_back(Arg);
+    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
+  }
+  return Chain;
+}
+
+EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+   if (!VT.isVector())
+     return MVT::i32;
+   return VT.changeVectorElementTypeToInteger();
+}
+
+static SDValue CompactSwizzlableVector(
+  SelectionDAG &DAG, SDValue VectorEntry,
+  DenseMap<unsigned, unsigned> &RemapSwizzle) {
+  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+  assert(RemapSwizzle.empty());
+  SDValue NewBldVec[4] = {
+    VectorEntry.getOperand(0),
+    VectorEntry.getOperand(1),
+    VectorEntry.getOperand(2),
+    VectorEntry.getOperand(3)
+  };
+
+  for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+      // We mask the write here to teach later passes that the ith element of
+      // this vector is undef. Thus we can use it to reduce 128-bit register
+      // usage, break false dependencies and additionally make assembly easier
+      // to read.
+ RemapSwizzle[i] = 7; // SEL_MASK_WRITE + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) { + if (C->isZero()) { + RemapSwizzle[i] = 4; // SEL_0 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } else if (C->isExactlyValue(1.0)) { + RemapSwizzle[i] = 5; // SEL_1 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } + } + + if (NewBldVec[i].getOpcode() == ISD::UNDEF) + continue; + for (unsigned j = 0; j < i; j++) { + if (NewBldVec[i] == NewBldVec[j]) { + NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); + RemapSwizzle[i] = j; + break; + } + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + +static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, + DenseMap<unsigned, unsigned> &RemapSwizzle) { + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); + assert(RemapSwizzle.empty()); + SDValue NewBldVec[4] = { + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) + }; + bool isUnmovable[4] = { false, false, false, false }; + for (unsigned i = 0; i < 4; i++) { + RemapSwizzle[i] = i; + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (i == Idx) + isUnmovable[Idx] = true; + } + } + + for (unsigned i = 0; i < 4; i++) { + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (isUnmovable[Idx]) + continue; + // Swap i and Idx + std::swap(NewBldVec[Idx], NewBldVec[i]); + std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); + break; + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + + +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, + SDValue Swz[4], SelectionDAG &DAG, + SDLoc DL) const { + assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); + // Old -> New swizzle values + DenseMap<unsigned, unsigned> SwizzleRemap; + + BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + SwizzleRemap.clear(); + BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + return BuildVector; +} + + +//===----------------------------------------------------------------------===// +// Custom DAG Optimizations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) + case ISD::FP_ROUND: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { + return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), + Arg.getOperand(0)); + } + break; + } + + // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> + // (i32 select_cc f32, f32, -1, 0 cc) + // + // Mesa's GLSL 
frontend generates the above pattern a lot and we can lower + // this to one of the SET*_DX10 instructions. + case ISD::FP_TO_SINT: { + SDValue FNeg = N->getOperand(0); + if (FNeg.getOpcode() != ISD::FNEG) { + return SDValue(); + } + SDValue SelectCC = FNeg.getOperand(0); + if (SelectCC.getOpcode() != ISD::SELECT_CC || + SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS + SelectCC.getOperand(2).getValueType() != MVT::f32 || // True + !isHWTrueValue(SelectCC.getOperand(2)) || + !isHWFalseValue(SelectCC.getOperand(3))) { + return SDValue(); + } + + SDLoc dl(N); + return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getConstant(-1, dl, MVT::i32), // True + DAG.getConstant(0, dl, MVT::i32), // False + SelectCC.getOperand(4)); // CC + + break; + } + + // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx + // => build_vector elt0, ... , NewEltIdx, ... , eltN + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = N->getOperand(0); + SDValue InVal = N->getOperand(1); + SDValue EltNo = N->getOperand(2); + SDLoc dl(N); + + // If the inserted element is an UNDEF, just use the input vector. + if (InVal.getOpcode() == ISD::UNDEF) + return InVec; + + EVT VT = InVec.getValueType(); + + // If we can't generate a legal BUILD_VECTOR, exit + if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) + return SDValue(); + + // Check that we know which element is being inserted + if (!isa<ConstantSDNode>(EltNo)) + return SDValue(); + unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + + // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially + // be converted to a BUILD_VECTOR). Fill in the Ops vector with the + // vector elements. + SmallVector<SDValue, 8> Ops; + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + Ops.append(InVec.getNode()->op_begin(), + InVec.getNode()->op_end()); + } else if (InVec.getOpcode() == ISD::UNDEF) { + unsigned NElts = VT.getVectorNumElements(); + Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); + } else { + return SDValue(); + } + + // Insert the element + if (Elt < Ops.size()) { + // All the operands of BUILD_VECTOR must have the same type; + // we enforce that here. + EVT OpVT = Ops[0].getValueType(); + if (InVal.getValueType() != OpVT) + InVal = OpVT.bitsGT(InVal.getValueType()) ? 
+ DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : + DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); + Ops[Elt] = InVal; + } + + // Return the new vector + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + // Extract_vec (Build_vector) generated by custom lowering + // also needs to be customly combined + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return Arg->getOperand(Element); + } + } + if (Arg.getOpcode() == ISD::BITCAST && + Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), + Arg->getOperand(0).getOperand(Element)); + } + } + } + + case ISD::SELECT_CC: { + // Try common optimizations + SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + if (Ret.getNode()) + return Ret; + + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> + // selectcc x, y, a, b, inv(cc) + // + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> + // selectcc x, y, a, b, cc + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::SELECT_CC) { + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + if (LHS.getOperand(2).getNode() != True.getNode() || + LHS.getOperand(3).getNode() != False.getNode() || + RHS.getNode() != False.getNode()) { + return SDValue(); + } + + switch (NCC) { + default: return SDValue(); + case ISD::SETNE: return LHS; + case ISD::SETEQ: { + ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get(); + LHSCC = ISD::getSetCCInverse(LHSCC, + LHS.getOperand(0).getValueType().isInteger()); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) + return DAG.getSelectCC(SDLoc(N), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + LHSCC); + break; + } + } + return SDValue(); + } + + case AMDGPUISD::EXPORT: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[8] = { + N->getOperand(0), // Chain + SDValue(), + N->getOperand(2), // ArrayBase + N->getOperand(3), // Type + N->getOperand(4), // SWZ_X + N->getOperand(5), // SWZ_Y + N->getOperand(6), // SWZ_Z + N->getOperand(7) // SWZ_W + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); + } + case AMDGPUISD::TEXTURE_FETCH: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[19] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + N->getOperand(3), + N->getOperand(4), + N->getOperand(5), + N->getOperand(6), + N->getOperand(7), + N->getOperand(8), + N->getOperand(9), + N->getOperand(10), + N->getOperand(11), + N->getOperand(12), + N->getOperand(13), + N->getOperand(14), + N->getOperand(15), + N->getOperand(16), + N->getOperand(17), + N->getOperand(18), + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); + } + } + + return AMDGPUTargetLowering::PerformDAGCombine(N, 
DCI); +} + +static bool +FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, + SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + if (!Src.isMachineOpcode()) + return false; + switch (Src.getMachineOpcode()) { + case AMDGPU::FNEG_R600: + if (!Neg.getNode()) + return false; + Src = Src.getOperand(0); + Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::FABS_R600: + if (!Abs.getNode()) + return false; + Src = Src.getOperand(0); + Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::CONST_COPY: { + unsigned Opcode = ParentNode->getMachineOpcode(); + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + + if (!Sel.getNode()) + return false; + + SDValue CstOffset = Src.getOperand(0); + if (ParentNode->getValueType(0).isVector()) + return false; + + // Gather constants values + int SrcIndices[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + std::vector<unsigned> Consts; + for (int OtherSrcIdx : SrcIndices) { + int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); + if (OtherSrcIdx < 0 || OtherSelIdx < 0) + continue; + if (HasDst) { + OtherSrcIdx--; + OtherSelIdx--; + } + if (RegisterSDNode *Reg = + dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { + if (Reg->getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst + = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); + Consts.push_back(Cst->getZExtValue()); + } + } + } + + ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset); + Consts.push_back(Cst->getZExtValue()); + if (!TII->fitsConstReadLimitations(Consts)) { + return false; + } + + Sel = CstOffset; + Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); + return true; + } + case AMDGPU::MOV_IMM_I32: + case AMDGPU::MOV_IMM_F32: { + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + uint64_t ImmValue = 0; + + + if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { + ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); + float FloatValue = FPC->getValueAPF().convertToFloat(); + if (FloatValue == 0.0) { + ImmReg = AMDGPU::ZERO; + } else if (FloatValue == 0.5) { + ImmReg = AMDGPU::HALF; + } else if (FloatValue == 1.0) { + ImmReg = AMDGPU::ONE; + } else { + ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); + } + } else { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0)); + uint64_t Value = C->getZExtValue(); + if (Value == 0) { + ImmReg = AMDGPU::ZERO; + } else if (Value == 1) { + ImmReg = AMDGPU::ONE_INT; + } else { + ImmValue = Value; + } + } + + // Check that we aren't already using an immediate. + // XXX: It's possible for an instruction to have more than one + // immediate operand, but this is not supported yet. 
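+    // For example (illustrative values): a MOV_IMM_F32 of 0.5 folds to the
+    // inline HALF register and needs no literal, while 2.0 has no inline
+    // register and so is replaced by ALU_LITERAL_X with the bit pattern
+    // 0x40000000 placed in the instruction's literal slot, provided that
+    // slot is still free (checked below).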
+ if (ImmReg == AMDGPU::ALU_LITERAL_X) { + if (!Imm.getNode()) + return false; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm); + assert(C); + if (C->getZExtValue()) + return false; + Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); + } + Src = DAG.getRegister(ImmReg, MVT::i32); + return true; + } + default: + return false; + } +} + + +/// \brief Fold the instructions after selecting them +SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + if (!Node->isMachineOpcode()) + return Node; + unsigned Opcode = Node->getMachineOpcode(); + SDValue FakeOp; + + std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); + + if (Opcode == AMDGPU::DOT_4) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) + }; + for (unsigned i = 0; i < 8; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue &Abs = Ops[AbsIdx[i] - 1]; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + if (HasDst) + SelIdx--; + SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::REG_SEQUENCE) { + for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { + SDValue &Src = Ops[i]; + if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::CLAMP_R600) { + SDValue Src = Node->getOperand(0); + if (!Src.isMachineOpcode() || + !TII->hasInstrModifiers(Src.getMachineOpcode())) + return Node; + int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), + AMDGPU::OpName::clamp); + if (ClampIdx < 0) + return Node; + SDLoc DL(Node); + std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); + Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); + return DAG.getMachineNode(Src.getMachineOpcode(), DL, + Node->getVTList(), Ops); + } else { + if (!TII->hasInstrModifiers(Opcode)) + return Node; + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), + -1 + }; + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue FakeAbs; + SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); + if (HasDst) { + SelIdx--; + ImmIdx--; + } + SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; + SDValue &Imm = Ops[ImmIdx]; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } + + return Node; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h new file mode 100644 index 0000000..c252878 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -0,0 +1,80 @@ +//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H + +#include "AMDGPUISelLowering.h" + +namespace llvm { + +class R600InstrInfo; + +class R600TargetLowering : public AMDGPUTargetLowering { +public: + R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + EVT getSetCCResultType(LLVMContext &, EVT VT) const override; +private: + unsigned Gen; + /// Each OpenCL kernel has nine implicit parameters that are stored in the + /// first nine dwords of a Vertex Buffer. These implicit parameters are + /// lowered to load instructions which retrieve the values from the Vertex + /// Buffer. + SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, unsigned DwordOffset) const; + + void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, + MachineRegisterInfo & MRI, unsigned dword_offset) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, + SDLoc DL) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; + + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const; + + SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, + SelectionDAG &DAG) const; + void getStackAddress(unsigned StackWidth, unsigned ElemIdx, + unsigned &Channel, unsigned &PtrIncr) const; + bool isZero(SDValue Op) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td new file mode 100644 index 0000000..0ffd485 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td @@ -0,0 +1,495 @@ +//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// R600 Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin> + : AMDGPUInst <outs, ins, asm, pattern> { + + field bits<64> Inst; + bit Trig = 0; + bit Op3 = 0; + bit isVector = 0; + bits<2> FlagOperandIdx = 0; + bit Op1 = 0; + bit Op2 = 0; + bit LDS_1A = 0; + bit LDS_1A1D = 0; + bit HasNativeOperands = 0; + bit VTXInst = 0; + bit TEXInst = 0; + bit ALUInst = 0; + bit IsExport = 0; + bit LDS_1A2D = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = itin; + + // No AsmMatcher support. + let isCodeGenOnly = 1; + + let TSFlags{4} = Trig; + let TSFlags{5} = Op3; + + // Vector instructions are instructions that must fill all slots in an + // instruction group + let TSFlags{6} = isVector; + let TSFlags{8-7} = FlagOperandIdx; + let TSFlags{9} = HasNativeOperands; + let TSFlags{10} = Op1; + let TSFlags{11} = Op2; + let TSFlags{12} = VTXInst; + let TSFlags{13} = TEXInst; + let TSFlags{14} = ALUInst; + let TSFlags{15} = LDS_1A; + let TSFlags{16} = LDS_1A1D; + let TSFlags{17} = IsExport; + let TSFlags{18} = LDS_1A2D; +} + +//===----------------------------------------------------------------------===// +// ALU instructions +//===----------------------------------------------------------------------===// + +class R600_ALU_LDS_Word0 { + field bits<32> Word0; + + bits<11> src0; + bits<1> src0_rel; + bits<11> src1; + bits<1> src1_rel; + bits<3> index_mode = 0; + bits<2> pred_sel; + bits<1> last; + + bits<9> src0_sel = src0{8-0}; + bits<2> src0_chan = src0{10-9}; + bits<9> src1_sel = src1{8-0}; + bits<2> src1_chan = src1{10-9}; + + let Word0{8-0} = src0_sel; + let Word0{9} = src0_rel; + let Word0{11-10} = src0_chan; + let Word0{21-13} = src1_sel; + let Word0{22} = src1_rel; + let Word0{24-23} = src1_chan; + let Word0{28-26} = index_mode; + let Word0{30-29} = pred_sel; + let Word0{31} = last; +} + +class R600ALU_Word0 : R600_ALU_LDS_Word0 { + + bits<1> src0_neg; + bits<1> src1_neg; + + let Word0{12} = src0_neg; + let Word0{25} = src1_neg; +} + +class R600ALU_Word1 { + field bits<32> Word1; + + bits<11> dst; + bits<3> bank_swizzle; + bits<1> dst_rel; + bits<1> clamp; + + bits<7> dst_sel = dst{6-0}; + bits<2> dst_chan = dst{10-9}; + + let Word1{20-18} = bank_swizzle; + let Word1{27-21} = dst_sel; + let Word1{28} = dst_rel; + let Word1{30-29} = dst_chan; + let Word1{31} = clamp; +} + +class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{ + + bits<1> src0_abs; + bits<1> src1_abs; + bits<1> update_exec_mask; + bits<1> update_pred; + bits<1> write; + bits<2> omod; + + let Word1{0} = src0_abs; + let Word1{1} = src1_abs; + let Word1{2} = update_exec_mask; + let Word1{3} = update_pred; + let Word1{4} = write; + let Word1{6-5} = omod; + let Word1{17-7} = alu_inst; +} + +class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{ + + bits<11> src2; + bits<1> src2_rel; + bits<1> src2_neg; + + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{12} = src2_neg; + let Word1{17-13} = alu_inst; +} + +class R600LDS_Word1 { + field bits<32> Word1; + + bits<11> src2; + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + bits<1> src2_rel; + // offset 
specifies the stride offset to the second set of data to be read + // from. This is a dword offset. + bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP + bits<3> bank_swizzle; + bits<6> lds_op; + bits<2> dst_chan = 0; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{17-13} = alu_inst; + let Word1{20-18} = bank_swizzle; + let Word1{26-21} = lds_op; + let Word1{30-29} = dst_chan; +} + + +/* +XXX: R600 subtarget uses a slightly different encoding than the other +subtargets. We currently handle this in R600MCCodeEmitter, but we may +want to use these instruction classes in the future. + +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { + + bits<1> fog_merge; + bits<10> alu_inst; + + let Inst{37} = fog_merge; + let Inst{39-38} = omod; + let Inst{49-40} = alu_inst; +} + +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { + + bits<11> alu_inst; + + let Inst{38-37} = omod; + let Inst{49-39} = alu_inst; +} +*/ + +//===----------------------------------------------------------------------===// +// Vertex Fetch instructions +//===----------------------------------------------------------------------===// + +class VTX_WORD0 { + field bits<32> Word0; + bits<7> src_gpr; + bits<5> VC_INST; + bits<2> FETCH_TYPE; + bits<1> FETCH_WHOLE_QUAD; + bits<8> BUFFER_ID; + bits<1> SRC_REL; + bits<2> SRC_SEL_X; + + let Word0{4-0} = VC_INST; + let Word0{6-5} = FETCH_TYPE; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = BUFFER_ID; + let Word0{22-16} = src_gpr; + let Word0{23} = SRC_REL; + let Word0{25-24} = SRC_SEL_X; +} + +class VTX_WORD0_eg : VTX_WORD0 { + + bits<6> MEGA_FETCH_COUNT; + + let Word0{31-26} = MEGA_FETCH_COUNT; +} + +class VTX_WORD0_cm : VTX_WORD0 { + + bits<2> SRC_SEL_Y; + bits<2> STRUCTURED_READ; + bits<1> LDS_REQ; + bits<1> COALESCED_READ; + + let Word0{27-26} = SRC_SEL_Y; + let Word0{29-28} = STRUCTURED_READ; + let Word0{30} = LDS_REQ; + let Word0{31} = COALESCED_READ; +} + +class VTX_WORD1_GPR { + field bits<32> Word1; + bits<7> dst_gpr; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<1> USE_CONST_FIELDS; + bits<6> DATA_FORMAT; + bits<2> NUM_FORMAT_ALL; + bits<1> FORMAT_COMP_ALL; + bits<1> SRF_MODE_ALL; + + let Word1{6-0} = dst_gpr; + let Word1{7} = DST_REL; + let Word1{8} = 0; // Reserved + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{21} = USE_CONST_FIELDS; + let Word1{27-22} = DATA_FORMAT; + let Word1{29-28} = NUM_FORMAT_ALL; + let Word1{30} = FORMAT_COMP_ALL; + let Word1{31} = SRF_MODE_ALL; +} + +//===----------------------------------------------------------------------===// +// Texture fetch instructions +//===----------------------------------------------------------------------===// + +class TEX_WORD0 { + field bits<32> Word0; + + bits<5> TEX_INST; + bits<2> INST_MOD; + bits<1> FETCH_WHOLE_QUAD; + bits<8> RESOURCE_ID; + bits<7> SRC_GPR; + bits<1> SRC_REL; + bits<1> ALT_CONST; + bits<2> RESOURCE_INDEX_MODE; + bits<2> SAMPLER_INDEX_MODE; + + let Word0{4-0} = TEX_INST; + let Word0{6-5} = INST_MOD; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = RESOURCE_ID; + let Word0{22-16} = SRC_GPR; + let Word0{23} = SRC_REL; + let Word0{24} = ALT_CONST; + let Word0{26-25} = RESOURCE_INDEX_MODE; + let Word0{28-27} = SAMPLER_INDEX_MODE; +} + +class TEX_WORD1 { + field bits<32> Word1; + + bits<7> DST_GPR; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + 
bits<3> DST_SEL_W; + bits<7> LOD_BIAS; + bits<1> COORD_TYPE_X; + bits<1> COORD_TYPE_Y; + bits<1> COORD_TYPE_Z; + bits<1> COORD_TYPE_W; + + let Word1{6-0} = DST_GPR; + let Word1{7} = DST_REL; + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{27-21} = LOD_BIAS; + let Word1{28} = COORD_TYPE_X; + let Word1{29} = COORD_TYPE_Y; + let Word1{30} = COORD_TYPE_Z; + let Word1{31} = COORD_TYPE_W; +} + +class TEX_WORD2 { + field bits<32> Word2; + + bits<5> OFFSET_X; + bits<5> OFFSET_Y; + bits<5> OFFSET_Z; + bits<5> SAMPLER_ID; + bits<3> SRC_SEL_X; + bits<3> SRC_SEL_Y; + bits<3> SRC_SEL_Z; + bits<3> SRC_SEL_W; + + let Word2{4-0} = OFFSET_X; + let Word2{9-5} = OFFSET_Y; + let Word2{14-10} = OFFSET_Z; + let Word2{19-15} = SAMPLER_ID; + let Word2{22-20} = SRC_SEL_X; + let Word2{25-23} = SRC_SEL_Y; + let Word2{28-26} = SRC_SEL_Z; + let Word2{31-29} = SRC_SEL_W; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + +class CF_WORD1_R600 { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<3> COUNT; + bits<6> CALL_COUNT; + bits<1> COUNT_3; + bits<1> END_OF_PROGRAM; + bits<1> VALID_PIXEL_MODE; + bits<7> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{12-10} = COUNT; + let Word1{18-13} = CALL_COUNT; + let Word1{19} = COUNT_3; + let Word1{21} = END_OF_PROGRAM; + let Word1{22} = VALID_PIXEL_MODE; + let Word1{29-23} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_WORD0_EG { + field bits<32> Word0; + + bits<24> ADDR; + bits<3> JUMPTABLE_SEL; + + let Word0{23-0} = ADDR; + let Word0{26-24} = JUMPTABLE_SEL; +} + +class CF_WORD1_EG { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<6> COUNT; + bits<1> VALID_PIXEL_MODE; + bits<1> END_OF_PROGRAM; + bits<8> CF_INST; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{15-10} = COUNT; + let Word1{20} = VALID_PIXEL_MODE; + let Word1{21} = END_OF_PROGRAM; + let Word1{29-22} = CF_INST; + let Word1{31} = BARRIER; +} + +class CF_ALU_WORD0 { + field bits<32> Word0; + + bits<22> ADDR; + bits<4> KCACHE_BANK0; + bits<4> KCACHE_BANK1; + bits<2> KCACHE_MODE0; + + let Word0{21-0} = ADDR; + let Word0{25-22} = KCACHE_BANK0; + let Word0{29-26} = KCACHE_BANK1; + let Word0{31-30} = KCACHE_MODE0; +} + +class CF_ALU_WORD1 { + field bits<32> Word1; + + bits<2> KCACHE_MODE1; + bits<8> KCACHE_ADDR0; + bits<8> KCACHE_ADDR1; + bits<7> COUNT; + bits<1> ALT_CONST; + bits<4> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{1-0} = KCACHE_MODE1; + let Word1{9-2} = KCACHE_ADDR0; + let Word1{17-10} = KCACHE_ADDR1; + let Word1{24-18} = COUNT; + let Word1{25} = ALT_CONST; + let Word1{29-26} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_ALLOC_EXPORT_WORD0_RAT { + field bits<32> Word0; + + bits<4> rat_id; + bits<6> rat_inst; + bits<2> rim; + bits<2> type; + bits<7> rw_gpr; + bits<1> rw_rel; + bits<7> index_gpr; + bits<2> elem_size; + + let Word0{3-0} = rat_id; + let Word0{9-4} = rat_inst; + let Word0{10} = 0; // Reserved + let Word0{12-11} = rim; + let Word0{14-13} = type; + let Word0{21-15} = rw_gpr; + let Word0{22} = rw_rel; + let 
Word0{29-23} = index_gpr; + let Word0{31-30} = elem_size; +} + +class CF_ALLOC_EXPORT_WORD1_BUF { + field bits<32> Word1; + + bits<12> array_size; + bits<4> comp_mask; + bits<4> burst_count; + bits<1> vpm; + bits<1> eop; + bits<8> cf_inst; + bits<1> mark; + bits<1> barrier; + + let Word1{11-0} = array_size; + let Word1{15-12} = comp_mask; + let Word1{19-16} = burst_count; + let Word1{20} = vpm; + let Word1{21} = eop; + let Word1{29-22} = cf_inst; + let Word1{30} = mark; + let Word1{31} = barrier; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp new file mode 100644 index 0000000..5ef883c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -0,0 +1,1435 @@ +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + +#include "R600InstrInfo.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#include "AMDGPUGenDFAPacketizer.inc" + +R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { + return RI; +} + +bool R600InstrInfo::isTrig(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; +} + +bool R600InstrInfo::isVector(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; +} + +void +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned VectorComponents = 0; + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { + VectorComponents = 4; + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { + VectorComponents = 2; + } + + if (VectorComponents > 0) { + for (unsigned I = 0; I < VectorComponents; I++) { + unsigned SubRegIndex = RI.getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + RI.getSubReg(DestReg, SubRegIndex), + RI.getSubReg(SrcReg, SubRegIndex)) + .addReg(DestReg, + RegState::Define | RegState::Implicit); + } + } else { + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + DestReg, SrcReg); + NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) + .setIsKill(KillSrc); + } +} + +/// \returns true if \p MBBI can be moved into a new basic. 
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), + E = MBBI->operands_end(); I != E; ++I) { + if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && + I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) + return false; + } + return true; +} + +bool R600InstrInfo::isMov(unsigned Opcode) const { + + + switch(Opcode) { + default: return false; + case AMDGPU::MOV: + case AMDGPU::MOV_IMM_F32: + case AMDGPU::MOV_IMM_I32: + return true; + } +} + +// Some instructions act as place holders to emulate operations that the GPU +// hardware does automatically. This function can be used to check if +// an opcode falls into this category. +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { + switch (Opcode) { + default: return false; + case AMDGPU::RETURN: + return true; + } +} + +bool R600InstrInfo::isReductionOp(unsigned Opcode) const { + return false; +} + +bool R600InstrInfo::isCubeOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::CUBE_r600_pseudo: + case AMDGPU::CUBE_r600_real: + case AMDGPU::CUBE_eg_pseudo: + case AMDGPU::CUBE_eg_real: + return true; + } +} + +bool R600InstrInfo::isALUInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return (TargetFlags & R600_InstFlag::ALU_INST); +} + +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::OP1) | + (TargetFlags & R600_InstFlag::OP2) | + (TargetFlags & R600_InstFlag::OP3)); +} + +bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::LDS_1A) | + (TargetFlags & R600_InstFlag::LDS_1A1D) | + (TargetFlags & R600_InstFlag::LDS_1A2D)); +} + +bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; +} + +bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; +} + +bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { + if (isALUInstr(MI->getOpcode())) + return true; + if (isVector(*MI) || isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } +} + +bool R600InstrInfo::isTransOnly(unsigned Opcode) const { + if (ST.hasCaymanISA()) + return false; + return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); +} + +bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { + return isTransOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { + return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); +} + +bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { + return isVectorOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isExport(unsigned Opcode) const { + return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT); +} + +bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { + return ST.hasVertexCache() && IS_VTX(get(Opcode)); +} + +bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const 
R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + return MFI->getShaderType() != ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode()); +} + +bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { + return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); +} + +bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + return (MFI->getShaderType() == ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode())) || + usesTextureCache(MI->getOpcode()); +} + +bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::KILLGT: + case AMDGPU::GROUP_BARRIER: + return true; + default: + return false; + } +} + +bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { + if (!isALUInstr(MI->getOpcode())) { + return false; + } + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + if (!I->isReg() || !I->isUse() || + TargetRegisterInfo::isVirtualRegister(I->getReg())) + continue; + + if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) + return true; + } + return false; +} + +int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { + static const unsigned OpTable[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + + assert (SrcNum < 3); + return getOperandIdx(Opcode, OpTable[SrcNum]); +} + +int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { + static const unsigned SrcSelTable[][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} + }; + + for (const auto &Row : SrcSelTable) { + if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { + return getOperandIdx(Opcode, Row[1]); + } + } + return -1; +} + +SmallVector<std::pair<MachineOperand *, int64_t>, 3> +R600InstrInfo::getSrcs(MachineInstr *MI) const { + SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result; + + if (MI->getOpcode() == AMDGPU::DOT_4) { + static const unsigned OpTable[8][2] = { + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, + }; + + for (unsigned j = 0; j < 8; j++) { + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + OpTable[j][0])); + unsigned 
Reg = MO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), + OpTable[j][1])).getImm(); + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel)); + continue; + } + + } + return Result; + } + + static const unsigned OpTable[3][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + }; + + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + if (SrcIdx < 0) + break; + MachineOperand &MO = MI->getOperand(SrcIdx); + unsigned Reg = MI->getOperand(SrcIdx).getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand( + getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel)); + continue; + } + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned Imm = MI->getOperand( + getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm)); + continue; + } + Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0)); + } + return Result; +} + +std::vector<std::pair<int, unsigned> > +R600InstrInfo::ExtractSrcs(MachineInstr *MI, + const DenseMap<unsigned, unsigned> &PV, + unsigned &ConstCount) const { + ConstCount = 0; + ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI); + const std::pair<int, unsigned> DummyPair(-1, 0); + std::vector<std::pair<int, unsigned> > Result; + unsigned i = 0; + for (unsigned n = Srcs.size(); i < n; ++i) { + unsigned Reg = Srcs[i].first->getReg(); + unsigned Index = RI.getEncodingValue(Reg) & 0xff; + if (Reg == AMDGPU::OQAP) { + Result.push_back(std::pair<int, unsigned>(Index, 0)); + } + if (PV.find(Reg) != PV.end()) { + // 255 is used to tells its a PS/PV reg + Result.push_back(std::pair<int, unsigned>(255, 0)); + continue; + } + if (Index > 127) { + ConstCount++; + Result.push_back(DummyPair); + continue; + } + unsigned Chan = RI.getHWRegChan(Reg); + Result.push_back(std::pair<int, unsigned>(Index, Chan)); + } + for (; i < 3; ++i) + Result.push_back(DummyPair); + return Result; +} + +static std::vector<std::pair<int, unsigned> > +Swizzle(std::vector<std::pair<int, unsigned> > Src, + R600InstrInfo::BankSwizzle Swz) { + if (Src[0] == Src[1]) + Src[1].first = -1; + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: + break; + case R600InstrInfo::ALU_VEC_021_SCL_122: + std::swap(Src[1], Src[2]); + break; + case R600InstrInfo::ALU_VEC_102_SCL_221: + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_120_SCL_212: + std::swap(Src[0], Src[1]); + std::swap(Src[0], Src[2]); + break; + case R600InstrInfo::ALU_VEC_201: + std::swap(Src[0], Src[2]); + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_210: + std::swap(Src[0], Src[2]); + break; + } + return Src; +} + +static unsigned +getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: { + unsigned Cycles[3] = { 2, 1, 0}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_021_SCL_122: { + unsigned Cycles[3] = { 1, 2, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_120_SCL_212: { + unsigned Cycles[3] = { 2, 1, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_102_SCL_221: { + unsigned Cycles[3] = { 2, 2, 1}; + return Cycles[Op]; + } + default: + llvm_unreachable("Wrong Swizzle for Trans Slot"); + return 0; + } +} + +/// returns how 
many MIs (whose inputs are represented by IGSrcs) can be packed +/// in the same Instruction Group while meeting read port limitations given a +/// Swz swizzle sequence. +unsigned R600InstrInfo::isLegalUpTo( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<R600InstrInfo::BankSwizzle> &Swz, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + int Vector[4][3]; + memset(Vector, -1, sizeof(Vector)); + for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { + const std::vector<std::pair<int, unsigned> > &Srcs = + Swizzle(IGSrcs[i], Swz[i]); + for (unsigned j = 0; j < 3; j++) { + const std::pair<int, unsigned> &Src = Srcs[j]; + if (Src.first < 0 || Src.first == 255) + continue; + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && + Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { + // The value from output queue A (denoted by register OQAP) can + // only be fetched during the first cycle. + return false; + } + // OQAP does not count towards the normal read port restrictions + continue; + } + if (Vector[Src.second][j] < 0) + Vector[Src.second][j] = Src.first; + if (Vector[Src.second][j] != Src.first) + return i; + } + } + // Now check Trans Alu + for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) { + const std::pair<int, unsigned> &Src = TransSrcs[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (Src.first == 255) + continue; + if (Vector[Src.second][Cycle] < 0) + Vector[Src.second][Cycle] = Src.first; + if (Vector[Src.second][Cycle] != Src.first) + return IGSrcs.size() - 1; + } + return IGSrcs.size(); +} + +/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next +/// (in lexicographic term) swizzle sequence assuming that all swizzles after +/// Idx can be skipped +static bool +NextPossibleSolution( + std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, + unsigned Idx) { + assert(Idx < SwzCandidate.size()); + int ResetIdx = Idx; + while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210) + ResetIdx --; + for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) { + SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210; + } + if (ResetIdx == -1) + return false; + int NextSwizzle = SwzCandidate[ResetIdx] + 1; + SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle; + return true; +} + +/// Enumerate all possible Swizzle sequence to find one that can meet all +/// read port requirements. +bool R600InstrInfo::FindSwizzleForVectorSlot( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + unsigned ValidUpTo = 0; + do { + ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz); + if (ValidUpTo == IGSrcs.size()) + return true; + } while (NextPossibleSolution(SwzCandidate, ValidUpTo)); + return false; +} + +/// Instructions in Trans slot can't read gpr at cycle 0 if they also read +/// a const, and can't read a gpr at cycle 1 if they read 2 const. 
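+/// For example, with TransSwz == ALU_VEC_021_SCL_122 the three trans sources
+/// are read in cycles {1, 2, 2} (see getTransSwizzle above), so if the group
+/// reads two constants a GPR may only feed the sources that land in cycle 2,
+/// i.e. operands 1 and 2.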
+static bool +isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, + const std::vector<std::pair<int, unsigned> > &TransOps, + unsigned ConstCount) { + // TransALU can't read 3 constants + if (ConstCount > 2) + return false; + for (unsigned i = 0, e = TransOps.size(); i < e; ++i) { + const std::pair<int, unsigned> &Src = TransOps[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (ConstCount > 0 && Cycle == 0) + return false; + if (ConstCount > 1 && Cycle == 1) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, + const DenseMap<unsigned, unsigned> &PV, + std::vector<BankSwizzle> &ValidSwizzle, + bool isLastAluTrans) + const { + //Todo : support shared src0 - src1 operand + + std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs; + ValidSwizzle.clear(); + unsigned ConstCount; + BankSwizzle TransBS = ALU_VEC_012_SCL_210; + for (unsigned i = 0, e = IG.size(); i < e; ++i) { + IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); + unsigned Op = getOperandIdx(IG[i]->getOpcode(), + AMDGPU::OpName::bank_swizzle); + ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) + IG[i]->getOperand(Op).getImm()); + } + std::vector<std::pair<int, unsigned> > TransOps; + if (!isLastAluTrans) + return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); + + TransOps = std::move(IGSrcs.back()); + IGSrcs.pop_back(); + ValidSwizzle.pop_back(); + + static const R600InstrInfo::BankSwizzle TransSwz[] = { + ALU_VEC_012_SCL_210, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221 + }; + for (unsigned i = 0; i < 4; i++) { + TransBS = TransSwz[i]; + if (!isConstCompatible(TransBS, TransOps, ConstCount)) + continue; + bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, + TransBS); + if (Result) { + ValidSwizzle.push_back(TransBS); + return true; + } + } + + return false; +} + + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) + const { + assert (Consts.size() <= 12 && "Too many operands in instructions group"); + unsigned Pair1 = 0, Pair2 = 0; + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + unsigned ReadConstHalf = Consts[i] & 2; + unsigned ReadConstIndex = Consts[i] & (~3); + unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; + if (!Pair1) { + Pair1 = ReadHalfConst; + continue; + } + if (Pair1 == ReadHalfConst) + continue; + if (!Pair2) { + Pair2 = ReadHalfConst; + continue; + } + if (Pair2 != ReadHalfConst) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) + const { + std::vector<unsigned> Consts; + SmallSet<int64_t, 4> Literals; + for (unsigned i = 0, n = MIs.size(); i < n; i++) { + MachineInstr *MI = MIs[i]; + if (!isALUInstr(MI->getOpcode())) + continue; + + ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI); + + for (unsigned j = 0, e = Srcs.size(); j < e; j++) { + std::pair<MachineOperand *, unsigned> Src = Srcs[j]; + if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) + Literals.insert(Src.second); + if (Literals.size() > 4) + return false; + if (Src.first->getReg() == AMDGPU::ALU_CONST) + Consts.push_back(Src.second); + if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || + AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { + unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; + unsigned Chan = RI.getHWRegChan(Src.first->getReg()); + Consts.push_back((Index << 2) | Chan); + } + 
} + } + return fitsConstReadLimitations(Consts); +} + +DFAPacketizer * +R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { + const InstrItineraryData *II = STI.getInstrItineraryData(); + return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II); +} + +static bool +isPredicateSetter(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::PRED_X: + return true; + default: + return false; + } +} + +static MachineInstr * +findFirstPredicateSetterFrom(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + while (I != MBB.begin()) { + --I; + MachineInstr *MI = I; + if (isPredicateSetter(MI->getOpcode())) + return MI; + } + + return nullptr; +} + +static +bool isJump(unsigned Opcode) { + return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; +} + +static bool isBranch(unsigned Opcode) { + return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || + Opcode == AMDGPU::BRANCH_COND_f32; +} + +bool +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // Most of the following comes from the ARM implementation of AnalyzeBranch + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + // AMDGPU::BRANCH* instructions are only available after isel and are not + // handled + if (isBranch(I->getOpcode())) + return true; + if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) { + return false; + } + + // Remove successive JUMP + while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { + MachineBasicBlock::iterator PriorI = std::prev(I); + if (AllowModify) + I->removeFromParent(); + I = PriorI; + } + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || + !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) { + if (LastOpc == AMDGPU::JUMP) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastOpc == AMDGPU::JUMP_COND) { + MachineInstr *predSet = I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If the block ends with a B and a Bcc, handle it. + if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + MachineInstr *predSet = --I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = SecondLastInst->getOperand(0).getMBB(); + FBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + + // Otherwise, can't handle this. 
+ return true; +} + +static +MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); + It != E; ++It) { + if (It->getOpcode() == AMDGPU::CF_ALU || + It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return std::prev(It.base()); + } + return MBB.end(); +} + +unsigned +R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) { + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + return 1; + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 1; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 1; + } + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 2; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 2; + } +} + +unsigned +R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + + // Note : we leave PRED* instructions there. + // They may be needed when predicating instructions. + + MachineBasicBlock::iterator I = MBB.end(); + + if (I == MBB.begin()) { + return 0; + } + --I; + switch (I->getOpcode()) { + default: + return 0; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + I = MBB.end(); + + if (I == MBB.begin()) { + return 1; + } + --I; + switch (I->getOpcode()) { + // FIXME: only one case?? 
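+  // This second scan strips the remaining terminator when the block ended
+  // with a JUMP_COND followed by a JUMP: the switch above already removed the
+  // trailing JUMP, so the JUMP_COND is removed here (restoring the enclosing
+  // clause from CF_ALU_PUSH_BEFORE back to CF_ALU) before returning 2.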
+ default: + return 1; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + return 2; +} + +bool +R600InstrInfo::isPredicated(const MachineInstr *MI) const { + int idx = MI->findFirstPredOperandIdx(); + if (idx < 0) + return false; + + unsigned Reg = MI->getOperand(idx).getReg(); + switch (Reg) { + default: return false; + case AMDGPU::PRED_SEL_ONE: + case AMDGPU::PRED_SEL_ZERO: + case AMDGPU::PREDICATE_BIT: + return true; + } +} + +bool +R600InstrInfo::isPredicable(MachineInstr *MI) const { + // XXX: KILL* instructions can be predicated, but they must be the last + // instruction in a clause, so this means any instructions after them cannot + // be predicated. Until we have proper support for instruction clauses in the + // backend, we will mark KILL* instructions as unpredicable. + + if (MI->getOpcode() == AMDGPU::KILLGT) { + return false; + } else if (MI->getOpcode() == AMDGPU::CF_ALU) { + // If the clause start in the middle of MBB then the MBB has more + // than a single clause, unable to predicate several clauses. + if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) + return false; + // TODO: We don't support KC merging atm + if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) + return false; + return true; + } else if (isVector(*MI)) { + return false; + } else { + return AMDGPUInstrInfo::isPredicable(MI); + } +} + + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const{ + return true; +} + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, + unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, + unsigned ExtraFCycles, + const BranchProbability &Probability) const { + return true; +} + +bool +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + const BranchProbability &Probability) + const { + return true; +} + +bool +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const { + return false; +} + + +bool +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + MachineOperand &MO = Cond[1]; + switch (MO.getImm()) { + case OPCODE_IS_ZERO_INT: + MO.setImm(OPCODE_IS_NOT_ZERO_INT); + break; + case OPCODE_IS_NOT_ZERO_INT: + MO.setImm(OPCODE_IS_ZERO_INT); + break; + case OPCODE_IS_ZERO: + MO.setImm(OPCODE_IS_NOT_ZERO); + break; + case OPCODE_IS_NOT_ZERO: + MO.setImm(OPCODE_IS_ZERO); + break; + default: + return true; + } + + MachineOperand &MO2 = Cond[2]; + switch (MO2.getReg()) { + case AMDGPU::PRED_SEL_ZERO: + MO2.setReg(AMDGPU::PRED_SEL_ONE); + break; + case AMDGPU::PRED_SEL_ONE: + MO2.setReg(AMDGPU::PRED_SEL_ZERO); + break; + default: + return true; + } + return false; +} + +bool +R600InstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + return isPredicateSetter(MI->getOpcode()); +} + + +bool +R600InstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { + return false; +} + + +bool +R600InstrInfo::PredicateInstruction(MachineInstr *MI, 
+ ArrayRef<MachineOperand> Pred) const { + int PIdx = MI->findFirstPredOperandIdx(); + + if (MI->getOpcode() == AMDGPU::CF_ALU) { + MI->getOperand(8).setImm(0); + return true; + } + + if (MI->getOpcode() == AMDGPU::DOT_4) { + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) + .setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + return false; +} + +unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { + return 2; +} + +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (PredCost) + *PredCost = 2; + return 2; +} + +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + +void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + MF.getSubtarget().getFrameLowering()); + + unsigned StackWidth = TFL->getStackWidth(MF); + int End = getIndirectIndexEnd(MF); + + if (End == -1) + return; + + for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { + unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); + Reserved.set(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Reserved.set(Reg); + } + } +} + +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + // XXX: Remove when we support a stack width > 2 + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::R600_TReg32_XRegClass; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + 
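+  // The expansion is two ALU instructions: MOVA_INT loads OffsetReg into the
+  // address register AR_X, then a MOV with dst_rel set stores ValueReg to the
+  // indirect address register chosen below from Address and AddrChan.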
unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + AddrReg, ValueReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); + return Mov; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + ValueReg, + AddrReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); + + return Mov; +} + +unsigned R600InstrInfo::getMaxAlusPerClause() const { + return 115; +} + +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg) const { + MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), + DstReg); // $dst + + if (Src1Reg) { + MIB.addImm(0) // $update_exec_mask + .addImm(0); // $update_predicate + } + MIB.addImm(1) // $write + .addImm(0) // $omod + .addImm(0) // $dst_rel + .addImm(0) // $dst_clamp + .addReg(Src0Reg) // $src0 + .addImm(0) // $src0_neg + .addImm(0) // $src0_rel + .addImm(0) // $src0_abs + .addImm(-1); // $src0_sel + + if (Src1Reg) { + MIB.addReg(Src1Reg) // $src1 + .addImm(0) // $src1_neg + .addImm(0) // $src1_rel + .addImm(0) // $src1_abs + .addImm(-1); // $src1_sel + } + + //XXX: The r600g finalizer expects this to be 1, once we've moved the + //scheduling to the backend, we can change the default to 0. 
+ MIB.addImm(1) // $last + .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addImm(0) // $literal + .addImm(0); // $bank_swizzle + + return MIB; +} + +#define OPERAND_CASE(Label) \ + case Label: { \ + static const unsigned Ops[] = \ + { \ + Label##_X, \ + Label##_Y, \ + Label##_Z, \ + Label##_W \ + }; \ + return Ops[Slot]; \ + } + +static unsigned getSlotedOps(unsigned Op, unsigned Slot) { + switch (Op) { + OPERAND_CASE(AMDGPU::OpName::update_exec_mask) + OPERAND_CASE(AMDGPU::OpName::update_pred) + OPERAND_CASE(AMDGPU::OpName::write) + OPERAND_CASE(AMDGPU::OpName::omod) + OPERAND_CASE(AMDGPU::OpName::dst_rel) + OPERAND_CASE(AMDGPU::OpName::clamp) + OPERAND_CASE(AMDGPU::OpName::src0) + OPERAND_CASE(AMDGPU::OpName::src0_neg) + OPERAND_CASE(AMDGPU::OpName::src0_rel) + OPERAND_CASE(AMDGPU::OpName::src0_abs) + OPERAND_CASE(AMDGPU::OpName::src0_sel) + OPERAND_CASE(AMDGPU::OpName::src1) + OPERAND_CASE(AMDGPU::OpName::src1_neg) + OPERAND_CASE(AMDGPU::OpName::src1_rel) + OPERAND_CASE(AMDGPU::OpName::src1_abs) + OPERAND_CASE(AMDGPU::OpName::src1_sel) + OPERAND_CASE(AMDGPU::OpName::pred_sel) + default: + llvm_unreachable("Wrong Operand"); + } +} + +#undef OPERAND_CASE + +MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( + MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) + const { + assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + unsigned Opcode; + if (ST.getGeneration() <= AMDGPUSubtarget::R700) + Opcode = AMDGPU::DOT4_r600; + else + Opcode = AMDGPU::DOT4_eg; + MachineBasicBlock::iterator I = MI; + MachineOperand &Src0 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); + MachineOperand &Src1 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); + MachineInstr *MIB = buildDefaultInstruction( + MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); + static const unsigned Operands[14] = { + AMDGPU::OpName::update_exec_mask, + AMDGPU::OpName::update_pred, + AMDGPU::OpName::write, + AMDGPU::OpName::omod, + AMDGPU::OpName::dst_rel, + AMDGPU::OpName::clamp, + AMDGPU::OpName::src0_neg, + AMDGPU::OpName::src0_rel, + AMDGPU::OpName::src0_abs, + AMDGPU::OpName::src0_sel, + AMDGPU::OpName::src1_neg, + AMDGPU::OpName::src1_rel, + AMDGPU::OpName::src1_abs, + AMDGPU::OpName::src1_sel, + }; + + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); + MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + .setReg(MO.getReg()); + + for (unsigned i = 0; i < 14; i++) { + MachineOperand &MO = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + assert (MO.isImm()); + setImmOperand(MIB, Operands[i], MO.getImm()); + } + MIB->getOperand(20).setImm(0); + return MIB; +} + +MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const { + MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, + AMDGPU::ALU_LITERAL_X); + setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); + return MovImm; +} + +MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const { + return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); +} + +int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { + return getOperandIdx(MI.getOpcode(), Op); +} + +int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned 
Op) const { + return AMDGPU::getNamedOperandIdx(Opcode, Op); +} + +void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, + int64_t Imm) const { + int Idx = getOperandIdx(*MI, Op); + assert(Idx != -1 && "Operand not supported for this instruction."); + assert(MI->getOperand(Idx).isImm()); + MI->getOperand(Idx).setImm(Imm); +} + +//===----------------------------------------------------------------------===// +// Instruction flag getters/setters +//===----------------------------------------------------------------------===// + +bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { + return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; +} + +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + int FlagIndex = 0; + if (Flag != 0) { + // If we pass something other than the default value of Flag to this + // function, it means we are want to set a flag on an instruction + // that uses native encoding. + assert(HAS_NATIVE_OPERANDS(TargetFlags)); + bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; + switch (Flag) { + case MO_FLAG_CLAMP: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); + break; + case MO_FLAG_MASK: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); + break; + case MO_FLAG_NOT_LAST: + case MO_FLAG_LAST: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); + break; + case MO_FLAG_NEG: + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; + case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; + } + break; + + case MO_FLAG_ABS: + assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " + "instructions."); + (void)IsOP3; + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; + } + break; + + default: + FlagIndex = -1; + break; + } + assert(FlagIndex != -1 && "Flag not supported for this instruction"); + } else { + FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); + assert(FlagIndex != 0 && + "Instruction flags not supported for this instruction"); + } + + MachineOperand &FlagOp = MI->getOperand(FlagIndex); + assert(FlagOp.isImm()); + return FlagOp; +} + +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (Flag == 0) { + return; + } + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + if (Flag == MO_FLAG_NOT_LAST) { + clearFlag(MI, Operand, MO_FLAG_LAST); + } else if (Flag == MO_FLAG_MASK) { + clearFlag(MI, Operand, Flag); + } else { + FlagOp.setImm(1); + } + } else { + MachineOperand &FlagOp = getFlagOp(MI, Operand); + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); + } +} + +void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + FlagOp.setImm(0); + } else { + MachineOperand &FlagOp = getFlagOp(MI); + unsigned InstFlags = FlagOp.getImm(); + InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); + FlagOp.setImm(InstFlags); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h 
b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h new file mode 100644 index 0000000..9c5f76c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -0,0 +1,303 @@ +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600InstrInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "R600Defines.h" +#include "R600RegisterInfo.h" +#include <map> + +namespace llvm { + + class AMDGPUTargetMachine; + class DFAPacketizer; + class ScheduleDAG; + class MachineFunction; + class MachineInstr; + class MachineInstrBuilder; + + class R600InstrInfo : public AMDGPUInstrInfo { + private: + const R600RegisterInfo RI; + + std::vector<std::pair<int, unsigned> > + ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const; + + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + public: + enum BankSwizzle { + ALU_VEC_012_SCL_210 = 0, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221, + ALU_VEC_201, + ALU_VEC_210 + }; + + explicit R600InstrInfo(const AMDGPUSubtarget &st); + + const R600RegisterInfo &getRegisterInfo() const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; + + bool isTrig(const MachineInstr &MI) const; + bool isPlaceHolderOpcode(unsigned opcode) const; + bool isReductionOp(unsigned opcode) const; + bool isCubeOp(unsigned opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction. + bool isALUInstr(unsigned Opcode) const; + bool hasInstrModifiers(unsigned Opcode) const; + bool isLDSInstr(unsigned Opcode) const; + bool isLDSNoRetInstr(unsigned Opcode) const; + bool isLDSRetInstr(unsigned Opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction or an + /// instruction that will be lowered in ExpandSpecialInstrs Pass. 
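+  /// (In practice this also covers PRED_X, the INTERP_* pseudos, COPY and
+  /// DOT_4; see canBeConsideredALU() in R600InstrInfo.cpp.)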
+ bool canBeConsideredALU(const MachineInstr *MI) const; + + bool isTransOnly(unsigned Opcode) const; + bool isTransOnly(const MachineInstr *MI) const; + bool isVectorOnly(unsigned Opcode) const; + bool isVectorOnly(const MachineInstr *MI) const; + bool isExport(unsigned Opcode) const; + + bool usesVertexCache(unsigned Opcode) const; + bool usesVertexCache(const MachineInstr *MI) const; + bool usesTextureCache(unsigned Opcode) const; + bool usesTextureCache(const MachineInstr *MI) const; + + bool mustBeLastInClause(unsigned Opcode) const; + bool usesAddressRegister(MachineInstr *MI) const; + bool definesAddressRegister(MachineInstr *MI) const; + bool readsLDSSrcReg(const MachineInstr *MI) const; + + /// \returns The operand index for the given source number. Legal values + /// for SrcNum are 0, 1, and 2. + int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; + /// \returns The operand Index for the Sel operand given an index to one + /// of the instruction's src operands. + int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; + + /// \returns a pair for each src of an ALU instructions. + /// The first member of a pair is the register id. + /// If register is ALU_CONST, second member is SEL. + /// If register is ALU_LITERAL, second member is IMM. + /// Otherwise, second member value is undefined. + SmallVector<std::pair<MachineOperand *, int64_t>, 3> + getSrcs(MachineInstr *MI) const; + + unsigned isLegalUpTo( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<R600InstrInfo::BankSwizzle> &Swz, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + bool FindSwizzleForVectorSlot( + const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, + const std::vector<std::pair<int, unsigned> > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210 + /// returns true and the first (in lexical order) BankSwizzle affectation + /// starting from the one already provided in the Instruction Group MIs that + /// fits Read Port limitations in BS if available. Otherwise returns false + /// and undefined content in BS. + /// isLastAluTrans should be set if the last Alu of MIs will be executed on + /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to + /// apply to the last instruction. + /// PV holds GPR to PV registers in the Instruction Group MIs. + bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs, + const DenseMap<unsigned, unsigned> &PV, + std::vector<BankSwizzle> &BS, + bool isLastAluTrans) const; + + /// An instruction group can only access 2 channel pair (either [XY] or [ZW]) + /// from KCache bank on R700+. This function check if MI set in input meet + /// this limitations + bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const; + /// Same but using const index set instead of MI set. + bool fitsConstReadLimitations(const std::vector<unsigned>&) const; + + /// \brief Vector instructions are instructions that must fill all + /// instruction slots within an instruction group. 
+ bool isVector(const MachineInstr &MI) const; + + bool isMov(unsigned Opcode) const override; + + DFAPacketizer * + CreateTargetScheduleState(const TargetSubtargetInfo &) const override; + + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + bool isPredicated(const MachineInstr *MI) const override; + + bool isPredicable(MachineInstr *MI) const override; + + bool + isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + const BranchProbability &Probability) const override; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const override ; + + bool + isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + const BranchProbability &Probability) const override; + + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const override; + + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override; + + bool PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Pred) const override; + + unsigned int getPredicationCost(const MachineInstr *) const override; + + unsigned int getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override { return 1;} + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + /// \brief Reserve the registers that may be accesed using indirect addressing. + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + unsigned getMaxAlusPerClause() const; + + ///buildDefaultInstruction - This function returns a MachineInstr with + /// all the instruction modifiers initialized to their default values. + /// You can use this function to avoid manually specifying each instruction + /// modifier operand when building a new instruction. + /// + /// \returns a MachineInstr with all the instruction modifiers initialized + /// to their default values. 
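+  /// A typical use (this is how buildMovImm() below emits a literal MOV):
+  /// \code
+  ///   MachineInstr *Mov =
+  ///       buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
+  ///                               AMDGPU::ALU_LITERAL_X);
+  ///   setImmOperand(Mov, AMDGPU::OpName::literal, Imm);
+  /// \endcode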
+ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg = 0) const; + + MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, + MachineInstr *MI, + unsigned Slot, + unsigned DstReg) const; + + MachineInstr *buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + + /// \brief Get the index of Op in the MachineInstr. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(const MachineInstr &MI, unsigned Op) const; + + /// \brief Get the index of \p Op for the given Opcode. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(unsigned Opcode, unsigned Op) const; + + /// \brief Helper function for setting instruction flag values. + void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; + + /// \returns true if this instruction has an operand for storing target flags. + bool hasFlagOperand(const MachineInstr &MI) const; + + ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. + void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + + ///\brief Determine if the specified \p Flag is set on this \p Operand. + bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; + + /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) + /// \param Flag The flag being set. + /// + /// \returns the operand containing the flags for this instruction. + MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, + unsigned Flag = 0) const; + + /// \brief Clear the specified flag on the instruction. + void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; +}; + +namespace AMDGPU { + +int getLDSNoRetOp(uint16_t Opcode); + +} //End namespace AMDGPU + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td new file mode 100644 index 0000000..7beed09 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -0,0 +1,1744 @@ +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available on R600 family +// GPUs. 
+// +//===----------------------------------------------------------------------===// + +include "R600Intrinsics.td" +include "R600InstrFormats.td" + +class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> : + InstR600 <outs, ins, asm, pattern, NullALU> { + + let Namespace = "AMDGPU"; +} + +def MEMxi : Operand<iPTR> { + let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); + let PrintMethod = "printMemOperand"; +} + +def MEMrr : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); +} + +// Operands for non-registers + +class InstFlag<string PM = "printOperand", int Default = 0> + : OperandWithDefaultOps <i32, (ops (i32 Default))> { + let PrintMethod = PM; +} + +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers +def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> { + let PrintMethod = "printSel"; +} +def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printBankSwizzle"; +} + +def LITERAL : InstFlag<"printLiteral">; + +def WRITE : InstFlag <"printWrite", 1>; +def OMOD : InstFlag <"printOMOD">; +def REL : InstFlag <"printRel">; +def CLAMP : InstFlag <"printClamp">; +def NEG : InstFlag <"printNeg">; +def ABS : InstFlag <"printAbs">; +def UEM : InstFlag <"printUpdateExecMask">; +def UP : InstFlag <"printUpdatePred">; + +// XXX: The r600g finalizer in Mesa expects last to be one in most cases. +// Once we start using the packetizer in this backend we should have this +// default to 0. +def LAST : InstFlag<"printLast", 1>; +def RSel : Operand<i32> { + let PrintMethod = "printRSel"; +} +def CT: Operand<i32> { + let PrintMethod = "printCT"; +} + +def FRAMEri : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + +def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>; +def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; +def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; +def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>; +def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>; + + +def R600_Pred : PredicateOperand<i32, (ops R600_Predicate), + (ops PRED_SEL_OFF)>; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +// Class for instructions with only one source register. +// If you add new ins to this instruction, make sure they are listed before +// $literal, because the backend currently assumes that the last operand is +// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), +// and R600InstrInfo::getOperandIdx(). 
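+// For reference, R600InstrInfo::buildDefaultInstruction() fills the operands
+// of these classes in the following order:
+//   dst, [update_exec_mask, update_pred,] write, omod, dst_rel, clamp,
+//   src0, src0_neg, src0_rel, src0_abs, src0_sel,
+//   [src1, src1_neg, src1_rel, src1_abs, src1_sel,]
+//   last, pred_sel, literal, bank_swizzle
+// where the bracketed groups are only present on two-source instructions.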
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 <inst> { + + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let update_exec_mask = 0; + let update_pred = 0; + let HasNativeOperands = 1; + let Op1 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_1OP <inst, opName, + [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin +>; + +// If you add or change the operands for R600_2OP instructions, you must +// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). +class R600_2OP <bits<11> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, + OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 <inst> { + + let HasNativeOperands = 1; + let Op2 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_2OP <inst, opName, + [(set R600_Reg32:$dst, (node R600_Reg32:$src0, + R600_Reg32:$src1))], itin +>; + +// If you add our change the operands for R600_3OP instructions, you must +// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and +// R600InstrInfo::getOperandIdx(). 
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$clamp $last $dst$dst_rel, " + "$src0_neg$src0$src0_rel, " + "$src1_neg$src1$src1_rel, " + "$src2_neg$src2$src2_rel, " + "$pred_sel" + "$bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP3<inst>{ + + let HasNativeOperands = 1; + let DisableEncoding = "$literal"; + let Op3 = 1; + let UseNamedOperandTable = 1; + let ALUInst = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, + InstrItinClass itin = VecALU> : + InstR600 <(outs R600_Reg32:$dst), + ins, + asm, + pattern, + itin>; + + + +} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || TType == 13; + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 16; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; + +def TEX_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 14; + }] +>; + +def TEX_ARRAY_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 15; + }] +>; + +class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, + dag outs, dag ins, string asm, list<dag> pattern> : + InstR600ISA <outs, ins, asm, pattern>, + CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF { + + let rat_id = ratid; + let rat_inst = ratinst; + let rim = 0; + // XXX: Have a separate instruction for non-indexed writes. + let type = 1; + let rw_rel = 0; + let elem_size = 0; + + let array_size = 0; + let comp_mask = mask; + let burst_count = 0; + let vpm = 0; + let cf_inst = cfinst; + let mark = 0; + let barrier = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; + +} + +class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : InstR600ISA <outs, (ins MEMxi:$src_gpr), name, pattern>, + VTX_WORD1_GPR { + + // Static fields + let DST_REL = 0; + // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, + // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, + // however, based on my testing if USE_CONST_FIELDS is set, then all + // these fields need to be set to 0. 
+ let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 1; + let FORMAT_COMP_ALL = 0; + let SRF_MODE_ALL = 0; + + let Inst{63-32} = Word1; + // LLVM can only encode 64-bit instructions, so these fields are manually + // encoded in R600CodeEmitter + // + // bits<16> OFFSET; + // bits<2> ENDIAN_SWAP = 0; + // bits<1> CONST_BUF_NO_STRIDE = 0; + // bits<1> MEGA_FETCH = 0; + // bits<1> ALT_CONST = 0; + // bits<2> BUFFER_INDEX_MODE = 0; + + // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding + // is done in R600CodeEmitter + // + // Inst{79-64} = OFFSET; + // Inst{81-80} = ENDIAN_SWAP; + // Inst{82} = CONST_BUF_NO_STRIDE; + // Inst{83} = MEGA_FETCH; + // Inst{84} = ALT_CONST; + // Inst{86-85} = BUFFER_INDEX_MODE; + // Inst{95-86} = 0; Reserved + + // VTX_WORD3 (Padding) + // + // Inst{127-96} = 0; + + let VTXInst = 1; +} + +class LoadParamFrag <PatFrag load_type> : PatFrag < + (ops node:$ptr), (load_type node:$ptr), + [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), 0); }] +>; + +def load_param : LoadParamFrag<load>; +def load_param_exti8 : LoadParamFrag<az_extloadi8>; +def load_param_exti16 : LoadParamFrag<az_extloadi16>; + +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; + +def isR600toCayman + : Predicate< + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; + +//===----------------------------------------------------------------------===// +// R600 SDNodes +//===----------------------------------------------------------------------===// + +def INTERP_PAIR_XY : AMDGPUShaderInst < + (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", + []>; + +def INTERP_PAIR_ZW : AMDGPUShaderInst < + (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", + []>; + +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", + SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPVariadic] +>; + +def DOT4 : SDNode<"AMDGPUISD::DOT4", + SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, + SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, + SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, + [] +>; + +def COS_HW : SDNode<"AMDGPUISD::COS_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def SIN_HW : SDNode<"AMDGPUISD::SIN_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; + +def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; + +multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> { +def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, + (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), + (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), + (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), + (i32 imm:$DST_SEL_W), + (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID), + (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z), + (i32 imm:$COORD_TYPE_W)), + (inst R600_Reg128:$SRC_GPR, + imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw, + imm:$offsetx, imm:$offsety, imm:$offsetz, + imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z, + imm:$DST_SEL_W, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, + imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z, + imm:$COORD_TYPE_W)>; +} + 
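The TEX_SHADOW, TEX_RECT, TEX_ARRAY, TEX_SHADOW_ARRAY, TEX_MSAA and TEX_ARRAY_MSAA PatLeaf predicates above each test an immediate texture-target code (the $textureTarget operand used by intrinsic patterns such as TXD_SHADOW later in this file). As a standalone restatement of those checks, in plain C++ with invented helper names rather than anything from the LLVM tree, the classification reads:

#include <cstdint>
#include <cstdio>

// Same checks as the PatLeaf predicates above, written as ordinary functions.
// The numeric texture-target codes are copied from the predicates; the
// function names are invented for this illustration only.
static bool isShadowTarget(uint32_t T)      { return (T >= 6 && T <= 8) || T == 13; }
static bool isRectTarget(uint32_t T)        { return T == 5; }
static bool isArrayTarget(uint32_t T)       { return T == 9 || T == 10 || T == 16; }
static bool isShadowArrayTarget(uint32_t T) { return T == 11 || T == 12 || T == 17; }
static bool isMSAATarget(uint32_t T)        { return T == 14; }
static bool isArrayMSAATarget(uint32_t T)   { return T == 15; }

int main() {
  // Dump the classification for every target code the predicates mention.
  for (uint32_t T = 0; T <= 17; ++T)
    std::printf("target %2u: shadow=%d rect=%d array=%d shadow_array=%d msaa=%d array_msaa=%d\n",
                T, isShadowTarget(T), isRectTarget(T), isArrayTarget(T),
                isShadowArrayTarget(T), isMSAATarget(T), isArrayMSAATarget(T));
  return 0;
}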
+//===----------------------------------------------------------------------===// +// Interpolation Instructions +//===----------------------------------------------------------------------===// + +def INTERP_VEC_LOAD : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins i32imm:$src0), + "INTERP_LOAD $src0 : $dst", + [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; + +def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { + let bank_swizzle = 5; +} + +def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { + let bank_swizzle = 5; +} + +def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; + +//===----------------------------------------------------------------------===// +// Export Instructions +//===----------------------------------------------------------------------===// + +def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; + +def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, + [SDNPHasChain, SDNPSideEffect]>; + +class ExportWord0 { + field bits<32> Word0; + + bits<13> arraybase; + bits<2> type; + bits<7> gpr; + bits<2> elem_size; + + let Word0{12-0} = arraybase; + let Word0{14-13} = type; + let Word0{21-15} = gpr; + let Word0{22} = 0; // RW_REL + let Word0{29-23} = 0; // INDEX_GPR + let Word0{31-30} = elem_size; +} + +class ExportSwzWord1 { + field bits<32> Word1; + + bits<3> sw_x; + bits<3> sw_y; + bits<3> sw_z; + bits<3> sw_w; + bits<1> eop; + bits<8> inst; + + let Word1{2-0} = sw_x; + let Word1{5-3} = sw_y; + let Word1{8-6} = sw_z; + let Word1{11-9} = sw_w; +} + +class ExportBufWord1 { + field bits<32> Word1; + + bits<12> arraySize; + bits<4> compMask; + bits<1> eop; + bits<8> inst; + + let Word1{11-0} = arraySize; + let Word1{15-12} = compMask; +} + +multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { + def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 0, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 7, 0, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy (i32 imm:$type)), + (ExportInst + (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy 1), + (ExportInst + (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$base, + imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) + >; + +} + +multiclass SteamOutputExportPattern<Instruction ExportInst, + bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { +// Stream0 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf0inst, 0)>; +// Stream1 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf1inst, 0)>; +// Stream2 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf2inst, 0)>; +// Stream3 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), + 
(ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf3inst, 0)>; +} + +// Export Instructions should not be duplicated by TailDuplication pass +// (which assumes that duplicable instruction are affected by exec mask) +let usesCustomInserter = 1, isNotDuplicable = 1 in { + +class ExportSwzInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst, + i32imm:$eop), + !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"), + []>, ExportWord0, ExportSwzWord1 { + let elem_size = 3; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +} // End usesCustomInserter = 1 + +class ExportBufInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportBufWord1 { + let elem_size = 0; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + + +def KCACHE : InstFlag<"printKCache">; + +class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs), +(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, +KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, +i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, +i32imm:$COUNT, i32imm:$Enabled), +!strconcat(OpName, " $COUNT, @$ADDR, " +"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), +[] >, CF_ALU_WORD0, CF_ALU_WORD1 { + field bits<64> Inst; + + let CF_INST = inst; + let ALT_CONST = 0; + let WHOLE_QUAD_MODE = 0; + let BARRIER = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_WORD0_R600 { + field bits<32> Word0; + + bits<32> ADDR; + + let Word0 = ADDR; +} + +class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { + field bits<64> Inst; + bits<4> CNT; + + let CF_INST = inst; + let BARRIER = 1; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let COUNT = CNT{2-0}; + let CALL_COUNT = 0; + let COUNT_3 = CNT{3}; + let END_OF_PROGRAM = 0; + let WHOLE_QUAD_MODE = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { + field bits<64> Inst; + + let CF_INST = inst; + let BARRIER = 1; + let JUMPTABLE_SEL = 0; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let END_OF_PROGRAM = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +def CF_ALU : ALU_CLAUSE<8, "ALU">; +def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; +def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; +def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; +def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; +def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">; + +def FETCH_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def ALU_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def LITERALS : AMDGPUInst <(outs), +(ins 
LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { + let isCodeGenOnly = 1; + + field bits<64> Inst; + bits<32> literal1; + bits<32> literal2; + + let Inst{31-0} = literal1; + let Inst{63-32} = literal2; +} + +def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { + field bits<64> Inst; +} + +let Predicates = [isR600toCayman] in { + +//===----------------------------------------------------------------------===// +// Common Instructions R600, R700, Evergreen, Cayman +//===----------------------------------------------------------------------===// + +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; +// Non-IEEE MUL: 0 * anything = 0 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; +// TODO: Do these actually match the regular fmin/fmax behavior? +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; +// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx +// DX10 min/max returns the other operand if one is NaN, +// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic +def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>; +def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; + +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, +// so some of the instruction names don't match the asm string. +// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. +def SETE : R600_2OP < + 0x08, "SETE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] +>; + +def SGT : R600_2OP < + 0x09, "SETGT", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] +>; + +def SGE : R600_2OP < + 0xA, "SETGE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] +>; + +def SNE : R600_2OP < + 0xB, "SETNE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] +>; + +def SETE_DX10 : R600_2OP < + 0xC, "SETE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))] +>; + +def SETGT_DX10 : R600_2OP < + 0xD, "SETGT_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))] +>; + +def SETGE_DX10 : R600_2OP < + 0xE, "SETGE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] +>; + +// FIXME: This should probably be COND_ONE +def SETNE_DX10 : R600_2OP < + 0xF, "SETNE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] +>; + +def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; +def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; +def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; + +def MOV : R600_1OP <0x19, "MOV", []>; + +let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { + +class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst < + (outs R600_Reg32:$dst), + (ins immType:$imm), + "", + [] +>; + +} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 + +def MOV_IMM_I32 : MOV_IMM<i32, i32imm>; +def : Pat < + (imm:$val), + (MOV_IMM_I32 imm:$val) +>; + +def MOV_IMM_F32 : MOV_IMM<f32, f32imm>; +def : Pat < + (fpimm:$val), + (MOV_IMM_F32 fpimm:$val) +>; + +def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; +def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; +def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; +def PRED_SETNE : R600_2OP <0x23, 
"PRED_SETNE", []>; + +let hasSideEffects = 1 in { + +def KILLGT : R600_2OP <0x2D, "KILLGT", []>; + +} // end hasSideEffects + +def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; +def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; +def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; +def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; +def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; +def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>; +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>; +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>; +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>; + +def SETE_INT : R600_2OP < + 0x3A, "SETE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] +>; + +def SETGT_INT : R600_2OP < + 0x3B, "SETGT_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] +>; + +def SETGE_INT : R600_2OP < + 0x3C, "SETGE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] +>; + +def SETNE_INT : R600_2OP < + 0x3D, "SETNE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] +>; + +def SETGT_UINT : R600_2OP < + 0x3E, "SETGT_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] +>; + +def SETGE_UINT : R600_2OP < + 0x3F, "SETGE_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] +>; + +def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; +def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>; +def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; +def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; + +def CNDE_INT : R600_3OP < + 0x1C, "CNDE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] +>; + +def CNDGE_INT : R600_3OP < + 0x1E, "CNDGE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))] +>; + +def CNDGT_INT : R600_3OP < + 0x1D, "CNDGT_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))] +>; + +//===----------------------------------------------------------------------===// +// Texture instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +class R600_TEX <bits<11> inst, string opName> : + InstR600 <(outs R600_Reg128:$DST_GPR), + (ins R600_Reg128:$SRC_GPR, + RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw, + i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz, + RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W, + i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, + CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, + CT:$COORD_TYPE_W), + !strconcat(opName, + " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " + "$SRC_GPR.$srcx$srcy$srcz$srcw " + "RID:$RESOURCE_ID SID:$SAMPLER_ID " + "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"), + [], + NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 { + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let TEX_INST = inst{4-0}; + let SRC_REL = 0; + let DST_REL = 0; + let LOD_BIAS = 0; + + let INST_MOD = 0; + let FETCH_WHOLE_QUAD = 0; + let ALT_CONST = 0; + let SAMPLER_INDEX_MODE = 0; + let RESOURCE_INDEX_MODE = 0; + + let TEXInst = 1; +} + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + + + +def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">; +def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">; +def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">; +def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">; +def 
TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">; +def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">; +def TEX_LD : R600_TEX <0x03, "TEX_LD">; +def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> { + let INST_MOD = 1; +} +def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">; +def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">; +def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">; +def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">; +def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">; +def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">; +def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">; + +defm : TexPattern<0, TEX_SAMPLE>; +defm : TexPattern<1, TEX_SAMPLE_C>; +defm : TexPattern<2, TEX_SAMPLE_L>; +defm : TexPattern<3, TEX_SAMPLE_C_L>; +defm : TexPattern<4, TEX_SAMPLE_LB>; +defm : TexPattern<5, TEX_SAMPLE_C_LB>; +defm : TexPattern<6, TEX_LD, v4i32>; +defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>; +defm : TexPattern<8, TEX_GET_GRADIENTS_H>; +defm : TexPattern<9, TEX_GET_GRADIENTS_V>; +defm : TexPattern<10, TEX_LDPTR, v4i32>; + +//===----------------------------------------------------------------------===// +// Helper classes for common instructions +//===----------------------------------------------------------------------===// + +class MUL_LIT_Common <bits<5> inst> : R600_3OP < + inst, "MUL_LIT", + [] +>; + +class MULADD_Common <bits<5> inst> : R600_3OP < + inst, "MULADD", + [] +>; + +class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < + inst, "MULADD_IEEE", + [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] +>; + +class FMA_Common <bits<5> inst> : R600_3OP < + inst, "FMA", + [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU +>; + +class CNDE_Common <bits<5> inst> : R600_3OP < + inst, "CNDE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] +>; + +class CNDGT_Common <bits<5> inst> : R600_3OP < + inst, "CNDGT", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] +> { + let Itinerary = VecALU; +} + +class CNDGE_Common <bits<5> inst> : R600_3OP < + inst, "CNDGE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] +> { + let Itinerary = VecALU; +} + + +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins +// Slot X + UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, + OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, + R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, + R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, + R600_Pred:$pred_sel_X, +// Slot Y + UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, + OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, + R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, + R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, + R600_Pred:$pred_sel_Y, +// Slot Z + UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, + OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, + R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, + R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, + R600_Pred:$pred_sel_Z, +// Slot W + UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, + OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, + 
R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, + R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, + R600_Pred:$pred_sel_W, + LITERAL:$literal0, LITERAL:$literal1), + "", + pattern, + AnyALU> { + + let UseNamedOperandTable = 1; + +} +} + +def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 + R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, + R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, + R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, + R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; + + +class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { +multiclass CUBE_Common <bits<11> inst> { + + def _pseudo : InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0), + "CUBE $dst $src0", + [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], + VecALU + > { + let isPseudo = 1; + let UseNamedOperandTable = 1; + } + + def _real : R600_2OP <inst, "CUBE", []>; +} +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + +class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "EXP_IEEE", fexp2 +> { + let Itinerary = TransALU; +} + +class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_INT", fp_to_sint +> { + let Itinerary = TransALU; +} + +class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "INT_TO_FLT", sint_to_fp +> { + let Itinerary = TransALU; +} + +class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_UINT", fp_to_uint +> { + let Itinerary = TransALU; +} + +class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "UINT_TO_FLT", uint_to_fp +> { + let Itinerary = TransALU; +} + +class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "LOG_CLAMPED", [] +>; + +class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "LOG_IEEE", flog2 +> { + let Itinerary = TransALU; +} + +class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>; +class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>; +class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>; +class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI_INT", mulhs +> { + let Itinerary = TransALU; +} +class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI", mulhu +> { + let Itinerary = TransALU; +} +class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULLO_INT", mul +> { + let Itinerary = TransALU; +} +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> { + let Itinerary = TransALU; +} + +class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_CLAMPED", [] +> { + let Itinerary = TransALU; +} + +class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] +> { + let Itinerary = TransALU; +} + +class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIP_UINT", AMDGPUurecip +> { + let Itinerary = TransALU; +} + +// Clamped to maximum. +class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped +> { + let Itinerary = TransALU; +} + +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy +> { + let Itinerary = TransALU; +} + +// TODO: There is also RECIPSQRT_FF which clamps to zero. 
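Division and square root are not matched directly in this file; the DIV_Common multiclass and the fsqrt pattern further down rewrite them as a multiply by one of the reciprocal helpers above (MUL_IEEE with RECIP_IEEE for fdiv, MUL with RECIPSQRT_CLAMPED for fsqrt). A minimal scalar sketch of that algebra, using ordinary C++ stand-ins for the hardware reciprocal ops and valid only under the relaxed floating-point rules the FIXME below asks for:

#include <cmath>
#include <cstdio>

// Software stand-ins for the hardware helpers; these only illustrate the
// algebra, not how the R600 transcendental unit actually computes the values.
static float recip_ieee(float x) { return 1.0f / x; }               // RECIP_IEEE
static float recipsqrt(float x)  { return 1.0f / std::sqrt(x); }    // RECIPSQRT_*

// fdiv a, b  ->  MUL_IEEE a, RECIP_IEEE(b)
static float expand_fdiv(float a, float b) { return a * recip_ieee(b); }

// fsqrt x    ->  MUL x, RECIPSQRT_CLAMPED(x); x * 1/sqrt(x) == sqrt(x) for x > 0
static float expand_fsqrt(float x) { return x * recipsqrt(x); }

int main() {
  std::printf("3/4     = %f (expected 0.75)\n", expand_fdiv(3.0f, 4.0f));
  std::printf("sqrt(2) = %f (expected %f)\n", expand_fsqrt(2.0f), std::sqrt(2.0f));
  return 0;
}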
+ +class SIN_Common <bits<11> inst> : R600_1OP < + inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ + let Trig = 1; + let Itinerary = TransALU; +} + +class COS_Common <bits<11> inst> : R600_1OP < + inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> { + let Trig = 1; + let Itinerary = TransALU; +} + +def CLAMP_R600 : CLAMP <R600_Reg32>; +def FABS_R600 : FABS<R600_Reg32>; +def FNEG_R600 : FNEG<R600_Reg32>; + +//===----------------------------------------------------------------------===// +// Helper patterns for complex intrinsics +//===----------------------------------------------------------------------===// + +// FIXME: Should be predicated on unsafe fp math. +multiclass DIV_Common <InstR600 recip_ieee> { +def : Pat< + (int_AMDGPU_div f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : Pat< + (fdiv f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : RcpPat<recip_ieee, f32>; +} + +class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> + : Pat < + (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), + (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) +>; + +//===----------------------------------------------------------------------===// +// R600 / R700 Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isR600] in { + + def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; + def MULADD_r600 : MULADD_Common<0x10>; + def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; + def CNDE_r600 : CNDE_Common<0x18>; + def CNDGT_r600 : CNDGT_Common<0x19>; + def CNDGE_r600 : CNDGE_Common<0x1A>; + def DOT4_r600 : DOT4_Common<0x50>; + defm CUBE_r600 : CUBE_Common<0x52>; + def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; + def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; + def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; + def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; + def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; + def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; + def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; + def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; + def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; + def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; + def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; + def SIN_r600 : SIN_Common<0x6E>; + def COS_r600 : COS_Common<0x6F>; + def ASHR_r600 : ASHR_Common<0x70>; + def LSHR_r600 : LSHR_Common<0x71>; + def LSHL_r600 : LSHL_Common<0x72>; + def MULLO_INT_r600 : MULLO_INT_Common<0x73>; + def MULHI_INT_r600 : MULHI_INT_Common<0x74>; + def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; + def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; + def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; + + defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; + def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; + def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; + + def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; + def : RsqPat<RECIPSQRT_IEEE_r600, f32>; + + def R600_ExportSwz : ExportSwzInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : ExportPattern<R600_ExportSwz, 39>; + + def R600_ExportBuf : ExportBufInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 
0x22, 0x23>; + + def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT), + "TEX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT), + "VTX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), + "PUSH_ELSE @$ADDR"> { + let CNT = 0; + let POP_COUNT = 0; // FIXME? + } + def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { + let ADDR = 0; + let CNT = 0; + let POP_COUNT = 0; + } + def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { + let CNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + +} + + +//===----------------------------------------------------------------------===// +// Regist loads and stores - for indirect addressing +//===----------------------------------------------------------------------===// + +defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; + + +//===----------------------------------------------------------------------===// +// Pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +def PRED_X : InstR600 < + (outs R600_Predicate_Bit:$dst), + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), + "", [], NullALU> { + let FlagOperandIdx = 3; +} + +let isTerminator = 1, isBranch = 1 in { +def JUMP_COND : InstR600 < + (outs), + (ins brtarget:$target, R600_Predicate_Bit:$p), + "JUMP $target ($p)", + [], AnyALU + >; + +def JUMP : InstR600 < + (outs), + (ins brtarget:$target), + "JUMP $target", + [], AnyALU + > +{ + let isPredicable = 1; + let isBarrier = 1; +} + +} // End isTerminator = 1, isBranch = 1 + +let usesCustomInserter = 1 in { + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { + +def MASK_WRITE : AMDGPUShaderInst < + (outs), + (ins R600_Reg32:$src), + "MASK_WRITE $src", + [] +>; + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 + + +def TXD: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + NullALU > { + let TEXInst = 1; +} + +def TXD_SHADOW: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set 
v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], + NullALU +> { + let TEXInst = 1; +} +} // End isPseudo = 1 +} // End usesCustomInserter = 1 + + +//===----------------------------------------------------------------------===// +// Constant Buffer Addressing Support +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +def CONST_COPY : Instruction { + let OutOperandList = (outs R600_Reg32:$dst); + let InOperandList = (ins i32imm:$src); + let Pattern = + [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let AsmString = "CONST_COPY"; + let hasSideEffects = 0; + let isAsCheapAsAMove = 1; + let Itinerary = NullALU; +} +} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" + +def TEX_VTX_CONSTBUF : + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", + [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + VTX_WORD1_GPR, VTX_WORD0_eg { + + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let SRC_REL = 0; + let SRC_SEL_X = 0; + let DST_REL = 0; + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 2; + let FORMAT_COMP_ALL = 1; + let SRF_MODE_ALL = 1; + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 35; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +def TEX_VTX_TEXBUF: + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", + [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, +VTX_WORD1_GPR, VTX_WORD0_eg { + +let VC_INST = 0; +let FETCH_TYPE = 2; +let FETCH_WHOLE_QUAD = 0; +let SRC_REL = 0; +let SRC_SEL_X = 0; +let DST_REL = 0; +let USE_CONST_FIELDS = 1; +let NUM_FORMAT_ALL = 0; +let FORMAT_COMP_ALL = 0; +let SRF_MODE_ALL = 1; +let MEGA_FETCH_COUNT = 16; +let DST_SEL_X = 0; +let DST_SEL_Y = 1; +let DST_SEL_Z = 2; +let DST_SEL_W = 3; +let DATA_FORMAT = 0; + +let Inst{31-0} = Word0; +let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// 
Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 1; +} + +multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, rci:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, (i32 rci:$src0))]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, rcf:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, (f32 rcf:$src0))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} + +//===---------------------------------------------------------------------===// +// Custom Inserter for Branches and returns, this eventually will be a +// separate pass +//===---------------------------------------------------------------------===// +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { + def BRANCH : ILFormat<(outs), (ins brtarget:$target), + "; Pseudo unconditional branch instruction", + [(br bb:$target)]>; + defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>; +} + +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} + +//===----------------------------------------------------------------------===// +// Branch Instructions +//===----------------------------------------------------------------------===// + +def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), + "IF_PREDICATE_SET $src", []>; + +let isTerminator=1 in { + def BREAK : ILFormat< (outs), (ins), + "BREAK", []>; + def CONTINUE : ILFormat< (outs), (ins), + "CONTINUE", []>; + def DEFAULT : ILFormat< (outs), (ins), + "DEFAULT", []>; + def ELSE : ILFormat< (outs), (ins), + "ELSE", []>; + def ENDSWITCH : ILFormat< (outs), (ins), + "ENDSWITCH", []>; + def ENDMAIN : ILFormat< (outs), (ins), + "ENDMAIN", []>; + def END : ILFormat< (outs), (ins), + "END", []>; + def ENDFUNC : ILFormat< (outs), (ins), + "ENDFUNC", []>; + def ENDIF : ILFormat< (outs), (ins), + "ENDIF", []>; + def WHILELOOP : ILFormat< (outs), (ins), + "WHILE", []>; + def ENDLOOP : ILFormat< (outs), (ins), + "ENDLOOP", []>; + def FUNC : 
ILFormat< (outs), (ins), + "FUNC", []>; + def RETDYN : ILFormat< (outs), (ins), + "RET_DYN", []>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; + defm IFC : BranchInstr2<"IFC">; + defm BREAKC : BranchInstr2<"BREAKC">; + defm CONTINUEC : BranchInstr2<"CONTINUEC">; +} + +//===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical <RegisterClass vec_rc> : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical <RegisterClass vec_rc> : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>; +def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>; + +def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>; +def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>; + +class ExtractVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>; + +class InsertVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; + +//===----------------------------------------------------------------------===// +// ISel Patterns +//===----------------------------------------------------------------------===// + +// CND*_INT Pattterns for f32 True / False values + +class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < + (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), + (cnd $src0, $src1, $src2) +>; + +def : CND_INT_f32 <CNDE_INT, SETEQ>; +def : CND_INT_f32 <CNDGT_INT, SETGT>; +def : CND_INT_f32 <CNDGE_INT, SETGE>; + +//CNDGE_INT extra pattern +def : Pat < + (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT), + (CNDGE_INT $src0, $src1, $src2) +>; + +// KIL Patterns +def KILP : Pat < + (int_AMDGPU_kilp), + (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) +>; + +def KIL : Pat < + (int_AMDGPU_kill f32:$src0), + 
(MASK_WRITE (KILLGT (f32 ZERO), $src0)) +>; + +def : Extract_Element <f32, v4f32, 0, sub0>; +def : Extract_Element <f32, v4f32, 1, sub1>; +def : Extract_Element <f32, v4f32, 2, sub2>; +def : Extract_Element <f32, v4f32, 3, sub3>; + +def : Insert_Element <f32, v4f32, 0, sub0>; +def : Insert_Element <f32, v4f32, 1, sub1>; +def : Insert_Element <f32, v4f32, 2, sub2>; +def : Insert_Element <f32, v4f32, 3, sub3>; + +def : Extract_Element <i32, v4i32, 0, sub0>; +def : Extract_Element <i32, v4i32, 1, sub1>; +def : Extract_Element <i32, v4i32, 2, sub2>; +def : Extract_Element <i32, v4i32, 3, sub3>; + +def : Insert_Element <i32, v4i32, 0, sub0>; +def : Insert_Element <i32, v4i32, 1, sub1>; +def : Insert_Element <i32, v4i32, 2, sub2>; +def : Insert_Element <i32, v4i32, 3, sub3>; + +def : Extract_Element <f32, v2f32, 0, sub0>; +def : Extract_Element <f32, v2f32, 1, sub1>; + +def : Insert_Element <f32, v2f32, 0, sub0>; +def : Insert_Element <f32, v2f32, 1, sub1>; + +def : Extract_Element <i32, v2i32, 0, sub0>; +def : Extract_Element <i32, v2i32, 1, sub1>; + +def : Insert_Element <i32, v2i32, 0, sub0>; +def : Insert_Element <i32, v2i32, 1, sub1>; + +// bitconvert patterns + +def : BitConvert <i32, f32, R600_Reg32>; +def : BitConvert <f32, i32, R600_Reg32>; +def : BitConvert <v2f32, v2i32, R600_Reg64>; +def : BitConvert <v2i32, v2f32, R600_Reg64>; +def : BitConvert <v4f32, v4i32, R600_Reg128>; +def : BitConvert <v4i32, v4f32, R600_Reg128>; + +// DWORDADDR pattern +def : DwordAddrPat <i32, R600_Reg32>; + +} // End isR600toCayman Predicate + +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>; +defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>; +} // End isR600 + +def getLDSNoRetOp : InstrMapping { + let FilterClass = "R600_LDS_1A1D"; + let RowFields = ["BaseOp"]; + let ColFields = ["DisableEncoding"]; + let KeyCol = ["$dst"]; + let ValueCols = [[""""]]; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td new file mode 100644 index 0000000..9681747 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td @@ -0,0 +1,75 @@ +//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// R600 Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "R600", isTarget = 1 in { + class TextureIntrinsicFloatInput : + Intrinsic<[llvm_v4f32_ty], [ + llvm_v4f32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + class TextureIntrinsicInt32Input : + Intrinsic<[llvm_v4i32_ty], [ + llvm_v4i32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + + def int_R600_load_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_const : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_R600_interp_xy : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; +def int_R600_interp_zw : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_R600_load_texbuf : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_tex : TextureIntrinsicFloatInput; + def int_R600_texc : TextureIntrinsicFloatInput; + def int_R600_txl : TextureIntrinsicFloatInput; + def int_R600_txlc : TextureIntrinsicFloatInput; + def int_R600_txb : TextureIntrinsicFloatInput; + def int_R600_txbc : TextureIntrinsicFloatInput; + def int_R600_txf : TextureIntrinsicInt32Input; + def int_R600_ldptr : TextureIntrinsicInt32Input; + def int_R600_txq : TextureIntrinsicInt32Input; + def int_R600_ddx : TextureIntrinsicFloatInput; + def int_R600_ddy : TextureIntrinsicFloatInput; + def int_R600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_stream_output : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_pixel_depth : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_pixel_stencil : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_dummy : + Intrinsic<[], [llvm_i32_ty], []>; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp new file mode 100644 index 0000000..01105c6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -0,0 +1,20 @@ +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + + +// Pin the vtable to this file. 
+void R600MachineFunctionInfo::anchor() {} + +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) + : AMDGPUMachineFunction(MF) { } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h new file mode 100644 index 0000000..f5556c1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -0,0 +1,34 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H + +#include "AMDGPUMachineFunction.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include <vector> + +namespace llvm { + +class R600MachineFunctionInfo : public AMDGPUMachineFunction { + void anchor() override; +public: + R600MachineFunctionInfo(const MachineFunction &MF); + SmallVector<unsigned, 4> LiveOuts; + std::vector<unsigned> IndirectRegs; + unsigned StackSize; +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp new file mode 100644 index 0000000..bcde5fb --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -0,0 +1,469 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "R600MachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { + assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); + DAG = static_cast<ScheduleDAGMILive*>(dag); + const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>(); + TII = static_cast<const R600InstrInfo*>(DAG->TII); + TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); + VLIW5 = !ST.hasCaymanISA(); + MRI = &DAG->MRI; + CurInstKind = IDOther; + CurEmitted = 0; + OccupedSlotsMask = 31; + InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); + InstKindLimit[IDOther] = 32; + InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); + AluInstCount = 0; + FetchInstCount = 0; +} + +void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc, + std::vector<SUnit *> &QDst) +{ + QDst.insert(QDst.end(), QSrc.begin(), QSrc.end()); + QSrc.clear(); +} + +static +unsigned getWFCountLimitedByGPR(unsigned GPRCount) { + assert (GPRCount && "GPRCount cannot be 0"); + return 248 / GPRCount; +} + +SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { + SUnit *SU = nullptr; + NextInstKind = IDOther; + + IsTopNode = false; + + // check if we might want to switch current clause type + bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || + (Available[CurInstKind].empty()); + bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && + (!Available[IDFetch].empty() || !Available[IDOther].empty()); + + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { + // We use the heuristic provided by AMD Accelerated Parallel Processing + // OpenCL Programming Guide : + // The approx. number of WF that allows TEX inst to hide ALU inst is : + // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) + float ALUFetchRationEstimate = + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / + (FetchInstCount + Available[IDFetch].size()); + if (ALUFetchRationEstimate == 0) { + AllowSwitchFromAlu = true; + } else { + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going too use too many GPR, we flush Fetch instruction to lower + // register pressure on 128 bits regs. 
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } + } + + if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || + (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { + // try to pick ALU + SU = pickAlu(); + if (!SU && !PhysicalRegCopy.empty()) { + SU = PhysicalRegCopy.front(); + PhysicalRegCopy.erase(PhysicalRegCopy.begin()); + } + if (SU) { + if (CurEmitted >= InstKindLimit[IDAlu]) + CurEmitted = 0; + NextInstKind = IDAlu; + } + } + + if (!SU) { + // try to pick FETCH + SU = pickOther(IDFetch); + if (SU) + NextInstKind = IDFetch; + } + + // try to pick other + if (!SU) { + SU = pickOther(IDOther); + if (SU) + NextInstKind = IDOther; + } + + DEBUG( + if (SU) { + dbgs() << " ** Pick node **\n"; + SU->dump(DAG); + } else { + dbgs() << "NO NODE \n"; + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + } + ); + + return SU; +} + +void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + if (NextInstKind != CurInstKind) { + DEBUG(dbgs() << "Instruction Type Switch\n"); + if (NextInstKind != IDAlu) + OccupedSlotsMask |= 31; + CurEmitted = 0; + CurInstKind = NextInstKind; + } + + if (CurInstKind == IDAlu) { + AluInstCount ++; + switch (getAluKind(SU)) { + case AluT_XYZW: + CurEmitted += 4; + break; + case AluDiscarded: + break; + default: { + ++CurEmitted; + for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), + E = SU->getInstr()->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++CurEmitted; + } + } + } + } else { + ++CurEmitted; + } + + + DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + + if (CurInstKind != IDFetch) { + MoveUnits(Pending[IDFetch], Available[IDFetch]); + } else + FetchInstCount++; +} + +static bool +isPhysicalRegCopy(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::COPY) + return false; + + return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); +} + +void R600SchedStrategy::releaseTopNode(SUnit *SU) { + DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); +} + +void R600SchedStrategy::releaseBottomNode(SUnit *SU) { + DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); + if (isPhysicalRegCopy(SU->getInstr())) { + PhysicalRegCopy.push_back(SU); + return; + } + + int IK = getInstKind(SU); + + // There is no export clause, we can schedule one as soon as its ready + if (IK == IDOther) + Available[IDOther].push_back(SU); + else + Pending[IK].push_back(SU); + +} + +bool R600SchedStrategy::regBelongsToClass(unsigned Reg, + const TargetRegisterClass *RC) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->contains(Reg); + } else { + return MRI->getRegClass(Reg) == RC; + } +} + +R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { + MachineInstr *MI = SU->getInstr(); + + if (TII->isTransOnly(MI)) + return AluTrans; + + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + return AluPredX; + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return AluT_XYZW; + case AMDGPU::COPY: + if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't considers it in scheduling + return AluDiscarded; + } + default: + break; + } + + // Does the instruction take a whole IG ? 
+ // XXX: Is it possible to add a helper function in R600InstrInfo that can + // be used here and in R600PacketizerList::isSoloInstruction() ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + return AluT_XYZW; + } + + if (TII->isLDSInstr(MI->getOpcode())) { + return AluT_X; + } + + // Is the result already assigned to a channel ? + unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + + // Is the result already member of a X/Y/Z/W class ? + unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + // LDS src registers cannot be used in the Trans slot. + if (TII->readsLDSSrcReg(MI)) + return AluT_XYZW; + + return AluAny; + +} + +int R600SchedStrategy::getInstKind(SUnit* SU) { + int Opcode = SU->getInstr()->getOpcode(); + + if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) + return IDFetch; + + if (TII->isALUInstr(Opcode)) { + return IDAlu; + } + + switch (Opcode) { + case AMDGPU::PRED_X: + case AMDGPU::COPY: + case AMDGPU::CONST_COPY: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return IDAlu; + default: + return IDOther; + } +} + +SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { + if (Q.empty()) + return nullptr; + for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend(); + It != E; ++It) { + SUnit *SU = *It; + InstructionsGroupCandidate.push_back(SU->getInstr()); + if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) + && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) + ) { + InstructionsGroupCandidate.pop_back(); + Q.erase((It + 1).base()); + return SU; + } else { + InstructionsGroupCandidate.pop_back(); + } + } + return nullptr; +} + +void R600SchedStrategy::LoadAlu() { + std::vector<SUnit *> &QSrc = Pending[IDAlu]; + for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { + AluKind AK = getAluKind(QSrc[i]); + AvailableAlus[AK].push_back(QSrc[i]); + } + QSrc.clear(); +} + +void R600SchedStrategy::PrepareNextSlot() { + DEBUG(dbgs() << "New Slot\n"); + assert (OccupedSlotsMask && "Slot wasn't filled"); + OccupedSlotsMask = 0; +// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) +// OccupedSlotsMask |= 16; + InstructionsGroupCandidate.clear(); + LoadAlu(); +} + +void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { + int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (DstIndex == -1) { + return; + } + unsigned DestReg = MI->getOperand(DstIndex).getReg(); + // PressureRegister crashes if an operand is def and used in the same inst + // and we try to constraint its regclass + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && !MO.isDef() && + MO.getReg() == 
DestReg) + return; + } + // Constrains the regclass of DestReg to assign it to Slot + switch (Slot) { + case 0: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + break; + case 1: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + break; + case 2: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + break; + case 3: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + break; + } +} + +SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { + static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; + SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu); + if (SlotedSU) + return SlotedSU; + SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu); + if (UnslotedSU) + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; +} + +unsigned R600SchedStrategy::AvailablesAluCount() const { + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + + AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + + AvailableAlus[AluPredX].size(); +} + +SUnit* R600SchedStrategy::pickAlu() { + while (AvailablesAluCount() || !Pending[IDAlu].empty()) { + if (!OccupedSlotsMask) { + // Bottom up scheduling : predX must comes first + if (!AvailableAlus[AluPredX].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluPredX], false); + } + // Flush physical reg copies (RA will discard them) + if (!AvailableAlus[AluDiscarded].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluDiscarded], false); + } + // If there is a T_XYZW alu available, use it + if (!AvailableAlus[AluT_XYZW].empty()) { + OccupedSlotsMask |= 15; + return PopInst(AvailableAlus[AluT_XYZW], false); + } + } + bool TransSlotOccuped = OccupedSlotsMask & 16; + if (!TransSlotOccuped && VLIW5) { + if (!AvailableAlus[AluTrans].empty()) { + OccupedSlotsMask |= 16; + return PopInst(AvailableAlus[AluTrans], false); + } + SUnit *SU = AttemptFillSlot(3, true); + if (SU) { + OccupedSlotsMask |= 16; + return SU; + } + } + for (int Chan = 3; Chan > -1; --Chan) { + bool isOccupied = OccupedSlotsMask & (1 << Chan); + if (!isOccupied) { + SUnit *SU = AttemptFillSlot(Chan, false); + if (SU) { + OccupedSlotsMask |= (1 << Chan); + InstructionsGroupCandidate.push_back(SU->getInstr()); + return SU; + } + } + } + PrepareNextSlot(); + } + return nullptr; +} + +SUnit* R600SchedStrategy::pickOther(int QID) { + SUnit *SU = nullptr; + std::vector<SUnit *> &AQ = Available[QID]; + + if (AQ.empty()) { + MoveUnits(Pending[QID], AQ); + } + if (!AQ.empty()) { + SU = AQ.back(); + AQ.resize(AQ.size() - 1); + } + return SU; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h new file mode 100644 index 0000000..fc5b95c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h @@ -0,0 +1,103 @@ +//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H + +#include "R600InstrInfo.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace llvm { + +class R600SchedStrategy : public MachineSchedStrategy { + + const ScheduleDAGMILive *DAG; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + MachineRegisterInfo *MRI; + + enum InstKind { + IDAlu, + IDFetch, + IDOther, + IDLast + }; + + enum AluKind { + AluAny, + AluT_X, + AluT_Y, + AluT_Z, + AluT_W, + AluT_XYZW, + AluPredX, + AluTrans, + AluDiscarded, // LLVM Instructions that are going to be eliminated + AluLast + }; + + std::vector<SUnit *> Available[IDLast], Pending[IDLast]; + std::vector<SUnit *> AvailableAlus[AluLast]; + std::vector<SUnit *> PhysicalRegCopy; + + InstKind CurInstKind; + int CurEmitted; + InstKind NextInstKind; + + unsigned AluInstCount; + unsigned FetchInstCount; + + int InstKindLimit[IDLast]; + + int OccupedSlotsMask; + +public: + R600SchedStrategy() : + DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { + } + + virtual ~R600SchedStrategy() {} + + void initialize(ScheduleDAGMI *dag) override; + SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void releaseTopNode(SUnit *SU) override; + void releaseBottomNode(SUnit *SU) override; + +private: + std::vector<MachineInstr *> InstructionsGroupCandidate; + bool VLIW5; + + int getInstKind(SUnit *SU); + bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; + AluKind getAluKind(SUnit *SU) const; + void LoadAlu(); + unsigned AvailablesAluCount() const; + SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu); + void PrepareNextSlot(); + SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU); + + void AssignSlot(MachineInstr *MI, unsigned Slot); + SUnit* pickAlu(); + SUnit* pickOther(int QID); + void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst); +}; + +} // namespace llvm + +#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp new file mode 100644 index 0000000..a1a1b40 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -0,0 +1,382 @@ +//===--------------------- R600MergeVectorRegisters.cpp -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass merges inputs of swizzeable instructions into vector sharing +/// common data and/or have enough undef subreg using swizzle abilities. +/// +/// For instance let's consider the following pseudo code : +/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... +/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 +/// +/// is turned into : +/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... 
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 +/// +/// This allow regalloc to reduce register pressure for vector registers and +/// to reduce MOV count. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "vec-merger" + +namespace { + +static bool +isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { + for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), + E = MRI.def_instr_end(); It != E; ++It) { + return (*It).isImplicitDef(); + } + if (MRI.isReserved(Reg)) { + return false; + } + llvm_unreachable("Reg without a def"); + return false; +} + +class RegSeqInfo { +public: + MachineInstr *Instr; + DenseMap<unsigned, unsigned> RegToChan; + std::vector<unsigned> UndefReg; + RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { + assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); + for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { + MachineOperand &MO = Instr->getOperand(i); + unsigned Chan = Instr->getOperand(i + 1).getImm(); + if (isImplicitlyDef(MRI, MO.getReg())) + UndefReg.push_back(Chan); + else + RegToChan[MO.getReg()] = Chan; + } + } + RegSeqInfo() {} + + bool operator==(const RegSeqInfo &RSI) const { + return RSI.Instr == Instr; + } +}; + +class R600VectorRegMerger : public MachineFunctionPass { +private: + MachineRegisterInfo *MRI; + const R600InstrInfo *TII; + bool canSwizzle(const MachineInstr &) const; + bool areAllUsesSwizzeable(unsigned Reg) const; + void SwizzleInput(MachineInstr &, + const std::vector<std::pair<unsigned, unsigned> > &) const; + bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, + std::vector<std::pair<unsigned, unsigned> > &Remap) const; + bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan); + bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan); + MachineInstr *RebuildVector(RegSeqInfo *MI, + const RegSeqInfo *BaseVec, + const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const; + void RemoveMI(MachineInstr *); + void trackRSI(const RegSeqInfo &RSI); + + typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap; + DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq; + InstructionSetMap PreviousRegSeqByReg; + InstructionSetMap PreviousRegSeqByUndefCount; +public: + static char ID; + R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Vector Registers Merge Pass"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; + +char 
R600VectorRegMerger::ID = 0; + +bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) + const { + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) + return true; + switch (MI.getOpcode()) { + case AMDGPU::R600_ExportSwz: + case AMDGPU::EG_ExportSwz: + return true; + default: + return false; + } +} + +bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, + RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap) + const { + unsigned CurrentUndexIdx = 0; + for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(), + E = ToMerge->RegToChan.end(); It != E; ++It) { + DenseMap<unsigned, unsigned>::const_iterator PosInUntouched = + Untouched->RegToChan.find((*It).first); + if (PosInUntouched != Untouched->RegToChan.end()) { + Remap.push_back(std::pair<unsigned, unsigned> + ((*It).second, (*PosInUntouched).second)); + continue; + } + if (CurrentUndexIdx >= Untouched->UndefReg.size()) + return false; + Remap.push_back(std::pair<unsigned, unsigned> + ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + } + + return true; +} + +static +unsigned getReassignedChan( + const std::vector<std::pair<unsigned, unsigned> > &RemapChan, + unsigned Chan) { + for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { + if (RemapChan[j].first == Chan) + return RemapChan[j].second; + } + llvm_unreachable("Chan wasn't reassigned"); +} + +MachineInstr *R600VectorRegMerger::RebuildVector( + RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, + const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const { + unsigned Reg = RSI->Instr->getOperand(0).getReg(); + MachineBasicBlock::iterator Pos = RSI->Instr; + MachineBasicBlock &MBB = *Pos->getParent(); + DebugLoc DL = Pos->getDebugLoc(); + + unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); + DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; + std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg; + for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(), + E = RSI->RegToChan.end(); It != E; ++It) { + unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned SubReg = (*It).first; + unsigned Swizzle = (*It).second; + unsigned Chan = getReassignedChan(RemapChan, Swizzle); + + MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), + DstReg) + .addReg(SrcVec) + .addReg(SubReg) + .addImm(Chan); + UpdatedRegToChan[SubReg] = Chan; + std::vector<unsigned>::iterator ChanPos = + std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); + if (ChanPos != UpdatedUndef.end()) + UpdatedUndef.erase(ChanPos); + assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == + UpdatedUndef.end() && + "UpdatedUndef shouldn't contain Chan more than once!"); + DEBUG(dbgs() << " ->"; Tmp->dump();); + (void)Tmp; + SrcVec = DstReg; + } + Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) + .addReg(SrcVec); + DEBUG(dbgs() << " ->"; Pos->dump();); + + DEBUG(dbgs() << " Updating Swizzle:\n"); + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); + SwizzleInput(*It, RemapChan); + DEBUG((*It).dump()); + } + RSI->Instr->eraseFromParent(); + + // Update RSI + RSI->Instr = Pos; + RSI->RegToChan = UpdatedRegToChan; + RSI->UndefReg = UpdatedUndef; + + return Pos; +} + +void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { + for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), + E = 
PreviousRegSeqByReg.end(); It != E; ++It) { + std::vector<MachineInstr *> &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } + for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), + E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { + std::vector<MachineInstr *> &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } +} + +void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, + const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const { + unsigned Offset; + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) + Offset = 2; + else + Offset = 3; + for (unsigned i = 0; i < 4; i++) { + unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; + for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { + if (RemapChan[j].first == Swizzle) { + MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + break; + } + } + } +} + +bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + if (!canSwizzle(*It)) + return false; + } + return true; +} + +bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan) { + for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), + MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { + if (!MOp->isReg()) + continue; + if (PreviousRegSeqByReg[MOp->getReg()].empty()) + continue; + for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { + CompatibleRSI = PreviousRegSeq[MI]; + if (RSI == CompatibleRSI) + continue; + if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) + return true; + } + } + return false; +} + +bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector<std::pair<unsigned, unsigned> > &RemapChan) { + unsigned NeededUndefs = 4 - RSI.UndefReg.size(); + if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) + return false; + std::vector<MachineInstr *> &MIs = + PreviousRegSeqByUndefCount[NeededUndefs]; + CompatibleRSI = PreviousRegSeq[MIs.back()]; + tryMergeVector(&CompatibleRSI, &RSI, RemapChan); + return true; +} + +void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { + for (DenseMap<unsigned, unsigned>::const_iterator + It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { + PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); + } + PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr); + PreviousRegSeq[RSI.Instr] = RSI; +} + +bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo()); + MRI = &(Fn.getRegInfo()); + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock *MB = MBB; + PreviousRegSeq.clear(); + PreviousRegSeqByReg.clear(); + PreviousRegSeqByUndefCount.clear(); + + for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); + MII != MIIE; ++MII) { + MachineInstr *MI = MII; + if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { + if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { + unsigned Reg = MI->getOperand(1).getReg(); + for (MachineRegisterInfo::def_instr_iterator + It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); + It != E; ++It) { + RemoveMI(&(*It)); + } + } + continue; + } + + + RegSeqInfo RSI(*MRI, MI); + + // 
All uses of MI are swizzeable ? + unsigned Reg = MI->getOperand(0).getReg(); + if (!areAllUsesSwizzeable(Reg)) + continue; + + DEBUG (dbgs() << "Trying to optimize "; + MI->dump(); + ); + + RegSeqInfo CandidateRSI; + std::vector<std::pair<unsigned, unsigned> > RemapChan; + DEBUG(dbgs() << "Using common slots...\n";); + if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { + // Remove CandidateRSI mapping + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + DEBUG(dbgs() << "Using free slots...\n";); + RemapChan.clear(); + if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + //Failed to merge + trackRSI(RSI); + } + } + return false; +} + +} // namespace + +llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { + return new R600VectorRegMerger(tm); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp new file mode 100644 index 0000000..deee5bc --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -0,0 +1,408 @@ +//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass implements instructions packetization for R600. It unsets isLast +/// bit of instructions inside a bundle and substitutes src register with +/// PreviousVector when applicable. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "packets" + +namespace { + +class R600Packetizer : public MachineFunctionPass { + +public: + static char ID; + R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Packetizer"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; +char R600Packetizer::ID = 0; + +class R600PacketizerList : public VLIWPacketizerList { + +private: + const R600InstrInfo *TII; + const R600RegisterInfo &TRI; + bool VLIW5; + bool ConsideredInstUsesAlreadyWrittenVectorElement; + + unsigned getSlot(const MachineInstr *MI) const { + return TRI.getHWRegChan(MI->getOperand(0).getReg()); + } + + /// \returns register to PV chan mapping for bundle/single instructions that + /// immediately precedes I. 
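+  /// The returned map is consumed by substitutePV(): a source operand that
+  /// reads a register written by that preceding group is rewritten to the
+  /// matching PV.X/PV.Y/PV.Z/PV.W channel, or to PS for a Trans-slot result.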
+ DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I) + const { + DenseMap<unsigned, unsigned> Result; + I--; + if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) + return Result; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + if (I->isBundle()) + BI++; + int LastDstChan = -1; + do { + bool isTrans = false; + int BISlot = getSlot(BI); + if (LastDstChan >= BISlot) + isTrans = true; + LastDstChan = BISlot; + if (TII->isPredicated(BI)) + continue; + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); + if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) + continue; + int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + if (DstIdx == -1) { + continue; + } + unsigned Dst = BI->getOperand(DstIdx).getReg(); + if (isTrans || TII->isTransOnly(BI)) { + Result[Dst] = AMDGPU::PS; + continue; + } + if (BI->getOpcode() == AMDGPU::DOT4_r600 || + BI->getOpcode() == AMDGPU::DOT4_eg) { + Result[Dst] = AMDGPU::PV_X; + continue; + } + if (Dst == AMDGPU::OQAP) { + continue; + } + unsigned PVReg = 0; + switch (TRI.getHWRegChan(Dst)) { + case 0: + PVReg = AMDGPU::PV_X; + break; + case 1: + PVReg = AMDGPU::PV_Y; + break; + case 2: + PVReg = AMDGPU::PV_Z; + break; + case 3: + PVReg = AMDGPU::PV_W; + break; + default: + llvm_unreachable("Invalid Chan"); + } + Result[Dst] = PVReg; + } while ((++BI)->isBundledWithPred()); + return Result; + } + + void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs) + const { + unsigned Ops[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0) + continue; + unsigned Src = MI->getOperand(OperandIdx).getReg(); + const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); + if (It != PVs.end()) + MI->getOperand(OperandIdx).setReg(It->second); + } + } +public: + // Ctor. + R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + : VLIWPacketizerList(MF, MLI, true), + TII(static_cast<const R600InstrInfo *>( + MF.getSubtarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { + VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); + } + + // initPacketizerState - initialize some internal flags. + void initPacketizerState() override { + ConsideredInstUsesAlreadyWrittenVectorElement = false; + } + + // ignorePseudoInstruction - Ignore bundling of pseudo instructions. + bool ignorePseudoInstruction(MachineInstr *MI, + MachineBasicBlock *MBB) override { + return false; + } + + // isSoloInstruction - return true if instruction MI can not be packetized + // with any other instruction, which means that MI itself is a packet. + bool isSoloInstruction(MachineInstr *MI) override { + if (TII->isVector(*MI)) + return true; + if (!TII->isALUInstr(MI->getOpcode())) + return true; + if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) + return true; + // XXX: This can be removed once the packetizer properly handles all the + // LDS instruction group restrictions. + if (TII->isLDSInstr(MI->getOpcode())) + return true; + return false; + } + + // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ + // together. + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { + MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); + if (getSlot(MII) == getSlot(MIJ)) + ConsideredInstUsesAlreadyWrittenVectorElement = true; + // Does MII and MIJ share the same pred_sel ? 
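+    // Instructions bundled into one packet must use the same predicate
+    // select; e.g. an instruction guarded by PRED_SEL_ZERO and one guarded by
+    // PRED_SEL_ONE (an unpredicated one is folded to 0 below) may not be
+    // packetized together.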
+ int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); + unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + if (PredI != PredJ) + return false; + if (SUJ->isSucc(SUI)) { + for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { + const SDep &Dep = SUJ->Succs[i]; + if (Dep.getSUnit() != SUI) + continue; + if (Dep.getKind() == SDep::Anti) + continue; + if (Dep.getKind() == SDep::Output) + if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) + continue; + return false; + } + } + + bool ARDef = TII->definesAddressRegister(MII) || + TII->definesAddressRegister(MIJ); + bool ARUse = TII->usesAddressRegister(MII) || + TII->usesAddressRegister(MIJ); + if (ARDef && ARUse) + return false; + + return true; + } + + // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // and SUJ. + bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { + return false; + } + + void setIsLastBit(MachineInstr *MI, unsigned Bit) const { + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); + MI->getOperand(LastOp).setImm(Bit); + } + + bool isBundlableWithCurrentPMI(MachineInstr *MI, + const DenseMap<unsigned, unsigned> &PV, + std::vector<R600InstrInfo::BankSwizzle> &BS, + bool &isTransSlot) { + isTransSlot = TII->isTransOnly(MI); + assert (!isTransSlot || VLIW5); + + // Is the dst reg sequence legal ? + if (!isTransSlot && !CurrentPacketMIs.empty()) { + if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { + if (ConsideredInstUsesAlreadyWrittenVectorElement && + !TII->isVectorOnly(MI) && VLIW5) { + isTransSlot = true; + DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); + } + else + return false; + } + } + + // Are the Constants limitations met ? + CurrentPacketMIs.push_back(MI); + if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Consts read limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // Is there a BankSwizzle set that meet Read Port limitations ? + if (!TII->fitsReadPortLimitations(CurrentPacketMIs, + PV, BS, isTransSlot)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Read port limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // We cannot read LDS source registrs from the Trans slot. + if (isTransSlot && TII->readsLDSSrcReg(MI)) + return false; + + CurrentPacketMIs.pop_back(); + return true; + } + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { + MachineBasicBlock::iterator FirstInBundle = + CurrentPacketMIs.empty() ? 
MI : CurrentPacketMIs.front(); + const DenseMap<unsigned, unsigned> &PV = + getPreviousVector(FirstInBundle); + std::vector<R600InstrInfo::BankSwizzle> BS; + bool isTransSlot; + + if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { + for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { + MachineInstr *MI = CurrentPacketMIs[i]; + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS[i]); + } + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS.back()); + if (!CurrentPacketMIs.empty()) + setIsLastBit(CurrentPacketMIs.back(), 0); + substitutePV(MI, PV); + MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); + if (isTransSlot) { + endPacket(std::next(It)->getParent(), std::next(It)); + } + return It; + } + endPacket(MI->getParent(), MI); + if (TII->isTransOnly(MI)) + return MI; + return VLIWPacketizerList::addToPacket(MI); + } +}; + +bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { + const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + + // Instantiate the packetizer. + R600PacketizerList Packetizer(Fn, MLI); + + // DFA state table should not be empty. + assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + + // + // Loop over all basic blocks and remove KILL pseudo-instructions + // These instructions confuse the dependence analysis. Consider: + // D0 = ... (Insn 0) + // R0 = KILL R0, D0 (Insn 1) + // R0 = ... (Insn 2) + // Here, Insn 1 will result in the dependence graph not emitting an output + // dependence between Insn 0 and Insn 2. This can lead to incorrect + // packetization + // + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock::iterator End = MBB->end(); + MachineBasicBlock::iterator MI = MBB->begin(); + while (MI != End) { + if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || + (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { + MachineBasicBlock::iterator DeleteMI = MI; + ++MI; + MBB->erase(DeleteMI); + End = MBB->end(); + continue; + } + ++MI; + } + } + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + // Find scheduling regions and schedule / packetize each region. + unsigned RemainingCount = MBB->size(); + for(MachineBasicBlock::iterator RegionEnd = MBB->end(); + RegionEnd != MBB->begin();) { + // The next region starts above the previous region. Look backward in the + // instruction stream until we find the nearest boundary. + MachineBasicBlock::iterator I = RegionEnd; + for(;I != MBB->begin(); --I, --RemainingCount) { + if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + break; + } + I = MBB->begin(); + + // Skip empty scheduling regions. + if (I == RegionEnd) { + RegionEnd = std::prev(RegionEnd); + --RemainingCount; + continue; + } + // Skip regions with one instruction. 
+ if (I == std::prev(RegionEnd)) { + RegionEnd = std::prev(RegionEnd); + continue; + } + + Packetizer.PacketizeMIs(MBB, I, RegionEnd); + RegionEnd = I; + } + } + + return true; + +} + +} // end anonymous namespace + +llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { + return new R600Packetizer(tm); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp new file mode 100644 index 0000000..fb0359c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -0,0 +1,91 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "R600RegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { + RCW.RegWeight = 0; + RCW.WeightLimit = 0; +} + +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + Reserved.set(AMDGPU::ZERO); + Reserved.set(AMDGPU::HALF); + Reserved.set(AMDGPU::ONE); + Reserved.set(AMDGPU::ONE_INT); + Reserved.set(AMDGPU::NEG_HALF); + Reserved.set(AMDGPU::NEG_ONE); + Reserved.set(AMDGPU::PV_X); + Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::ALU_CONST); + Reserved.set(AMDGPU::PREDICATE_BIT); + Reserved.set(AMDGPU::PRED_SEL_OFF); + Reserved.set(AMDGPU::PRED_SEL_ZERO); + Reserved.set(AMDGPU::PRED_SEL_ONE); + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + + TII->reserveIndirectRegisters(Reserved, MF); + + return Reserved; +} + +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { + return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const { + return GET_REG_INDEX(getEncodingValue(Reg)); +} + +const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + } +} + +const RegClassWeight &R600RegisterInfo::getRegClassWeight( + const TargetRegisterClass *RC) const { + return RCW; +} + +bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + switch (Reg) { + case AMDGPU::OQAP: + case AMDGPU::OQBP: + case AMDGPU::AR_X: + return false; + default: + return true; + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h new file mode 100644 index 0000000..9713e60 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -0,0 +1,49 @@ +//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois 
Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600RegisterInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H + +#include "AMDGPURegisterInfo.h" + +namespace llvm { + +class AMDGPUSubtarget; + +struct R600RegisterInfo : public AMDGPURegisterInfo { + RegClassWeight RCW; + + R600RegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + /// \brief get the HW encoding for a register's channel. + unsigned getHWRegChan(unsigned reg) const; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + + const RegClassWeight & + getRegClassWeight(const TargetRegisterClass *RC) const override; + + // \returns true if \p Reg can be defined in one ALU caluse and used in another. + bool isPhysRegLiveAcrossClauses(unsigned Reg) const; +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td new file mode 100644 index 0000000..cc667d9 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td @@ -0,0 +1,252 @@ + +class R600Reg <string name, bits<16> encoding> : Register<name> { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +class R600RegWithChan <string name, bits<9> sel, string chan> : + Register <name> { + + field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, + !if(!eq(chan, "Y"), 1, + !if(!eq(chan, "Z"), 2, + !if(!eq(chan, "W"), 3, 0)))); + let HWEncoding{8-0} = sel; + let HWEncoding{10-9} = chan_encoding; + let Namespace = "AMDGPU"; +} + +class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : + RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1, sub2, sub3]; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> : + RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)], + lo +>; + +foreach Index = 0-127 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; + + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + } + // 128-bit Temporary Registers + def T#Index#_XYZW : R600Reg_128 <"T"#Index#"", + [!cast<Register>("T"#Index#"_X"), + !cast<Register>("T"#Index#"_Y"), + !cast<Register>("T"#Index#"_Z"), + !cast<Register>("T"#Index#"_W")], + Index>; + + def T#Index#_XY : R600Reg_64 <"T"#Index#"", + [!cast<Register>("T"#Index#"_X"), + !cast<Register>("T"#Index#"_Y")], + Index>; +} + +foreach Chan = [ "X", "Y", "Z", "W"] in { + + let chan_encoding = !if(!eq(Chan, "X"), 0, + 
!if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast<Register>("T0_"#Chan), + !cast<Register>("T1_"#Chan), + !cast<Register>("T2_"#Chan), + !cast<Register>("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } +} + + +// KCACHE_BANK0 +foreach Index = 159-128 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW", + [!cast<Register>("KC0_"#Index#"_X"), + !cast<Register>("KC0_"#Index#"_Y"), + !cast<Register>("KC0_"#Index#"_Z"), + !cast<Register>("KC0_"#Index#"_W")], + Index>; +} + +// KCACHE_BANK1 +foreach Index = 191-160 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW", + [!cast<Register>("KC1_"#Index#"_X"), + !cast<Register>("KC1_"#Index#"_Y"), + !cast<Register>("KC1_"#Index#"_Z"), + !cast<Register>("KC1_"#Index#"_W")], + Index>; +} + + +// Array Base Register holding input in FS +foreach Index = 448-480 in { + def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; +} + + +// Special Registers + +def OQA : R600Reg<"OQA", 219>; +def OQB : R600Reg<"OQB", 220>; +def OQAP : R600Reg<"OQAP", 221>; +def OQBP : R600Reg<"OQAP", 222>; +def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>; +def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>; +def ZERO : R600Reg<"0.0", 248>; +def ONE : R600Reg<"1.0", 249>; +def NEG_ONE : R600Reg<"-1.0", 249>; +def ONE_INT : R600Reg<"1", 250>; +def HALF : R600Reg<"0.5", 252>; +def NEG_HALF : R600Reg<"-0.5", 252>; +def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; +def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; +def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; +def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; +def PV_X : R600RegWithChan<"PV.X", 254, "X">; +def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">; +def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">; +def PV_W : R600RegWithChan<"PV.W", 254, "W">; +def PS: R600Reg<"PS", 255>; +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; + +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "ArrayBase%u", 448, 480))>; +// special registers for ALU src operands +// const buffer reference, SRCx_SEL contains index +def ALU_CONST : R600Reg<"CBuf", 0>; +// interpolation param reference, SRCx_SEL contains index +def ALU_PARAM : R600Reg<"Param", 0>; + +let isAllocatable = 0 in { + +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME if we add more vertical vector registers we will need to ad more +// registers to these classes. 
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; + +def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, + (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; + +def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_X", 128, 159))>; + +def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Y", 128, 159))>; + +def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Z", 128, 159))>; + +def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_W", 128, 159))>; + +def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC0_X, R600_KC0_Y, + R600_KC0_Z, R600_KC0_W)>; + +def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_X", 160, 191))>; + +def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Y", 160, 191))>; + +def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Z", 160, 191))>; + +def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_W", 160, 191))>; + +def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC1_X, R600_KC1_Y, + R600_KC1_Z, R600_KC1_W)>; + +} // End isAllocatable = 0 + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_X", 0, 127), AR_X)>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Y", 0, 127))>; + +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Z", 0, 127))>; + +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_W", 0, 127))>; + +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_TReg32_X, R600_TReg32_Y, + R600_TReg32_Z, R600_TReg32_W)>; + +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add + R600_TReg32, + R600_ArrayBase, + R600_Addr, + R600_KC0, R600_KC1, + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, + ALU_CONST, ALU_PARAM, OQAP + )>; + +def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add + PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; + +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add + PREDICATE_BIT)>; + +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add (sequence "T%u_XYZW", 0, 127))> { + let CopyCost = -1; +} + +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td new file mode 100644 index 0000000..df62bf8 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td @@ -0,0 +1,49 @@ +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 has a VLIW architecture. 
On pre-cayman cards there are 5 instruction +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS +// slot has been removed. +// +//===----------------------------------------------------------------------===// + + +def ALU_X : FuncUnit; +def ALU_Y : FuncUnit; +def ALU_Z : FuncUnit; +def ALU_W : FuncUnit; +def TRANS : FuncUnit; + +def AnyALU : InstrItinClass; +def VecALU : InstrItinClass; +def TransALU : InstrItinClass; +def XALU : InstrItinClass; + +def R600_VLIW5_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], + [], + [ + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>, + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>, + InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>, + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> + ] +>; + +def R600_VLIW4_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], + [], + [ + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>, + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> + ] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp new file mode 100644 index 0000000..93bcf68 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp @@ -0,0 +1,303 @@ +//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass translates tgsi-like texture intrinsics into R600 texture +/// closer to hardware intrinsics. 
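+///
+/// For example, a call to llvm.AMDGPU.tex on a shadow texture target becomes
+/// a call to llvm.R600.texc whose operands carry the swizzled coordinate, the
+/// texel offsets, the resource/sampler ids and the per-component coordinate
+/// types (see ReplaceCallInst below).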
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { +class R600TextureIntrinsicsReplacer : + public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> { + static char ID; + + Module *Mod; + Type *FloatType; + Type *Int32Type; + Type *V4f32Type; + Type *V4i32Type; + FunctionType *TexSign; + FunctionType *TexQSign; + + void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, + unsigned SrcSelect[4], unsigned CT[4], + bool &useShadowVariant) { + enum TextureTypes { + TEXTURE_1D = 1, + TEXTURE_2D, + TEXTURE_3D, + TEXTURE_CUBE, + TEXTURE_RECT, + TEXTURE_SHADOW1D, + TEXTURE_SHADOW2D, + TEXTURE_SHADOWRECT, + TEXTURE_1D_ARRAY, + TEXTURE_2D_ARRAY, + TEXTURE_SHADOW1D_ARRAY, + TEXTURE_SHADOW2D_ARRAY, + TEXTURE_SHADOWCUBE, + TEXTURE_2D_MSAA, + TEXTURE_2D_ARRAY_MSAA, + TEXTURE_CUBE_ARRAY, + TEXTURE_SHADOWCUBE_ARRAY + }; + + switch (TextureType) { + case 0: + useShadowVariant = false; + return; + case TEXTURE_RECT: + case TEXTURE_1D: + case TEXTURE_2D: + case TEXTURE_3D: + case TEXTURE_CUBE: + case TEXTURE_1D_ARRAY: + case TEXTURE_2D_ARRAY: + case TEXTURE_CUBE_ARRAY: + case TEXTURE_2D_MSAA: + case TEXTURE_2D_ARRAY_MSAA: + useShadowVariant = false; + break; + case TEXTURE_SHADOW1D: + case TEXTURE_SHADOW2D: + case TEXTURE_SHADOWRECT: + case TEXTURE_SHADOW1D_ARRAY: + case TEXTURE_SHADOW2D_ARRAY: + case TEXTURE_SHADOWCUBE: + case TEXTURE_SHADOWCUBE_ARRAY: + useShadowVariant = true; + break; + default: + llvm_unreachable("Unknow Texture Type"); + } + + if (TextureType == TEXTURE_RECT || + TextureType == TEXTURE_SHADOWRECT) { + CT[0] = 0; + CT[1] = 0; + } + + if (TextureType == TEXTURE_CUBE_ARRAY || + TextureType == TEXTURE_SHADOWCUBE_ARRAY) + CT[2] = 0; + + if (TextureType == TEXTURE_1D_ARRAY || + TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (hasLOD && useShadowVariant) { + CT[1] = 0; + } else { + CT[2] = 0; + SrcSelect[2] = 1; + } + } else if (TextureType == TEXTURE_2D_ARRAY || + TextureType == TEXTURE_SHADOW2D_ARRAY) { + CT[2] = 0; + } + + if ((TextureType == TEXTURE_SHADOW1D || + TextureType == TEXTURE_SHADOW2D || + TextureType == TEXTURE_SHADOWRECT || + TextureType == TEXTURE_SHADOW1D_ARRAY) && + !(hasLOD && useShadowVariant)) + SrcSelect[3] = 2; + } + + void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, + unsigned SrcSelect[4], Value *Offset[3], Value *Resource, + Value *Sampler, unsigned CT[4], Value *Coord) { + IRBuilder<> Builder(&I); + Constant *Mask[] = { + ConstantInt::get(Int32Type, SrcSelect[0]), + ConstantInt::get(Int32Type, SrcSelect[1]), + ConstantInt::get(Int32Type, SrcSelect[2]), + ConstantInt::get(Int32Type, SrcSelect[3]) + }; + Value *SwizzleMask = ConstantVector::get(Mask); + Value *SwizzledCoord = + Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); + + Value *Args[] = { + SwizzledCoord, + Offset[0], + Offset[1], + Offset[2], + Resource, + Sampler, + ConstantInt::get(Int32Type, CT[0]), + ConstantInt::get(Int32Type, CT[1]), + ConstantInt::get(Int32Type, CT[2]), + ConstantInt::get(Int32Type, CT[3]) + }; + + Function *F = Mod->getFunction(Name); + if (!F) { + F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); + F->addFnAttr(Attribute::ReadNone); + } + I.replaceAllUsesWith(Builder.CreateCall(F, Args)); + I.eraseFromParent(); + } + + void 
ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, + const char *VanillaInt, + const char *ShadowInt) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(1); + Value *SamplerId = I.getArgOperand(2); + + unsigned TextureType = + cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0) + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + + void ReplaceTXF(CallInst &I) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(4); + Value *SamplerId = I.getArgOperand(5); + + unsigned TextureType = + cast<ConstantInt>(I.getArgOperand(6))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + I.getArgOperand(1), + I.getArgOperand(2), + I.getArgOperand(3), + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + +public: + R600TextureIntrinsicsReplacer(): + FunctionPass(ID) { + } + + bool doInitialization(Module &M) override { + LLVMContext &Ctx = M.getContext(); + Mod = &M; + FloatType = Type::getFloatTy(Ctx); + Int32Type = Type::getInt32Ty(Ctx); + V4f32Type = VectorType::get(FloatType, 4); + V4i32Type = VectorType::get(Int32Type, 4); + Type *ArgsType[] = { + V4f32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); + Type *ArgsQType[] = { + V4i32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); + return false; + } + + bool runOnFunction(Function &F) override { + visit(F); + return false; + } + + const char *getPassName() const override { + return "R600 Texture Intrinsics Replacer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + } + + void visitCallInst(CallInst &I) { + if (!I.getCalledFunction()) + return; + + StringRef Name = I.getCalledFunction()->getName(); + if (Name == "llvm.AMDGPU.tex") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); + return; + } + if (Name == "llvm.AMDGPU.txl") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc"); + return; + } + if (Name == "llvm.AMDGPU.txb") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); + return; + } + if (Name == "llvm.AMDGPU.txf") { + ReplaceTXF(I); + return; + } + if (Name == "llvm.AMDGPU.txq") { + ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); + return; + } + if (Name == "llvm.AMDGPU.ddx") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); + return; + } + if (Name == "llvm.AMDGPU.ddy") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); + return; + } + } + +}; + +char R600TextureIntrinsicsReplacer::ID = 0; + +} // namespace + +FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { + 
return new R600TextureIntrinsicsReplacer(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td new file mode 100644 index 0000000..613a0d7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td @@ -0,0 +1,21 @@ +//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to R700 and newer VLIW4/VLIW5 GPUs +// - Available only on R700 family GPUs. +// +//===----------------------------------------------------------------------===// + +def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; + +let Predicates = [isR700] in { + def SIN_r700 : SIN_Common<0x6E>; + def COS_r700 : COS_Common<0x6F>; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp new file mode 100644 index 0000000..ccfbf1b --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -0,0 +1,365 @@ +//===-- SIAnnotateControlFlow.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Annotates the control flow with hardware specific intrinsics. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-annotate-control-flow" + +namespace { + +// Complex types used in this pass +typedef std::pair<BasicBlock *, Value *> StackEntry; +typedef SmallVector<StackEntry, 16> StackVector; + +// Intrinsic names the control flow is annotated with +static const char *const IfIntrinsic = "llvm.SI.if"; +static const char *const ElseIntrinsic = "llvm.SI.else"; +static const char *const BreakIntrinsic = "llvm.SI.break"; +static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; +static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; +static const char *const LoopIntrinsic = "llvm.SI.loop"; +static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; + +class SIAnnotateControlFlow : public FunctionPass { + + static char ID; + + Type *Boolean; + Type *Void; + Type *Int64; + Type *ReturnStruct; + + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + Constant *Int64Zero; + + Constant *If; + Constant *Else; + Constant *Break; + Constant *IfBreak; + Constant *ElseBreak; + Constant *Loop; + Constant *EndCf; + + DominatorTree *DT; + StackVector Stack; + + LoopInfo *LI; + + bool isTopOfStack(BasicBlock *BB); + + Value *popSaved(); + + void push(BasicBlock *BB, Value *Saved); + + bool isElse(PHINode *Phi); + + void eraseIfUnused(PHINode *Phi); + + void openIf(BranchInst *Term); + + void insertElse(BranchInst *Term); 
+ + Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); + + void handleLoop(BranchInst *Term); + + void closeControlFlow(BasicBlock *BB); + +public: + SIAnnotateControlFlow(): + FunctionPass(ID) { } + + bool doInitialization(Module &M) override; + + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "SI annotate control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char SIAnnotateControlFlow::ID = 0; + +/// \brief Initialize all the types and constants used in the pass +bool SIAnnotateControlFlow::doInitialization(Module &M) { + LLVMContext &Context = M.getContext(); + + Void = Type::getVoidTy(Context); + Boolean = Type::getInt1Ty(Context); + Int64 = Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + Int64Zero = ConstantInt::get(Int64, 0); + + If = M.getOrInsertFunction( + IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); + + Else = M.getOrInsertFunction( + ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); + + Break = M.getOrInsertFunction( + BreakIntrinsic, Int64, Int64, (Type *)nullptr); + + IfBreak = M.getOrInsertFunction( + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); + + ElseBreak = M.getOrInsertFunction( + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); + + Loop = M.getOrInsertFunction( + LoopIntrinsic, Boolean, Int64, (Type *)nullptr); + + EndCf = M.getOrInsertFunction( + EndCfIntrinsic, Void, Int64, (Type *)nullptr); + + return false; +} + +/// \brief Is BB the last block saved on the stack ? +bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { + return !Stack.empty() && Stack.back().first == BB; +} + +/// \brief Pop the last saved value from the control flow stack +Value *SIAnnotateControlFlow::popSaved() { + return Stack.pop_back_val().second; +} + +/// \brief Push a BB and saved value to the control flow stack +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { + Stack.push_back(std::make_pair(BB, Saved)); +} + +/// \brief Can the condition represented by this PHI node treated like +/// an "Else" block? 
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) { + BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + if (Phi->getIncomingBlock(i) == IDom) { + + if (Phi->getIncomingValue(i) != BoolTrue) + return false; + + } else { + if (Phi->getIncomingValue(i) != BoolFalse) + return false; + + } + } + return true; +} + +// \brief Erase "Phi" if it is not used any more +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + if (!Phi->hasNUsesOrMore(1)) + Phi->eraseFromParent(); +} + +/// \brief Open a new "If" block +void SIAnnotateControlFlow::openIf(BranchInst *Term) { + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Close the last "If" block and open a new "Else" block +void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + Value *Ret = CallInst::Create(Else, popSaved(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Recursively handle the condition leading to a loop +Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, + llvm::Loop *L) { + + // Only search through PHI nodes which are inside the loop. If we try this + // with PHI nodes that are outside of the loop, we end up inserting new PHI + // nodes outside of the loop which depend on values defined inside the loop. + // This will break the module with + // 'Instruction does not dominate all users!' errors. + PHINode *Phi = nullptr; + if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) { + + BasicBlock *Parent = Phi->getParent(); + PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + Value *Ret = NewPhi; + + // Handle all non-constant incoming values first + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *From = Phi->getIncomingBlock(i); + if (isa<ConstantInt>(Incoming)) { + NewPhi->addIncoming(Broken, From); + continue; + } + + Phi->setIncomingValue(i, BoolFalse); + Value *PhiArg = handleLoopCondition(Incoming, Broken, L); + NewPhi->addIncoming(PhiArg, From); + } + + BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); + + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + + Value *Incoming = Phi->getIncomingValue(i); + if (Incoming != BoolTrue) + continue; + + BasicBlock *From = Phi->getIncomingBlock(i); + if (From == IDom) { + CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); + if (OldEnd && OldEnd->getCalledFunction() == EndCf) { + Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; + Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); + continue; + } + } + TerminatorInst *Insert = From->getTerminator(); + Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); + NewPhi->setIncomingValue(i, PhiArg); + } + eraseIfUnused(Phi); + return Ret; + + } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { + BasicBlock *Parent = Inst->getParent(); + Instruction *Insert; + if (L->contains(Inst)) { + Insert = Parent->getTerminator(); + } else { + Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); + } + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); + + } else { + llvm_unreachable("Unhandled loop condition!"); + } + 
return 0; +} + +/// \brief Handle a back edge (loop) +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + BasicBlock *BB = Term->getParent(); + llvm::Loop *L = LI->getLoopFor(BB); + BasicBlock *Target = Term->getSuccessor(1); + PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + + Value *Cond = Term->getCondition(); + Term->setCondition(BoolTrue); + Value *Arg = handleLoopCondition(Cond, Broken, L); + + for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); + PI != PE; ++PI) { + + Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); + } + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); + push(Term->getSuccessor(0), Arg); +}/// \brief Close the last opened control flow +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { + llvm::Loop *L = LI->getLoopFor(BB); + + if (L && L->getHeader() == BB) { + // We can't insert an EndCF call into a loop header, because it will + // get executed on every iteration of the loop, when it should be + // executed only once before the loop. + SmallVector <BasicBlock*, 8> Latches; + L->getLoopLatches(Latches); + + std::vector<BasicBlock*> Preds; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) + Preds.push_back(*PI); + } + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, + LI, false); + } + + CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); +} + +/// \brief Annotate the control flow with intrinsics so the backend can +/// recognize if/then/else and loops. +bool SIAnnotateControlFlow::runOnFunction(Function &F) { + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + + for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), + E = df_end(&F.getEntryBlock()); I != E; ++I) { + + BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator()); + + if (!Term || Term->isUnconditional()) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + continue; + } + + if (I.nodeVisited(Term->getSuccessor(1))) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + handleLoop(Term); + continue; + } + + if (isTopOfStack(*I)) { + PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); + if (Phi && Phi->getParent() == *I && isElse(Phi)) { + insertElse(Term); + eraseIfUnused(Phi); + continue; + } + closeControlFlow(*I); + } + openIf(Term); + } + + assert(Stack.empty()); + return true; +} + +/// \brief Create the annotation pass +FunctionPass *llvm::createSIAnnotateControlFlowPass() { + return new SIAnnotateControlFlow(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h new file mode 100644 index 0000000..f1b4ba1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -0,0 +1,172 @@ +//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCInstrDesc.h" + +#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H +#define LLVM_LIB_TARGET_R600_SIDEFINES_H + +namespace SIInstrFlags { +// This needs to be kept in sync with the field bits in InstSI. 
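// For illustration: these bits are carried in MCInstrDesc::TSFlags, so a
// typical query (hypothetical helper, not part of this header) looks like
//
//   static bool isVALUInst(const llvm::MCInstrDesc &Desc) {
//     return Desc.TSFlags & SIInstrFlags::VALU;
//   }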
+enum { + SALU = 1 << 3, + VALU = 1 << 4, + + SOP1 = 1 << 5, + SOP2 = 1 << 6, + SOPC = 1 << 7, + SOPK = 1 << 8, + SOPP = 1 << 9, + + VOP1 = 1 << 10, + VOP2 = 1 << 11, + VOP3 = 1 << 12, + VOPC = 1 << 13, + + MUBUF = 1 << 14, + MTBUF = 1 << 15, + SMRD = 1 << 16, + DS = 1 << 17, + MIMG = 1 << 18, + FLAT = 1 << 19, + WQM = 1 << 20, + VGPRSpill = 1 << 21 +}; +} // namespace SIInstrFlags + +namespace llvm { +namespace AMDGPU { + enum OperandType { + /// Operand with register or 32-bit immediate + OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + /// Operand with register or inline constant + OPERAND_REG_INLINE_C + }; +} +} + +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding + VM_CNT = 1 << 0, + EXP_CNT = 1 << 1, + LGKM_CNT = 1 << 2 + }; + + // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. + // The result is true if any of these tests are true. + enum ClassFlags { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; +} // namespace SIInstrFlags + +namespace SISrcMods { + enum { + NEG = 1 << 0, + ABS = 1 << 1 + }; +} + +namespace SIOutMods { + enum { + NONE = 0, + MUL2 = 1, + MUL4 = 2, + DIV2 = 3 + }; +} + +#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C +#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) +#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 +#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) +#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) +#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) + +#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) +#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define 
C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double +// precision. +#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + +#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 +#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) + +#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 +#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) + + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp new file mode 100644 index 0000000..5fe8d19 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -0,0 +1,96 @@ +//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Spilling of EXEC masks used for control flow messes up control flow +/// lowering, so mark all live intervals associated with CF instructions as +/// non-spillable. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-cf-live-intervals" + +namespace { + +class SIFixControlFlowLiveIntervals : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix CF Live Intervals"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
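// For context (illustrative; the exact TargetPassConfig hook is an assumption
// and is not part of this file): a pass like this is normally scheduled by the
// target shortly before register allocation, using the pass ID declared below,
// e.g.
//
//   addPass(&llvm::SIFixControlFlowLiveIntervalsID);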
+ +INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) + +char SIFixControlFlowLiveIntervals::ID = 0; + +char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; + +FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { + return new SIFixControlFlowLiveIntervals(); +} + +bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::SI_IF: + case AMDGPU::SI_ELSE: + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + case AMDGPU::SI_END_CF: { + unsigned Reg = MI.getOperand(0).getReg(); + LIS->getInterval(Reg).markNotSpillable(); + break; + } + default: + break; + } + } + } + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp new file mode 100644 index 0000000..23502b4 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -0,0 +1,338 @@ +//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Copies from VGPR to SGPR registers are illegal and the register coalescer +/// will sometimes generate these illegal copies in situations like this: +/// +/// Register Class <vsrc> is the union of <vgpr> and <sgpr> +/// +/// BB0: +/// %vreg0 <sgpr> = SCALAR_INST +/// %vreg1 <vsrc> = COPY %vreg0 <sgpr> +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 <vgpr> = VECTOR_INST +/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> +/// BB2: +/// %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1> +/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc> +/// +/// +/// The coalescer will begin at BB0 and eliminate its copy, then the resulting +/// code will look like this: +/// +/// BB0: +/// %vreg0 <sgpr> = SCALAR_INST +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 <vgpr> = VECTOR_INST +/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> +/// BB2: +/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1> +/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> +/// +/// Now that the result of the PHI instruction is an SGPR, the register +/// allocator is now forced to constrain the register class of %vreg3 to +/// <sgpr> so we end up with final code like this: +/// +/// BB0: +/// %vreg0 <sgpr> = SCALAR_INST +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 <vgpr> = VECTOR_INST +/// %vreg3 <sgpr> = COPY %vreg2 <vgpr> +/// BB2: +/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1> +/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> +/// +/// Now this code contains an illegal copy from a VGPR to an SGPR. +/// +/// In order to avoid this problem, this pass searches for PHI instructions +/// which define a <vsrc> register and constrains its definition class to +/// <vgpr> if the user of the PHI's definition register is a vector instruction. 
+/// If the PHI's definition class is constrained to <vgpr> then the coalescer +/// will be unable to perform the COPY removal from the above example which +/// ultimately led to the creation of an illegal COPY. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "sgpr-copies" + +namespace { + +class SIFixSGPRCopies : public MachineFunctionPass { + +private: + static char ID; + const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const; + const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const; + bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const; + +public: + SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR copies"; + } + +}; + +} // End anonymous namespace + +char SIFixSGPRCopies::ID = 0; + +FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { + return new SIFixSGPRCopies(tm); +} + +static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + continue; + + if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) + return true; + } + return false; +} + +/// This functions walks the use list of Reg until it finds an Instruction +/// that isn't a COPY returns the register class of that instruction. +/// \return The register defined by the first non-COPY instruction. +const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const { + + const TargetRegisterClass *RC + = TargetRegisterInfo::isVirtualRegister(Reg) ? 
+ MRI.getRegClass(Reg) : + TRI->getPhysRegClass(Reg); + + RC = TRI->getSubRegClass(RC, SubReg); + for (MachineRegisterInfo::use_instr_iterator + I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { + switch (I->getOpcode()) { + case AMDGPU::COPY: + RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, + I->getOperand(0).getReg(), + I->getOperand(0).getSubReg())); + break; + } + } + + return RC; +} + +const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); + return TRI->getSubRegClass(RC, SubReg); + } + MachineInstr *Def = MRI.getVRegDef(Reg); + if (Def->getOpcode() != AMDGPU::COPY) { + return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); + } + + return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), + Def->getOperand(1).getSubReg()); +} + +bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const { + + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { + // If the destination register is a physical register there isn't really + // much we can do to fix this. + return false; + } + + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + + const TargetRegisterClass *SrcRC; + + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + return false; + + SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); + return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); +} + +bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); + DEBUG(MI.print(dbgs())); + TII->moveToVALU(MI); + + } + + switch (MI.getOpcode()) { + default: continue; + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + const MachineOperand &Op = MI.getOperand(i); + unsigned Reg = Op.getReg(); + const TargetRegisterClass *RC + = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + + MRI.constrainRegClass(Op.getReg(), RC); + } + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, + MI.getOperand(0).getSubReg()); + if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { + MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + } + + if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) + break; + + // If a PHI node defines an SGPR and any of its operands are VGPRs, + // then we need to move it to the VALU. 
+ // + // Also, if a PHI node defines an SGPR and has all SGPR operands + // we must move it to the VALU, because the SGPR operands will + // all end up being assigned the same register, which means + // there is a potential for a conflict if different threads take + // different control flow paths. + // + // For Example: + // + // sgpr0 = def; + // ... + // sgpr1 = def; + // ... + // sgpr2 = PHI sgpr0, sgpr1 + // use sgpr2; + // + // Will Become: + // + // sgpr2 = def; + // ... + // sgpr2 = def; + // ... + // use sgpr2 + // + // FIXME: This is OK if the branching decision is made based on an + // SGPR value. + bool SGPRBranch = false; + + // The one exception to this rule is when one of the operands + // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK + // instruction. In this case, there we know the program will + // never enter the second block (the loop) without entering + // the first block (where the condition is computed), so there + // is no chance for values to be over-written. + + bool HasBreakDef = false; + for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { + unsigned Reg = MI.getOperand(i).getReg(); + if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { + TII->moveToVALU(MI); + break; + } + MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); + assert(DefInstr); + switch(DefInstr->getOpcode()) { + + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + // If we see a PHI instruction that defines an SGPR, then that PHI + // instruction has already been considered and should have + // a *_BREAK as an operand. + case AMDGPU::PHI: + HasBreakDef = true; + break; + } + } + + if (!SGPRBranch && !HasBreakDef) + TII->moveToVALU(MI); + break; + } + case AMDGPU::REG_SEQUENCE: { + if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || + !hasVGPROperands(MI, TRI)) + continue; + + DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); + + TII->moveToVALU(MI); + break; + } + case AMDGPU::INSERT_SUBREG: { + const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; + DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + if (TRI->isSGPRClass(DstRC) && + (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); + TII->moveToVALU(MI); + } + break; + } + } + } + } + + return true; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp new file mode 100644 index 0000000..0c54446 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -0,0 +1,192 @@ +//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// SALU instructions ignore control flow, so we need to modify the live ranges +/// of the registers they define in some cases. +/// +/// The main case we need to handle is when a def is used in one side of a +/// branch and not another. For example: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// +/// Here we need the register allocator to avoid assigning any of the defs +/// inside of the IF to the same register as %def. 
In traditional live +/// interval analysis %def is not live inside the IF branch, however, since +/// SALU instructions inside of IF will be executed even if the branch is not +/// taken, there is the chance that one of the instructions will overwrite the +/// value of %def, so the use in ELSE will see the wrong value. +/// +/// The strategy we use for solving this is to add an extra use after the ENDIF: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// %use +/// +/// Adding this use will make the def live thoughout the IF branch, which is +/// what we want. + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-sgpr-live-ranges" + +namespace { + +class SIFixSGPRLiveRanges : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR live ranges"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) + +char SIFixSGPRLiveRanges::ID = 0; + +char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; + +FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { + return new SIFixSGPRLiveRanges(); +} + +bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + + // First pass, collect all live intervals for SGPRs + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.defs()) { + if (MO.isImplicit()) + continue; + unsigned Def = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Def)) { + if (TRI->isSGPRClass(MRI.getRegClass(Def))) + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getInterval(Def))); + } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getRegUnit(Def))); + } + } + } + } + + // Second pass fix the intervals + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + MachineBasicBlock &MBB = *BI; + if (MBB.succ_size() < 2) + continue; + + // We have structured control flow, so number of succesors 
should be two. + assert(MBB.succ_size() == 2); + MachineBasicBlock *SuccA = *MBB.succ_begin(); + MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); + + if (!NCD) + continue; + + MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); + + if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { + assert(NCD->succ_size() == 2); + // We want to make sure we insert the Use after the ENDIF, not after + // the ELSE. + NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), + *(++NCD->succ_begin())); + } + assert(SuccA && SuccB); + for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { + unsigned Reg = RegLR.first; + LiveRange *LR = RegLR.second; + + // FIXME: We could be smarter here. If the register is Live-In to + // one block, but the other doesn't have any SGPR defs, then there + // won't be a conflict. Also, if the branch decision is based on + // a value in an SGPR, then there will be no conflict. + bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); + bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); + + if ((!LiveInToA && !LiveInToB) || + (LiveInToA && LiveInToB)) + continue; + + // This interval is live in to one successor, but not the other, so + // we need to update its range so it is live in to both. + DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << + " BB#" << SuccA->getNumber() << ", BB#" << + SuccB->getNumber() << + " with NCD = " << NCD->getNumber() << '\n'); + + // FIXME: Need to figure out how to update LiveRange here so this pass + // will be able to preserve LiveInterval analysis. + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + DEBUG(NCD->getFirstNonPHI()->dump()); + } + } + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp new file mode 100644 index 0000000..d14e37a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -0,0 +1,288 @@ +//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-fold-operands" +using namespace llvm; + +namespace { + +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fold Operands"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +struct FoldCandidate { + MachineInstr *UseMI; + unsigned UseOpNo; + MachineOperand *OpToFold; + uint64_t ImmToFold; + + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : + UseMI(MI), UseOpNo(OpNo) { + + if (FoldOp->isImm()) { + OpToFold = nullptr; + ImmToFold = FoldOp->getImm(); + } else { + assert(FoldOp->isReg()); + OpToFold = FoldOp; + } + } + + bool isImm() const { + return !OpToFold; + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) + +char SIFoldOperands::ID = 0; + +char &llvm::SIFoldOperandsID = SIFoldOperands::ID; + +FunctionPass *llvm::createSIFoldOperandsPass() { + return new SIFoldOperands(); +} + +static bool isSafeToFold(unsigned Opcode) { + switch(Opcode) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + +static bool updateOperand(FoldCandidate &Fold, + const TargetRegisterInfo &TRI) { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + assert(Old.isReg()); + + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } + + MachineOperand *New = Fold.OpToFold; + if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && + TargetRegisterInfo::isVirtualRegister(New->getReg())) { + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + return true; + } + + // FIXME: Handle physical registers. + + return false; +} + +static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, + MachineInstr *MI, unsigned OpNo, + MachineOperand *OpToFold, + const SIInstrInfo *TII) { + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + // Operand is not legal, so try to commute the instruction to + // see if this makes it possible to fold. 
+ unsigned CommuteIdx0; + unsigned CommuteIdx1; + bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + + if (CanCommute) { + if (CommuteIdx0 == OpNo) + OpNo = CommuteIdx1; + else if (CommuteIdx1 == OpNo) + OpNo = CommuteIdx0; + } + + if (!CanCommute || !TII->commuteInstruction(MI)) + return false; + + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); + return true; +} + +bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (!isSafeToFold(MI.getOpcode())) + continue; + + unsigned OpSize = TII->getOpSize(MI, 1); + MachineOperand &OpToFold = MI.getOperand(1); + bool FoldingImm = OpToFold.isImm(); + + // FIXME: We could also be folding things like FrameIndexes and + // TargetIndexes. + if (!FoldingImm && !OpToFold.isReg()) + continue; + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && + !MRI.hasOneUse(MI.getOperand(0).getReg())) + continue; + + // FIXME: Fold operands with subregs. + if (OpToFold.isReg() && + (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || + OpToFold.getSubReg())) + continue; + + std::vector<FoldCandidate> FoldList; + for (MachineRegisterInfo::use_iterator + Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); + Use != E; ++Use) { + + MachineInstr *UseMI = Use->getParent(); + const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + continue; + } + + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + // Split 64-bit constants into 32-bits for folding. + if (UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + continue; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. + if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + continue; + + UseMI->setDesc(TII->get(MovOp)); + } + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. 
+ if (UseDesc.isVariadic() || + UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) + continue; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); + continue; + } + + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + } + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, TRI)) { + // Clear kill flags. + if (!Fold.isImm()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + Fold.OpToFold->setIsKill(false); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); + } + } + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp new file mode 100644 index 0000000..12d08cf --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -0,0 +1,2241 @@ +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for SI +// +//===----------------------------------------------------------------------===// + +#ifdef _MSC_VER +// Provide M_PI. +#define _USE_MATH_DEFINES +#include <cmath> +#endif + +#include "SIISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h" + +using namespace llvm; + +SITargetLowering::SITargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI) { + addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); + + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + + addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + setOperationAction(ISD::ADD, 
MVT::i32, Legal); + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Promote); + AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); + } + + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, 
Expand); + + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i1, Custom); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // These should use UDIVREM, so set them to expand + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT, MVT::i1, Promote); + + // We only support LOAD/STORE and vector manipulation ops for vectors + // with > 4 elements. + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + switch(Op) { + case ISD::LOAD: + case ISD::STORE: + case ISD::BUILD_VECTOR: + case ISD::BITCAST: + case ISD::EXTRACT_VECTOR_ELT: + case ISD::INSERT_VECTOR_ELT: + case ISD::INSERT_SUBVECTOR: + case ISD::EXTRACT_SUBVECTOR: + break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; + default: + setOperationAction(Op, VT, Expand); + break; + } + } + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + } + + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Custom); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::SMIN); + setTargetDAGCombine(ISD::SMAX); + setTargetDAGCombine(ISD::UMIN); + setTargetDAGCombine(ISD::UMAX); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::UINT_TO_FP); + + // All memory operations. Some folding on the pointer operand is done to help + // matching the constant offsets in the addressing modes. + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::ATOMIC_LOAD); + setTargetDAGCombine(ISD::ATOMIC_STORE); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + setTargetDAGCombine(ISD::ATOMIC_SWAP); + setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); + setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); + setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); + + setSchedulingPreference(Sched::RegPressure); +} + +//===----------------------------------------------------------------------===// +// TargetLowering queries +//===----------------------------------------------------------------------===// + +bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. Say no + // shuffles are legal in order to prefer scalarizing some vector operations. 
+ return false; +} + +bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty, unsigned AS) const { + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // SMRD instructions have an 8-bit, dword offset. + // + // Assume nonunifom access, since the address space isn't enough to know + // what instruction we will use, and since we don't know if this is a load + // or store and scalar stores are only available on VI. + // + // We also know if we are doing an extload, we can't do a scalar load. + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // Basic, single offset DS instructions allow a 16-bit unsigned immediate + // field. + // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have + // an 8-bit dword offset but we don't know the alignment here. + if (!isUInt<16>(AM.BaseOffs)) + return false; + + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. + return true; + + if (AM.Scale == 1 && AM.HasBaseReg) + return true; + + return false; + } + case AMDGPUAS::FLAT_ADDRESS: { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); + } + default: + llvm_unreachable("unhandled address space"); + } +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + if (!VT.isSimple() || VT == MVT::Other) + return false; + + // TODO - CI+ supports unaligned memory accesses, but this requires driver + // support. + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + return Align % 4 == 0; + } + + // Smaller than dword value must be aligned. 
+ // FIXME: This should be allowed on CI+ + if (VT.bitsLT(MVT::i32)) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. + // This applies to private, global, and constant memory. + if (IsFast) + *IsFast = true; + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; +} + +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: Should account for address space here. + + // The default fallback uses the private pointer size as a guess for a type to + // use. Make sure we switch these to 64-bit accesses. + + if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + return MVT::v4i32; + + if (Size >= 8 && DstAlign >= 4) + return MVT::v2i32; + + // Use the default. + return MVT::Other; +} + +TargetLoweringBase::LegalizeTypeAction +SITargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + return TII->isInlineConstant(Imm); +} + +static EVT toIntegerVT(EVT VT) { + if (VT.isVector()) + return VT.changeVectorElementTypeToInteger(); + return MVT::getIntegerVT(VT.getSizeInBits()); +} + +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + SDLoc SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout *DL = getDataLayout(); + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, + MRI.getLiveInVirtReg(InputPtrReg), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); + SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + + unsigned Align = DL->getABITypeAlignment(Ty); + + if (VT != MemVT && VT.isFloatingPoint()) { + // Do an integer load and convert. + // FIXME: This is mostly because load legalization after type legalization + // doesn't handle FP extloads. + assert(VT.getScalarType() == MVT::f32 && + MemVT.getScalarType() == MVT::f16); + + EVT IVT = toIntegerVT(VT); + EVT MemIVT = toIntegerVT(MemVT); + SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, + IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); // Alignment + return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); + } + + ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  return DAG.getLoad(ISD::UNINDEXED, ExtTy,
+                     VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
+                     false, // isVolatile
+                     true, // isNonTemporal
+                     true, // isInvariant
+                     Align); // Alignment
+}
+
+SDValue SITargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  FunctionType *FType = MF.getFunction()->getFunctionType();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  assert(CallConv == CallingConv::C);
+
+  SmallVector<ISD::InputArg, 16> Splits;
+  BitVector Skipped(Ins.size());
+
+  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
+    const ISD::InputArg &Arg = Ins[i];
+
+    // First check if it's a PS input addr.
+    if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+        !Arg.Flags.isByVal()) {
+
+      assert((PSInputNum <= 15) && "Too many PS inputs!");
+
+      if (!Arg.Used) {
+        // We can safely skip PS inputs.
+        Skipped.set(i);
+        ++PSInputNum;
+        continue;
+      }
+
+      Info->PSInputAddr |= 1 << PSInputNum++;
+    }
+
+    // Second, split vertices into their elements.
+    if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
+      ISD::InputArg NewArg = Arg;
+      NewArg.Flags.setSplit();
+      NewArg.VT = Arg.VT.getVectorElementType();
+
+      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+      // three or five element vertex only needs three or five registers,
+      // NOT four or eight.
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      for (unsigned j = 0; j != NumElements; ++j) {
+        Splits.push_back(NewArg);
+        NewArg.PartOffset += NewArg.VT.getStoreSize();
+      }
+
+    } else if (Info->getShaderType() != ShaderType::COMPUTE) {
+      Splits.push_back(Arg);
+    }
+  }
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // At least one interpolation mode must be enabled or else the GPU will hang.
+  if (Info->getShaderType() == ShaderType::PIXEL &&
+      (Info->PSInputAddr & 0x7F) == 0) {
+    Info->PSInputAddr |= 1;
+    CCInfo.AllocateReg(AMDGPU::VGPR0);
+    CCInfo.AllocateReg(AMDGPU::VGPR1);
+  }
+
+  // The pointer to the list of arguments is stored in SGPR0, SGPR1.
+  // The pointer to the scratch buffer is stored in SGPR2, SGPR3.
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
+    if (Subtarget->isAmdHsaOS())
+      Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
+    else
+      Info->NumUserSGPRs = 4;
+
+    unsigned InputPtrReg =
+        TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+    unsigned InputPtrRegLo =
+        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
+    unsigned InputPtrRegHi =
+        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
+
+    unsigned ScratchPtrReg =
+        TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+    unsigned ScratchPtrRegLo =
+        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
+    unsigned ScratchPtrRegHi =
+        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
+
+    CCInfo.AllocateReg(InputPtrRegLo);
+    CCInfo.AllocateReg(InputPtrRegHi);
+    CCInfo.AllocateReg(ScratchPtrRegLo);
+    CCInfo.AllocateReg(ScratchPtrRegHi);
+    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+    MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
+  }
+
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
+    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+                            Splits);
+  }
+
+  AnalyzeFormalArguments(CCInfo, Splits);
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+
+    const ISD::InputArg &Arg = Ins[i];
+    if (Skipped[i]) {
+      InVals.push_back(DAG.getUNDEF(Arg.VT));
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[ArgIdx++];
+    MVT VT = VA.getLocVT();
+
+    if (VA.isMemLoc()) {
+      VT = Ins[i].VT;
+      EVT MemVT = Splits[i].VT;
+      const unsigned Offset = 36 + VA.getLocMemOffset();
+      // The first 36 bytes of the input buffer contain information about
+      // thread group and global sizes.
+      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
+                                   Offset, Ins[i].Flags.isSExt());
+
+      const PointerType *ParamTy =
+          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
+      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+        // On SI local pointers are just offsets into LDS, so they are always
+        // less than 16 bits. On CI and newer they could potentially be
+        // real pointers, so we can't guarantee their size.
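The AssertZext emitted next records a range fact instead of generating any code; a minimal sketch of what it buys (assuming Arg then holds the asserted node, as below):

  // Combines can now prove the top 16 bits are zero, so e.g.
  // (and Arg, 0xffff) and (zext (trunc Arg)) fold back to Arg.
  bool Known = DAG.MaskedValueIsZero(Arg, APInt::getHighBitsSet(32, 16));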
+        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+                          DAG.getValueType(MVT::i16));
+      }
+
+      InVals.push_back(Arg);
+      Info->ABIArgOffset = Offset + MemVT.getStoreSize();
+      continue;
+    }
+    assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+    unsigned Reg = VA.getLocReg();
+
+    if (VT == MVT::i64) {
+      // For now assume it is a pointer.
+      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
+                                     &AMDGPU::SReg_64RegClass);
+      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
+      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      continue;
+    }
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+    Reg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+    if (Arg.VT.isVector()) {
+
+      // Build a vector from the registers.
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      SmallVector<SDValue, 4> Regs;
+      Regs.push_back(Val);
+      for (unsigned j = 1; j != NumElements; ++j) {
+        Reg = ArgLocs[ArgIdx++].getLocReg();
+        Reg = MF.addLiveIn(Reg, RC);
+        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      }
+
+      // Fill up the missing vector elements.
+      NumElements = Arg.VT.getVectorNumElements() - NumElements;
+      Regs.append(NumElements, DAG.getUNDEF(VT));
+
+      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
+      continue;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  if (Info->getShaderType() != ShaderType::COMPUTE) {
+    unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
+        AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
+    Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+  }
+  return Chain;
+}
+
+MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const {
+
+  MachineBasicBlock::iterator I = *MI;
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  switch (MI->getOpcode()) {
+  default:
+    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case AMDGPU::BRANCH:
+    return BB;
+  case AMDGPU::SI_RegisterStorePseudo: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
+                Reg);
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+      MIB.addOperand(MI->getOperand(i));
+
+    MI->eraseFromParent();
+    break;
+  }
+  }
+  return BB;
+}
+
+bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  // This currently forces unfolding various combinations of fsub into fma with
+  // free fneg'd operands. As long as we have fast FMA (controlled by
+  // isFMAFasterThanFMulAndFAdd), we should perform these.
+
+  // When fma is quarter rate, for f64 where add / sub are at best half rate,
+  // most of these combines appear to be cycle neutral but save on instruction
+  // count / code size.
+  return true;
+}
+
+EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
+  if (!VT.isVector()) {
+    return MVT::i1;
+  }
+  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+}
+
+MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+  return MVT::i32;
+}
+
+// Answering this is somewhat tricky and depends on the specific device, as
+// different devices have different rates for fma and for f64 operations in
+// general.
+//
+// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
+// regardless of which device (although the number of cycles differs between
+// devices), so it is always profitable for f64.
+//
+// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
+// only on full rate devices. Normally, we should prefer selecting v_mad_f32
+// which we can always do even without fused FP ops since it returns the same
+// result as the separate operations and since it is always full
+// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
+// however does not support denormals, so we do report fma as faster if we have
+// a fast fma device and require denormals.
+//
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    // This is as fast on some subtargets. However, we always have full rate f32
+    // mad available which returns the same result as the separate operations
+    // which we should prefer over fma. We can't use this if we want to support
+    // denormals, so only report this in these cases.
+    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::LOAD: {
+    SDValue Result = LowerLOAD(Op, DAG);
+    assert((!Result.getNode() ||
+            Result.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Result;
+  }
+
+  case ISD::FSIN:
+  case ISD::FCOS:
+    return LowerTrig(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::FDIV: return LowerFDIV(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::GlobalAddress: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    return LowerGlobalAddress(MFI, Op, DAG);
+  }
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
+  }
+  return SDValue();
+}
+
+/// \brief Helper function for LowerBRCOND
+static SDNode *findUser(SDValue Value, unsigned Opcode) {
+
+  SDNode *Parent = Value.getNode();
+  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
+       I != E; ++I) {
+
+    if (I.getUse().get() != Value)
+      continue;
+
+    if (I->getOpcode() == Opcode)
+      return *I;
+  }
+  return nullptr;
+}
+
+SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+  unsigned FrameIndex = FINode->getIndex();
+
+  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+}
+
+/// This transforms the control flow intrinsics to get the branch destination
+/// as the last parameter, and also switches the branch target with BR if the
+/// need arises.
+SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
+                                      SelectionDAG &DAG) const {
+
+  SDLoc DL(BRCOND);
+
+  SDNode *Intr = BRCOND.getOperand(1).getNode();
+  SDValue Target = BRCOND.getOperand(2);
+  SDNode *BR = nullptr;
+
+  if (Intr->getOpcode() == ISD::SETCC) {
+    // As long as we negate the condition, everything is fine.
+    SDNode *SetCC = Intr;
+    assert(SetCC->getConstantOperandVal(1) == 1);
+    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+           ISD::SETNE);
+    Intr = SetCC->getOperand(0).getNode();
+
+  } else {
+    // Get the target from BR if we don't negate the condition.
+    BR = findUser(BRCOND, ISD::BR);
+    Target = BR->getOperand(1);
+  }
+
+  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+
+  // Build the result and
+  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
+
+  // operands of the new intrinsic call
+  SmallVector<SDValue, 4> Ops;
+  Ops.push_back(BRCOND.getOperand(0));
+  Ops.append(Intr->op_begin() + 1, Intr->op_end());
+  Ops.push_back(Target);
+
+  // build the new intrinsic call
+  SDNode *Result = DAG.getNode(
+      Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
+      DAG.getVTList(Res), Ops).getNode();
+
+  if (BR) {
+    // Give the branch instruction our target
+    SDValue Ops[] = {
+      BR->getOperand(0),
+      BRCOND.getOperand(2)
+    };
+    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
+    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
+    BR = NewBR.getNode();
+  }
+
+  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
+
+  // Copy the intrinsic results to registers
+  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
+    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
+    if (!CopyToReg)
+      continue;
+
+    Chain = DAG.getCopyToReg(
+        Chain, DL,
+        CopyToReg->getOperand(1),
+        SDValue(Result, i - 1),
+        SDValue());
+
+    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
+  }
+
+  // Remove the old intrinsic from the chain
+  DAG.ReplaceAllUsesOfValueWith(
+      SDValue(Intr, Intr->getNumValues() - 1),
+      Intr->getOperand(0));
+
+  return Chain;
+}
+
+SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                             SDValue Op,
+                                             SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+  SDLoc DL(GSD);
+  const GlobalValue *GV = GSD->getGlobal();
+  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+
+  SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+
+  SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(0, DL, MVT::i32));
+  SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(1, DL, MVT::i32));
+
+  SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrLo, GA);
+  SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrHi, DAG.getConstant(0, DL, MVT::i32),
+                           SDValue(Lo.getNode(), 1));
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+}
+
+SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
+                                   SDValue V) const {
+  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
+  // so we will end up with redundant moves to m0.
+  //
+  // We can't use S_MOV_B32, because there is no way to specify m0 as the
+  // destination register.
+  //
+  // We have to use them both. Machine CSE will combine all the S_MOV_B32
+  // instructions and the register coalescer will eliminate the extra copies.
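A hedged sketch of the resulting pattern for one caller (register numbers hypothetical; compare the interpolation intrinsics further down):

  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
  SDValue Glue = M0.getValue(1); // glue orders the m0 write before its user
  // Machine code: %vreg5 = S_MOV_B32 %vreg4  ; duplicates merged by MachineCSE
  //               %M0    = COPY %vreg5       ; folded away by the coalescer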
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); + return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), + SDValue(M0, 0), SDValue()); // Glue + // A Null SDValue creates + // a glue result. +} + +SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + + switch (IntrinsicID) { + case Intrinsic::r600_read_ngroups_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); + case Intrinsic::r600_read_ngroups_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); + case Intrinsic::r600_read_ngroups_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); + case Intrinsic::r600_read_global_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + case Intrinsic::r600_read_global_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + case Intrinsic::r600_read_global_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + case Intrinsic::r600_read_local_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_X, false); + case Intrinsic::r600_read_local_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + case Intrinsic::r600_read_local_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Z, false); + + case Intrinsic::AMDGPU_read_workdim: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset, + false); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops[] = { + Op.getOperand(1), + Op.getOperand(2) + }; + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, 
Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. + return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), + DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); + case AMDGPUIntrinsic::SI_fs_constant: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(1), Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_fs_interp: { + SDValue IJ = Op.getOperand(4); + SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(0, DL, MVT::i32)); + SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(1, DL, MVT::i32)); + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, + DAG.getVTList(MVT::f32, MVT::Glue), + I, Op.getOperand(1), Op.getOperand(2), Glue); + Glue = SDValue(P1.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, + Op.getOperand(1), Op.getOperand(2), Glue); + } + default: + return AMDGPUTargetLowering::LowerOperation(Op, DAG); + } +} + +SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_sendmsg: { + Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + SDValue Glue = Chain.getValue(1); + return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, + Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: + return SDValue(); + } +} + +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); + + if (Op.getValueType().isVector()) { + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned NumElements = Op.getValueType().getVectorNumElements(); + assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { + default: break; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::PRIVATE_ADDRESS: + // v4 loads are supported for private and global memory. 
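For orientation, the policy of this switch restated as a standalone helper (illustrative only, not part of the tree):

  static bool scalarizeVectorLoad(unsigned AS, unsigned NumElts) {
    switch (AS) {
    case AMDGPUAS::GLOBAL_ADDRESS:
    case AMDGPUAS::PRIVATE_ADDRESS:
      return NumElts > 4; // v8/v16 loads are split; v4 stays whole
    case AMDGPUAS::LOCAL_ADDRESS:
      return true;        // wide LDS vector loads are always split
    default:
      return false;       // constant/flat: default lowering handles it
    }
  }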
+      if (NumElements <= 4)
+        break;
+      // fall-through
+    case AMDGPUAS::LOCAL_ADDRESS:
+      return ScalarizeVectorLoad(Op, DAG);
+    }
+  }
+
+  return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+}
+
+SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
+                                               const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+                     Op.getOperand(2),
+                     Op.getOperand(3),
+                     Op.getOperand(4));
+}
+
+SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue Cond = Op.getOperand(0);
+
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  SDValue One = DAG.getConstant(1, DL, MVT::i32);
+
+  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
+
+  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
+  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
+
+  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
+
+  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
+  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
+
+  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
+
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+}
+
+// Catch division cases where we can use shortcuts with rcp and rsq
+// instructions.
+SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+
+  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
+        CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+      // the CI documentation have a worst-case error of 1 ulp.
+      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+      // use it as long as we aren't trying to use denormals.
+
+      // 1.0 / sqrt(x) -> rsq(x)
+      //
+      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+      // error seems really high at 2^29 ULP.
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    }
+  }
+
+  if (Unsafe) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  SDValue FastLowered = LowerFastFDIV(Op, DAG);
+  if (FastLowered.getNode())
+    return FastLowered;
+
+  // This uses v_rcp_f32 which does not handle denormals. Let this hit a
+  // selection error for now rather than do something incorrect.
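Stepping back, LowerFastFDIV above reduces to this standalone sketch (plain C++ stand-ins; the real v_rcp_f32/v_rsq_f32 are single instructions with at most 1 ulp of error and no denormal support):

  #include <cmath>
  static float rcp(float x) { return 1.0f / x; }            // ~ v_rcp_f32
  static float rsq(float x) { return 1.0f / std::sqrt(x); } // ~ v_rsq_f32

  float fastFDiv(float x, float y, bool unsafeMath) {
    if (x == 1.0f)
      return rcp(y);     // 1.0 / y -> rcp(y); 1.0 / sqrt(y) -> rsq(y)
    if (unsafeMath)
      return x * rcp(y); // x / y -> x * (1.0 / y)
    return x / y;        // otherwise take the full-precision path
  }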
+  if (Subtarget->hasFP32Denormals())
+    return SDValue();
+
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+  const APFloat K0Val(BitsToFloat(0x6f800000)); // 2^96
+  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+
+  const APFloat K1Val(BitsToFloat(0x2f800000)); // 2^-32
+  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  // If |den| > 2^96, pre-scale the denominator by 2^-32 so the reciprocal
+  // below doesn't underflow; the final multiply by r3 restores the quotient.
+  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+
+  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
+
+SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return LowerFastFDIV(Op, DAG);
+
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
+
+  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
+
+  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
+
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
+
+  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
+
+  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
+
+  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
+
+  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
+
+  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
+
+  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
+
+  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
+                             NegDivScale0, Mul, DivScale1);
+
+  SDValue Scale;
+
+  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    // Workaround a hardware bug on SI where the condition output from div_scale
+    // is not usable.
+
+    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
+
+    // Figure out which scale to use for div_fmas.
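The recomputation that follows can be read as plain C++ over the high dwords of the four f64 values (a sketch; div_scale rescales by a large power of two, which always rewrites the exponent field in the high dword):

  #include <cstdint>
  static bool chooseScale(uint32_t numHi, uint32_t denHi,
                          uint32_t scale0Hi, uint32_t scale1Hi) {
    bool denKept = (denHi == scale0Hi); // div_scale(y, y, x) left y alone
    bool numKept = (numHi == scale1Hi); // div_scale(x, y, x) left x alone
    return numKept != denKept;          // the XOR built below
  }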
+ SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); + SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); + SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); + + SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + + SDValue Scale0Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + + SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); + SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); + Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); + } else { + Scale = DivScale1.getValue(1); + } + + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, + Fma4, Fma3, Mul, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); +} + +SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFDIV32(Op, DAG); + + if (VT == MVT::f64) + return LowerFDIV64(Op, DAG); + + llvm_unreachable("Unexpected type for fdiv"); +} + +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT VT = Store->getMemoryVT(); + + // These stores are legal. + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (VT.isVector() && VT.getVectorNumElements() > 4) + return ScalarizeVectorStore(Op, DAG); + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) + return Ret; + + if (VT.isVector() && VT.getVectorNumElements() >= 8) + return ScalarizeVectorStore(Op, DAG); + + if (VT == MVT::i1) + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + + return SDValue(); +} + +SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.5/M_PI, DL, + VT))); + + switch (Op.getOpcode()) { + case ISD::FCOS: + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); + case ISD::FSIN: + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); + default: + llvm_unreachable("Wrong trig opcode"); + } +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + EVT ScalarVT = VT.getScalarType(); + if (ScalarVT != MVT::f32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. 
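What the CVT_F32_UBYTEn family computes, restated standalone (byte n of the 32-bit source, zero-extended and converted):

  #include <cstdint>
  static float cvt_f32_ubyte(uint32_t src, unsigned n) { // n in [0, 3]
    return (float)((src >> (8 * n)) & 0xffu);
  }
  // When the top 24 bits of Src are provably zero,
  //   (uint_to_fp Src) == cvt_f32_ubyte(Src, 0)
  // and the masking/shifting feeding the conversion can be dropped.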
+  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+      DCI.AddToWorklist(Cvt.getNode());
+      return Cvt;
+    }
+  }
+
+  // We are primarily trying to catch operations on illegal vector types
+  // before they are expanded.
+  // For scalars, we can use the more flexible method of checking masked bits
+  // after legalization.
+  if (!DCI.isBeforeLegalize() ||
+      !SrcVT.isVector() ||
+      SrcVT.getVectorElementType() != MVT::i8) {
+    return SDValue();
+  }
+
+  assert(DCI.isBeforeLegalize() && "Unexpected legal type");
+
+  // Weird sized vectors are a pain to handle, but we know 3 is really the same
+  // size as 4.
+  unsigned NElts = SrcVT.getVectorNumElements();
+  if (!SrcVT.isSimple() && NElts != 3)
+    return SDValue();
+
+  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
+  // prevent a mess from expanding to v4i32 and repacking.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
+    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
+    LoadSDNode *Load = cast<LoadSDNode>(Src);
+
+    unsigned AS = Load->getAddressSpace();
+    unsigned Align = Load->getAlignment();
+    Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+
+    // Don't try to replace the load if we have to expand it due to alignment
+    // problems. Otherwise we will end up scalarizing the load, and trying to
+    // repack into the vector for no real reason.
+    if (Align < ABIAlignment &&
+        !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
+      return SDValue();
+    }
+
+    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
+                                     Load->getChain(),
+                                     Load->getBasePtr(),
+                                     LoadVT,
+                                     Load->getMemOperand());
+
+    // Make sure successors of the original load stay after it by updating
+    // them to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
+
+    SmallVector<SDValue, 4> Elts;
+    if (RegVT.isVector())
+      DAG.ExtractVectorElements(NewLoad, Elts);
+    else
+      Elts.push_back(NewLoad);
+
+    SmallVector<SDValue, 4> Ops;
+
+    unsigned EltIdx = 0;
+    for (SDValue Elt : Elts) {
+      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
+      for (unsigned I = 0; I < ComponentsInElt; ++I) {
+        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
+        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
+        DCI.AddToWorklist(Cvt.getNode());
+        Ops.push_back(Cvt);
+      }
+
+      ++EltIdx;
+    }
+
+    assert(Ops.size() == NElts);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
+  }
+
+  return SDValue();
+}
+
+/// \brief Return true if the given offset Size in bytes can be folded into
+/// the immediate offsets of a memory instruction for the given address space.
+static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
+                          const AMDGPUSubtarget &STI) {
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS: {
+    // MUBUF instructions have a 12-bit offset in bytes.
+    return isUInt<12>(OffsetSize);
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    // SMRD instructions have an 8-bit offset in dwords on SI and
+    // a 20-bit offset in bytes on VI.
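A worked example for the constant address space (compile-time checks; the offsets are hypothetical):

  constexpr bool foldableOnSI(unsigned OffBytes) {
    return (OffBytes % 4 == 0) && (OffBytes / 4 < 256); // 8-bit dword offset
  }
  constexpr bool foldableOnVI(unsigned OffBytes) {
    return OffBytes < (1u << 20);                       // 20-bit byte offset
  }
  static_assert(foldableOnSI(1020) && foldableOnVI(1020), "255 dwords");
  static_assert(!foldableOnSI(1024) && foldableOnVI(1024), "256 dwords");
  static_assert(!foldableOnSI(1022), "not dword aligned");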
+    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      return isUInt<20>(OffsetSize);
+    else
+      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
+  }
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS: {
+    // The single offset versions have a 16-bit offset in bytes.
+    return isUInt<16>(OffsetSize);
+  }
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    // Indirect register addressing does not use any offsets.
+  default:
+    return 0;
+  }
+}
+
+// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
+
+// This is a variant of
+// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
+//
+// The normal DAG combiner will do this, but only if the add has one use since
+// that would increase the number of instructions.
+//
+// This prevents us from seeing a constant offset that can be folded into a
+// memory instruction's addressing mode. If we know the resulting add offset of
+// a pointer can be folded into an addressing offset, we can replace the pointer
+// operand with the add of the new constant offset. This eliminates one of the
+// uses, and may allow the remaining use to also be simplified.
+//
+SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
+                                               unsigned AddrSpace,
+                                               DAGCombinerInfo &DCI) const {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (N0.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
+  if (!CN1)
+    return SDValue();
+
+  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!CAdd)
+    return SDValue();
+
+  // If the resulting offset is too large, we can't fold it into the addressing
+  // mode offset.
+  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
+  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
+  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
+
+  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+}
+
+SDValue SITargetLowering::performAndCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+
+  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if (LHS.getOpcode() == ISD::SETCC &&
+      RHS.getOpcode() == ISD::SETCC) {
+    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+
+    SDValue X = LHS.getOperand(0);
+    SDValue Y = RHS.getOperand(0);
+    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
+      return SDValue();
+
+    if (LCC == ISD::SETO) {
+      if (X != LHS.getOperand(1))
+        return SDValue();
+
+      if (RCC == ISD::SETUNE) {
+        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+        if (!C1 || !C1->isInfinity() || C1->isNegative())
+          return SDValue();
+
+        const uint32_t Mask = SIInstrFlags::N_NORMAL |
+                              SIInstrFlags::N_SUBNORMAL |
+                              SIInstrFlags::N_ZERO |
+                              SIInstrFlags::P_ZERO |
+                              SIInstrFlags::P_SUBNORMAL |
+                              SIInstrFlags::P_NORMAL;
+
+        static_assert(((~(SIInstrFlags::S_NAN |
+                          SIInstrFlags::Q_NAN |
+                          SIInstrFlags::N_INFINITY |
+                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
+                      "mask not equal");
+
+        SDLoc DL(N);
+        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+                           X, DAG.getConstant(Mask, DL, MVT::i32));
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+    SDValue Src = LHS.getOperand(0);
+    if (Src != RHS.getOperand(0))
+      return SDValue();
+
+    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    if (!CLHS || !CRHS)
+      return SDValue();
+
+    // Only 10 bits are used.
+    static const uint32_t MaxMask = 0x3ff;
+
+    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+    SDLoc DL(N);
+    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+                       Src, DAG.getConstant(NewMask, DL, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Mask = N->getOperand(1);
+
+  // fp_class x, 0 -> false
+  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+    if (CMask->isNullValue())
+      return DAG.getConstant(0, SDLoc(N), MVT::i1);
+  }
+
+  return SDValue();
+}
+
+static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
+  switch (Opc) {
+  case ISD::FMAXNUM:
+    return AMDGPUISD::FMAX3;
+  case ISD::SMAX:
+    return AMDGPUISD::SMAX3;
+  case ISD::UMAX:
+    return AMDGPUISD::UMAX3;
+  case ISD::FMINNUM:
+    return AMDGPUISD::FMIN3;
+  case ISD::SMIN:
+    return AMDGPUISD::SMIN3;
+  case ISD::UMIN:
+    return AMDGPUISD::UMIN3;
+  default:
+    llvm_unreachable("Not a min/max opcode");
+  }
+}
+
+SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  unsigned Opc = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Only do this if the inner op has one use since this will just increase
+  // register pressure for no benefit.
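The rewrite below leans on associativity of min/max; in scalar form (std::max as a stand-in, NaN propagation details glossed over):

  #include <algorithm>
  // Either association collapses to a single v_max3_f32 (or its min3 and
  // integer variants):
  float max3(float a, float b, float c) {
    return std::max(std::max(a, b), c); // == std::max(a, std::max(b, c))
  }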
+
+  // max(max(a, b), c)
+  if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                       DL,
+                       N->getValueType(0),
+                       Op0.getOperand(0),
+                       Op0.getOperand(1),
+                       Op1);
+  }
+
+  // max(a, max(b, c))
+  if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                       DL,
+                       N->getValueType(0),
+                       Op0,
+                       Op1.getOperand(0),
+                       Op1.getOperand(1));
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performSetCCCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+
+  // Match isinf pattern
+  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+    if (!CRHS)
+      return SDValue();
+
+    const APFloat &APF = CRHS->getValueAPF();
+    if (APF.isInfinity() && !APF.isNegative()) {
+      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
+                         DAG.getConstant(Mask, SL, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  switch (N->getOpcode()) {
+  default:
+    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+  case ISD::SETCC:
+    return performSetCCCombine(N, DCI);
+  case ISD::FMAXNUM: // TODO: What about fmax_legacy?
+  case ISD::FMINNUM:
+  case ISD::SMAX:
+  case ISD::SMIN:
+  case ISD::UMAX:
+  case ISD::UMIN: {
+    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+        N->getValueType(0) != MVT::f64 &&
+        getTargetMachine().getOptLevel() > CodeGenOpt::None)
+      return performMin3Max3Combine(N, DCI);
+    break;
+  }
+
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3: {
+    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+    SDValue Src = N->getOperand(0);
+    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+      DCI.CommitTargetLoweringOpt(TLO);
+    }
+
+    break;
+  }
+
+  case ISD::UINT_TO_FP: {
+    return performUCharToFloatCombine(N, DCI);
+
+  case ISD::FADD: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::f32)
+      break;
+
+    // Only do this if we are not trying to support denormals. v_mad_f32 does
+    // not support denormals ever.
+    if (Subtarget->hasFP32Denormals())
+      break;
+
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+
+    // These should really be instruction patterns, but writing patterns with
+    // source modifiers is a pain.
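Why the folds below are exact (a sketch, assuming no FP contraction): a + a and 2.0f * a are both computed without rounding, so the only rounding in either form is the final add, and v_mad_f32 rounds its multiply and add separately just like the original nodes (denormals excluded, which is why hasFP32Denormals() bails out above):

  static float mad(float s0, float s1, float s2) { return s0 * s1 + s2; }
  static float folded(float a, float b) { return mad(2.0f, a, b); } // (a+a)+b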
+ + // fadd (fadd (a, a), b) -> mad 2.0, a, b + if (LHS.getOpcode() == ISD::FADD) { + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); + } + } + + // fadd (b, fadd (a, a)) -> mad 2.0, a, b + if (RHS.getOpcode() == ISD::FADD) { + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); + } + } + + return SDValue(); + } + case ISD::FSUB: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + + // Try to get the fneg to fold into the source modifier. This undoes generic + // DAG combines and folds them into the mad. + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() == ISD::FADD) { + // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) + + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); + } + } + + if (RHS.getOpcode() == ISD::FADD) { + // (fsub c, (fadd a, a)) -> mad -2.0, a, c + + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); + } + } + + return SDValue(); + } + + break; + } + } + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + if (DCI.isBeforeLegalize()) + break; + + MemSDNode *MemNode = cast<MemSDNode>(N); + SDValue Ptr = MemNode->getBasePtr(); + + // TODO: We could also do this for multiplies. + unsigned AS = MemNode->getAddressSpace(); + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (NewPtr) { + SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); + + NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr;
+        return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
+      }
+    }
+    break;
+  }
+  case ISD::AND:
+    return performAndCombine(N, DCI);
+  case ISD::OR:
+    return performOrCombine(N, DCI);
+  case AMDGPUISD::FP_CLASS:
+    return performClassCombine(N, DCI);
+  }
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+/// \brief Analyze the possible immediate value N
+///
+/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate,
+/// and the immediate value if it's a literal immediate
+int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
+
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
+    if (TII->isInlineConstant(Node->getAPIntValue()))
+      return 0;
+
+    uint64_t Val = Node->getZExtValue();
+    return isUInt<32>(Val) ? Val : -1;
+  }
+
+  if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+    if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
+      return 0;
+
+    if (Node->getValueType(0) == MVT::f32)
+      return FloatToBits(Node->getValueAPF().convertToFloat());
+
+    return -1;
+  }
+
+  return -1;
+}
+
+/// \brief Helper function for adjustWritemask
+static unsigned SubIdx2Lane(unsigned Idx) {
+  switch (Idx) {
+  default: return 0;
+  case AMDGPU::sub0: return 0;
+  case AMDGPU::sub1: return 1;
+  case AMDGPU::sub2: return 2;
+  case AMDGPU::sub3: return 3;
+  }
+}
+
+/// \brief Adjust the writemask of MIMG instructions
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+                                       SelectionDAG &DAG) const {
+  SDNode *Users[4] = { };
+  unsigned Lane = 0;
+  unsigned OldDmask = Node->getConstantOperandVal(0);
+  unsigned NewDmask = 0;
+
+  // Try to figure out the used register components
+  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
+       I != E; ++I) {
+
+    // Abort if we can't understand the usage
+    if (!I->isMachineOpcode() ||
+        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+      return;
+
+    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+    // Note that subregs are packed, i.e. Lane==0 is the first bit set
+    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+    // set, etc.
+    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+
+    // Set which texture component corresponds to the lane.
+    unsigned Comp;
+    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+      assert(Dmask);
+      Comp = countTrailingZeros(Dmask);
+      Dmask &= ~(1 << Comp);
+    }
+
+    // Abort if we have more than one user per component
+    if (Users[Lane])
+      return;
+
+    Users[Lane] = *I;
+    NewDmask |= 1 << Comp;
+  }
+
+  // Abort if there's no change
+  if (NewDmask == OldDmask)
+    return;
+
+  // Adjust the writemask in the node
+  std::vector<SDValue> Ops;
+  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
+  Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
+  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
+
+  // If we only got one lane, replace it with a copy
+  // (if NewDmask has only one bit set...)
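A worked example of the remasking (standalone sketch; __builtin_ctz stands in for countTrailingZeros): an image_sample with dmask = 0xf whose users only extract sub0 and sub2 shrinks to dmask = 0x5, writing two VGPRs instead of four:

  #include <cstdint>
  static uint32_t remask(uint32_t oldDmask, const unsigned (&lanes)[2]) {
    uint32_t newDmask = 0;
    for (unsigned lane : lanes) {            // lanes = {0, 2}
      uint32_t dmask = oldDmask, comp = 0;
      for (unsigned i = 0; i <= lane; ++i) { // lane -> i-th set bit of mask
        comp = __builtin_ctz(dmask);
        dmask &= dmask - 1;
      }
      newDmask |= 1u << comp;
    }
    return newDmask;                         // 0x5 for oldDmask = 0xf
  }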
+  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
+    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
+                                       MVT::i32);
+    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                      SDLoc(), Users[Lane]->getValueType(0),
+                                      SDValue(Node, 0), RC);
+    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
+    return;
+  }
+
+  // Update the users of the node with the new indices
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+
+    SDNode *User = Users[i];
+    if (!User)
+      continue;
+
+    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+
+    switch (Idx) {
+    default: break;
+    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
+    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
+    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+    }
+  }
+}
+
+/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// with frame index operands.
+/// LLVM assumes that inputs to these instructions are registers.
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
+    if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+      Ops.push_back(Node->getOperand(i));
+      continue;
+    }
+
+    SDLoc DL(Node);
+    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
+                                             Node->getOperand(i).getValueType(),
+                                             Node->getOperand(i)), 0));
+  }
+
+  DAG.UpdateNodeOperands(Node, Ops);
+}
+
+/// \brief Fold the instructions after selecting them.
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+                                          SelectionDAG &DAG) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  if (TII->isMIMG(Node->getMachineOpcode()))
+    adjustWritemask(Node, DAG);
+
+  if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
+      Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+    legalizeTargetIndependentNode(Node, DAG);
+    return Node;
+  }
+  return Node;
+}
+
+/// \brief Assign the register class depending on the number of
+/// bits set in the writemask
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                                     SDNode *Node) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  TII->legalizeOperands(MI);
+
+  if (TII->isMIMG(MI->getOpcode())) {
+    unsigned VReg = MI->getOperand(0).getReg();
+    unsigned Writemask = MI->getOperand(1).getImm();
+    unsigned BitsSet = 0;
+    for (unsigned i = 0; i < 4; ++i)
+      BitsSet += Writemask & (1 << i) ? 1 : 0;
+
+    const TargetRegisterClass *RC;
+    switch (BitsSet) {
+    default: return;
+    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
+    case 2: RC = &AMDGPU::VReg_64RegClass; break;
+    case 3: RC = &AMDGPU::VReg_96RegClass; break;
+    }
+
+    unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+    MI->setDesc(TII->get(NewOpcode));
+    MRI.setRegClass(VReg, RC);
+    return;
+  }
+
+  // Replace unused atomics with the no return version.
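The gating condition, restated as a sketch (the mapping from a returning opcode to its non-returning twin comes from the getAtomicNoRetOp table used below):

  static bool wantNoRetAtomic(const SDNode *Node, int NoRetAtomicOp) {
    // The atomic's data result (value 0) is dead, e.g. an atomicrmw whose
    // return value is ignored, and a non-returning twin opcode exists.
    return NoRetAtomicOp != -1 && !Node->hasAnyUseOfValue(0);
  }

Dropping the def frees the destination VGPR and avoids waiting on the returned value.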
+  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
+  if (NoRetAtomicOp != -1) {
+    if (!Node->hasAnyUseOfValue(0)) {
+      MI->setDesc(TII->get(NoRetAtomicOp));
+      MI->RemoveOperand(0);
+    }
+
+    return;
+  }
+}
+
+static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
+  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
+  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
+}
+
+MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
+                                                SDLoc DL,
+                                                SDValue Ptr) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+#if 1
+  // XXX - Workaround for moveToVALU not handling different register class
+  // inserts for REG_SEQUENCE.
+
+  // Build the half of the subregister with the constants.
+  const SDValue Ops0[] = {
+    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+    buildSMovImm32(DAG, DL, 0),
+    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+  };
+
+  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                MVT::v2i32, Ops0), 0);
+
+  // Combine the constants and the pointer.
+  const SDValue Ops1[] = {
+    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+    Ptr,
+    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+    SubRegHi,
+    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+  };
+
+  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
+#else
+  const SDValue Ops[] = {
+    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+    Ptr,
+    DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
+    buildSMovImm32(DAG, DL, 0),
+    DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
+    buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
+    DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
+  };
+
+  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+
+#endif
+}
+
+/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+///        of the resource descriptor) to create an offset, which is added to
+///        the resource pointer.
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const { + SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG.getConstant(RsrcDword1, DL, MVT::i32)), + 0); + } + + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + PtrLo, + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + PtrHi, + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + DataLo, + DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), + DataHi, + DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); +} + +MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + return buildRSRC(DAG, DL, Ptr, 0, Rsrc); +} + +SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), + cast<RegisterSDNode>(VReg)->getReg(), VT); +} + +//===----------------------------------------------------------------------===// +// SI Inline Assembly Support +//===----------------------------------------------------------------------===// + +std::pair<unsigned, const TargetRegisterClass *> +SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, + MVT VT) const { + if (Constraint == "r") { + switch(VT.SimpleTy) { + default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); + case MVT::i64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case MVT::i32: + return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + } + } + + if (Constraint.size() > 1) { + const TargetRegisterClass *RC = nullptr; + if (Constraint[1] == 'v') { + RC = &AMDGPU::VGPR_32RegClass; + } else if (Constraint[1] == 's') { + RC = &AMDGPU::SGPR_32RegClass; + } + + if (RC) { + unsigned Idx = std::atoi(Constraint.substr(2).c_str()); + if (Idx < RC->getNumRegs()) + return std::make_pair(RC->getRegister(Idx), RC); + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h new file mode 100644 index 0000000..a956b01 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -0,0 +1,125 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H + +#include "AMDGPUISelLowering.h" +#include "SIInstrInfo.h" + +namespace llvm { + +class SITargetLowering : public AMDGPUTargetLowering { + SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, + SDValue Chain, unsigned Offset, bool Signed) const; + SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, + SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const override; + + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; + + SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performSHLPtrCombine(SDNode *N, + unsigned AS, + DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; + +public: + SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); + + bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, + EVT /*VT*/) const override; + + bool isLegalAddressingMode(const AddrMode &AM, + Type *Ty, unsigned AS) const override; + + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; + + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; + + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const override; + bool enableAggressiveFMAFusion(EVT VT) const override; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + MVT getScalarShiftAmountTy(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue LowerOperation(SDValue Op, 
SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+  void AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                     SDNode *Node) const override;
+
+  int32_t analyzeImmediate(const SDNode *N) const;
+  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+                               unsigned Reg, EVT VT) const override;
+  void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
+
+  MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const;
+  MachineSDNode *buildRSRC(SelectionDAG &DAG,
+                           SDLoc DL,
+                           SDValue Ptr,
+                           uint32_t RsrcDword1,
+                           uint64_t RsrcDword2And3) const;
+  MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
+                                  SDLoc DL,
+                                  SDValue Ptr) const;
+
+  std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
+                                  const TargetRegisterInfo *TRI,
+                                  const std::string &Constraint, MVT VT) const override;
+  SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
new file mode 100644
index 0000000..90a37f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -0,0 +1,480 @@
+//===-- SIInsertWaits.cpp - Insert wait states for memory operations -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+typedef union {
+  struct {
+    unsigned VM;
+    unsigned EXP;
+    unsigned LGKM;
+  } Named;
+  unsigned Array[3];
+
+} Counters;
+
+typedef enum {
+  OTHER,
+  SMEM,
+  VMEM
+} InstType;
+
+typedef Counters RegCounters[512];
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+
+  /// \brief Constant hardware limits
+  static const Counters WaitCounts;
+
+  /// \brief Constant zero value
+  static const Counters ZeroCounts;
+
+  /// \brief Counter values we have already waited on.
+  Counters WaitedOn;
+
+  /// \brief Counter values for last instruction issued.
+  Counters LastIssued;
+
+  /// \brief Registers used by async instructions.
+  RegCounters UsedRegs;
+
+  /// \brief Registers defined by async instructions.
+  RegCounters DefinedRegs;
+
+  /// \brief Different export instruction types seen since last wait.
+  unsigned ExpInstrTypesSeen;
+
+  /// \brief Type of the last opcode.
+ InstType LastOpcodeType; + + bool LastInstWritesM0; + + /// \brief Get increment/decrement amount for this instruction. + Counters getHwCounts(MachineInstr &MI); + + /// \brief Is operand relevant for async execution? + bool isOpRelevant(MachineOperand &Op); + + /// \brief Get register interval an operand affects. + RegInterval getRegInterval(MachineOperand &Op); + + /// \brief Handle instructions async components + void pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); + + /// \brief Insert the actual wait instruction + bool insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Counts); + + /// \brief Do we need def2def checks? + bool unorderedDefines(MachineInstr &MI); + + /// \brief Resolve all operand dependencies to counter requirements + Counters handleOperands(MachineInstr &MI); + + /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. + void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + +public: + SIInsertWaits(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(nullptr), + TRI(nullptr), + ExpInstrTypesSeen(0) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI insert wait instructions"; + } + +}; + +} // End anonymous namespace + +char SIInsertWaits::ID = 0; + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; + +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { + return new SIInsertWaits(tm); +} + +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { + + uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; + Counters Result; + + Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); + + // Only consider stores or EXP for EXP_CNT + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && + (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); + + // LGKM may uses larger values + if (TSFlags & SIInstrFlags::LGKM_CNT) { + + if (TII->isSMRD(MI.getOpcode())) { + + MachineOperand &Op = MI.getOperand(0); + assert(Op.isReg() && "First LGKM operand must be a register!"); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + + } else { + // DS + Result.Named.LGKM = 1; + } + + } else { + Result.Named.LGKM = 0; + } + + return Result; +} + +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { + + // Constants are always irrelevant + if (!Op.isReg()) + return false; + + // Defines are always relevant + if (Op.isDef()) + return true; + + // For exports all registers are relevant + MachineInstr &MI = *Op.getParent(); + if (MI.getOpcode() == AMDGPU::EXP) + return true; + + // For stores the stored value is also relevant + if (!MI.getDesc().mayStore()) + return false; + + // Check if this operand is the value being stored. + // Special case for DS instructions, since the address + // operand comes before the value operand and it may have + // multiple data operands. 
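A concrete case for the DS special-casing that follows (illustrative sketch only, not taken from the patch):

  // DS_WRITE2_B32 %addr, %data0, %data1, offset0, offset1
  //   => %data0 and %data1 are relevant operands (they are the stored values),
  //      while %addr is not, so only the data registers force a wait.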
+ + if (TII->isDS(MI.getOpcode())) { + MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); + if (Data && Op.isIdenticalTo(*Data)) + return true; + + MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + if (Data0 && Op.isIdenticalTo(*Data0)) + return true; + + MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data1 && Op.isIdenticalTo(*Data1)) + return true; + + return false; + } + + // NOTE: This assumes that the value operand is before the + // address operand, and that there is only one value operand. + for (MachineInstr::mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); I != E; ++I) { + + if (I->isReg() && I->isUse()) + return Op.isIdenticalTo(*I); + } + + return false; +} + +RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { + + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + return std::make_pair(0, 0); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + + assert(Size >= 4); + + RegInterval Result; + Result.first = TRI->getEncodingValue(Reg); + Result.second = Result.first + Size / 4; + + return Result; +} + +void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + + // Get the hardware counter increments and sum them up + Counters Increment = getHwCounts(*I); + unsigned Sum = 0; + + for (unsigned i = 0; i < 3; ++i) { + LastIssued.Array[i] += Increment.Array[i]; + Sum += Increment.Array[i]; + } + + // If we don't increase anything then that's it + if (Sum == 0) { + LastOpcodeType = OTHER; + return; + } + + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + (LastOpcodeType == VMEM && Increment.Named.VM)) { + // Insert a NOP to break the clause. + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + LastInstWritesM0 = false; + } + + if (TII->isSMRD(I->getOpcode())) + LastOpcodeType = SMEM; + else if (Increment.Named.VM) + LastOpcodeType = VMEM; + } + + // Remember which export instructions we have seen + if (Increment.Named.EXP) { + ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; + } + + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + + MachineOperand &Op = I->getOperand(i); + if (!isOpRelevant(Op)) + continue; + + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + // Remember which registers we define + if (Op.isDef()) + DefinedRegs[j] = LastIssued; + + // and which one we are using + if (Op.isUse()) + UsedRegs[j] = LastIssued; + } + } +} + +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Required) { + + // End of program? 
No need to wait on anything + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + return false; + + // Figure out if the async instructions execute in order + bool Ordered[3]; + + // VM_CNT is always ordered + Ordered[0] = true; + + // EXP_CNT is unordered if we have both EXP & VM-writes + Ordered[1] = ExpInstrTypesSeen == 3; + + // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS + Ordered[2] = false; + + // The values we are going to put into the S_WAITCNT instruction + Counters Counts = WaitCounts; + + // Do we really need to wait? + bool NeedWait = false; + + for (unsigned i = 0; i < 3; ++i) { + + if (Required.Array[i] <= WaitedOn.Array[i]) + continue; + + NeedWait = true; + + if (Ordered[i]) { + unsigned Value = LastIssued.Array[i] - Required.Array[i]; + + // Adjust the value to the real hardware possibilities. + Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); + + } else + Counts.Array[i] = 0; + + // Remember on what we have waited on. + WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + } + + if (!NeedWait) + return false; + + // Reset EXP_CNT instruction types + if (Counts.Named.EXP == 0) + ExpInstrTypesSeen = 0; + + // Build the wait instruction + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm((Counts.Named.VM & 0xF) | + ((Counts.Named.EXP & 0x7) << 4) | + ((Counts.Named.LGKM & 0x7) << 8)); + + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + return true; +} + +/// \brief helper function for handleOperands +static void increaseCounters(Counters &Dst, const Counters &Src) { + + for (unsigned i = 0; i < 3; ++i) + Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); +} + +Counters SIInsertWaits::handleOperands(MachineInstr &MI) { + + Counters Result = ZeroCounts; + + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (MI.getOpcode() == AMDGPU::S_SENDMSG) + return LastIssued; + + // For each register affected by this + // instruction increase the result sequence + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + + MachineOperand &Op = MI.getOperand(i); + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + if (Op.isDef()) { + increaseCounters(Result, UsedRegs[j]); + increaseCounters(Result, DefinedRegs[j]); + } + + if (Op.isUse()) + increaseCounters(Result, DefinedRegs[j]); + } + } + + return Result; +} + +void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() < + AMDGPUSubtarget::VOLCANIC_ISLANDS) + return; + + // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. + if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + LastInstWritesM0 = false; + return; + } + + // Set whether this instruction sets M0 + LastInstWritesM0 = false; + + unsigned NumOperands = I->getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + const MachineOperand &Op = I->getOperand(i); + + if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) + LastInstWritesM0 = true; + } +} + +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. 
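The immediate that insertWait() hands to S_WAITCNT above packs all three counters into a single operand: vmcnt in bits [3:0], expcnt in bits [6:4] and lgkmcnt in bits [10:8]. A minimal sketch of that packing, with encodeWaitcnt as a hypothetical helper name (not part of this change):

  // Mirrors the .addImm(...) expression built in SIInsertWaits::insertWait().
  static unsigned encodeWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
    return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0x7) << 8);
  }

  // encodeWaitcnt(0, 7, 7) == 0x770: wait until every outstanding VMEM
  // operation has completed, while leaving EXP_CNT and LGKM_CNT
  // unconstrained (7 is the hardware maximum for those counters, see
  // WaitCounts above).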
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { + bool Changes = false; + + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + + MRI = &MF.getRegInfo(); + + WaitedOn = ZeroCounts; + LastIssued = ZeroCounts; + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + + memset(&UsedRegs, 0, sizeof(UsedRegs)); + memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + + // Wait for everything before a barrier. + if (I->getOpcode() == AMDGPU::S_BARRIER) + Changes |= insertWait(MBB, I, LastIssued); + else + Changes |= insertWait(MBB, I, handleOperands(*I)); + + pushInstruction(MBB, I); + handleSendMsg(MBB, I); + } + + // Wait for everything at the end of the MBB + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + } + + return Changes; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td new file mode 100644 index 0000000..211666a --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -0,0 +1,673 @@ +//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// SI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : + AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { + + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; + + field bits<1> SALU = 0; + field bits<1> VALU = 0; + + field bits<1> SOP1 = 0; + field bits<1> SOP2 = 0; + field bits<1> SOPC = 0; + field bits<1> SOPK = 0; + field bits<1> SOPP = 0; + + field bits<1> VOP1 = 0; + field bits<1> VOP2 = 0; + field bits<1> VOP3 = 0; + field bits<1> VOPC = 0; + + field bits<1> MUBUF = 0; + field bits<1> MTBUF = 0; + field bits<1> SMRD = 0; + field bits<1> DS = 0; + field bits<1> MIMG = 0; + field bits<1> FLAT = 0; + field bits<1> WQM = 0; + field bits<1> VGPRSpill = 0; + + // These need to be kept in sync with the enum in SIInstrFlags. + let TSFlags{0} = VM_CNT; + let TSFlags{1} = EXP_CNT; + let TSFlags{2} = LGKM_CNT; + + let TSFlags{3} = SALU; + let TSFlags{4} = VALU; + + let TSFlags{5} = SOP1; + let TSFlags{6} = SOP2; + let TSFlags{7} = SOPC; + let TSFlags{8} = SOPK; + let TSFlags{9} = SOPP; + + let TSFlags{10} = VOP1; + let TSFlags{11} = VOP2; + let TSFlags{12} = VOP3; + let TSFlags{13} = VOPC; + + let TSFlags{14} = MUBUF; + let TSFlags{15} = MTBUF; + let TSFlags{16} = SMRD; + let TSFlags{17} = DS; + let TSFlags{18} = MIMG; + let TSFlags{19} = FLAT; + let TSFlags{20} = WQM; + let TSFlags{21} = VGPRSpill; + + // Most instructions require adjustments after selection to satisfy + // operand requirements. 
+ let hasPostISelHook = 1; + let SchedRW = [Write32Bit]; +} + +class Enc32 { + field bits<32> Inst; + int Size = 4; +} + +class Enc64 { + field bits<64> Inst; + int Size = 8; +} + +class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; +def VOPDstVCC : VOPDstOperand <VCCReg>; + +let Uses = [EXEC] in { + +class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VALU = 1; +} + +class VOPCCommon <dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + + let DisableEncoding = "$dst"; + let VOPC = 1; + let Size = 4; +} + +class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + + let VOP1 = 1; + let Size = 4; +} + +class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + + let VOP2 = 1; + let Size = 4; +} + +class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always prefered, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). + let AddedComplexity = -1000; + + let VOP3 = 1; + let VALU = 1; + + let AsmMatchConverter = "cvtVOP3"; + let isCodeGenOnly = 0; + + int Size = 8; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Scalar operations +//===----------------------------------------------------------------------===// + +class SOP1e <bits<8> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = op; + let Inst{22-16} = sdst; + let Inst{31-23} = 0x17d; //encoding; +} + +class SOP2e <bits<7> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = sdst; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding +} + +class SOPCe <bits<7> op> : Enc32 { + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; +} + +class SOPKe <bits<5> op> : Enc32 { + bits <7> sdst; + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding +} + +class SOPK64e <bits<5> op> : Enc64 { + bits <7> sdst = 0; + bits <16> simm16; + bits <32> imm; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; + + let Inst{63-32} = imm; +} + +class SOPPe <bits<7> op> : Enc32 { + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding +} + +class SMRDe <bits<5> op, bits<1> imm> : Enc32 { + bits<7> sdst; + bits<7> sbase; + bits<8> offset; + + let Inst{7-0} = offset; + let Inst{8} = imm; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding +} + +let SchedRW = [WriteSALU] in { +class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP1 = 1; +} + +class SOP2 <dag outs, dag ins, 
string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP2 = 1; + + let UseNamedOperandTable = 1; +} + +class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, SOPCe <op> { + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPC = 1; + let isCodeGenOnly = 0; + + let UseNamedOperandTable = 1; +} + +class SOPK <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins , asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPK = 1; + + let UseNamedOperandTable = 1; +} + +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPPe <op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPP = 1; + + let UseNamedOperandTable = 1; +} + +} // let SchedRW = [WriteSALU] + +class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + + let LGKM_CNT = 1; + let SMRD = 1; + let mayStore = 0; + let mayLoad = 1; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; +} + +//===----------------------------------------------------------------------===// +// Vector ALU operations +//===----------------------------------------------------------------------===// + +class VOP1e <bits<8> op> : Enc32 { + bits<8> vdst; + bits<9> src0; + + let Inst{8-0} = src0; + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; //encoding +} + +class VOP2e <bits<6> op> : Enc32 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; + + let Inst{8-0} = src0; + let Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +class VOP2_MADKe <bits<6> op> : Enc64 { + + bits<8> vdst; + bits<9> src0; + bits<8> vsrc1; + bits<32> src2; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63-32} = src2; +} + +class VOP3e <bits<9> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{11} = clamp; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be <bits<9> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOPCe <bits<8> op> : Enc32 { + bits<9> src0; + bits<8> vsrc1; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let 
Inst{24-17} = op; + let Inst{31-25} = 0x3e; +} + +class VINTRPe <bits<2> op> : Enc32 { + bits<8> vdst; + bits<8> vsrc; + bits<2> attrchan; + bits<6> attr; + + let Inst{7-0} = vsrc; + let Inst{9-8} = attrchan; + let Inst{15-10} = attr; + let Inst{17-16} = op; + let Inst{25-18} = vdst; + let Inst{31-26} = 0x32; // encoding +} + +class DSe <bits<8> op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{17} = gds; + let Inst{25-18} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe <bits<7> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{16} = lds; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe <bits<3> op> : Enc64 { + bits<8> vdata; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{18-16} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MIMGe <bits<7> op> : Enc64 { + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<1> glc; + bits<1> da; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<1> slc; + bits<8> vaddr; + bits<7> srsrc; + bits<7> ssamp; + + let Inst{11-8} = dmask; + let Inst{12} = unorm; + let Inst{13} = glc; + let Inst{14} = da; + let Inst{15} = r128; + let Inst{16} = tfe; + let Inst{17} = lwe; + let Inst{24-18} = op; + let Inst{25} = slc; + let Inst{31-26} = 0x3c; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{57-53} = ssamp{6-2}; +} + +class FLATe<bits<7> op> : Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. + let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. 
+ let Inst{55} = tfe; + let Inst{63-56} = vdst; +} + +class EXPe : Enc64 { + bits<4> en; + bits<6> tgt; + bits<1> compr; + bits<1> done; + bits<1> vm; + bits<8> vsrc0; + bits<8> vsrc1; + bits<8> vsrc2; + bits<8> vsrc3; + + let Inst{3-0} = en; + let Inst{9-4} = tgt; + let Inst{10} = compr; + let Inst{11} = done; + let Inst{12} = vm; + let Inst{31-26} = 0x3e; + let Inst{39-32} = vsrc0; + let Inst{47-40} = vsrc1; + let Inst{55-48} = vsrc2; + let Inst{63-56} = vsrc3; +} + +let Uses = [EXEC] in { + +class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + VOP1Common <outs, ins, asm, pattern>, + VOP1e<op> { + let isCodeGenOnly = 0; +} + +class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : + VOP2Common <outs, ins, asm, pattern>, VOP2e<op> { + let isCodeGenOnly = 0; +} + +class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : + VOPCCommon <ins, asm, pattern>, VOPCe <op>; + +class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Vector I/O operations +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { + +class DS <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let LGKM_CNT = 1; + let DS = 1; + let UseNamedOperandTable = 1; + let Uses = [M0]; + + // Most instruction load and store data, so set this as the default. + let mayLoad = 1; + let mayStore = 1; + + let hasSideEffects = 0; + let AsmMatchConverter = "cvtDS"; + let SchedRW = [WriteLDS]; +} + +class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MUBUF = 1; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let AsmMatchConverter = "cvtMubuf"; + let SchedRW = [WriteVMEM]; +} + +class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MTBUF = 1; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; +} + +class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, FLATe <op> { + let FLAT = 1; + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let LGKM_CNT = 1; + + let Uses = [EXEC, FLAT_SCR]; // M0 + + let UseNamedOperandTable = 1; + let hasSideEffects = 0; + let AsmMatchConverter = "cvtFlat"; + let SchedRW = [WriteVMEM]; +} + +class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern>, MIMGe <op> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; + + let hasSideEffects = 0; // XXX ???? +} + + +} // End Uses = [EXEC] diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp new file mode 100644 index 0000000..47bc178 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -0,0 +1,2723 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + + +#include "SIInstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +//===----------------------------------------------------------------------===// +// TargetInstrInfo callbacks +//===----------------------------------------------------------------------===// + +static unsigned getNumOperandsNoGlue(SDNode *Node) { + unsigned N = Node->getNumOperands(); + while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) + --N; + return N; +} + +static SDValue findChainOperand(SDNode *Load) { + SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); + assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); + return LastOp; +} + +/// \brief Returns true if both nodes have the same value for the given +/// operand \p Op, or if both nodes do not have this operand. +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { + unsigned Opc0 = N0->getMachineOpcode(); + unsigned Opc1 = N1->getMachineOpcode(); + + int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); + int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); + + if (Op0Idx == -1 && Op1Idx == -1) + return true; + + + if ((Op0Idx == -1 && Op1Idx != -1) || + (Op1Idx == -1 && Op0Idx != -1)) + return false; + + // getNamedOperandIdx returns the index for the MachineInstr's operands, + // which includes the result as the first operand. We are indexing into the + // MachineSDNode's operands, so we need to skip the result operand to get + // the real index. + --Op0Idx; + --Op1Idx; + + return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); +} + +bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + // TODO: The generic check fails for VALU instructions that should be + // rematerializable due to implicit reads of exec. We really want all of the + // generic logic for this except for this. + switch (MI->getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, + int64_t &Offset0, + int64_t &Offset1) const { + if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) + return false; + + unsigned Opc0 = Load0->getMachineOpcode(); + unsigned Opc1 = Load1->getMachineOpcode(); + + // Make sure both are actually loads. + if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) + return false; + + if (isDS(Opc0) && isDS(Opc1)) { + + // FIXME: Handle this case: + if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) + return false; + + // Check base reg. + if (Load0->getOperand(1) != Load1->getOperand(1)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + // Skip read2 / write2 variants for simplicity. 
+ // TODO: We should report true if the used offsets are adjacent (excluded + // st64 versions). + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); + return true; + } + + if (isSMRD(Opc0) && isSMRD(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // Check base reg. + if (Load0->getOperand(0) != Load1->getOperand(0)) + return false; + + const ConstantSDNode *Load0Offset = + dyn_cast<ConstantSDNode>(Load0->getOperand(1)); + const ConstantSDNode *Load1Offset = + dyn_cast<ConstantSDNode>(Load1->getOperand(1)); + + if (!Load0Offset || !Load1Offset) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = Load0Offset->getZExtValue(); + Offset1 = Load1Offset->getZExtValue(); + return true; + } + + // MUBUF and MTBUF can access the same addresses. + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { + + // MUBUF and MTBUF have vaddr at different indices. + if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || + findChainOperand(Load0) != findChainOperand(Load1) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) + return false; + + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); + + if (OffIdx0 == -1 || OffIdx1 == -1) + return false; + + // getNamedOperandIdx returns the index for MachineInstrs. Since they + // inlcude the output in the operand list, but SDNodes don't, we need to + // subtract the index by one. + --OffIdx0; + --OffIdx1; + + SDValue Off0 = Load0->getOperand(OffIdx0); + SDValue Off1 = Load1->getOperand(OffIdx1); + + // The offset might be a FrameIndexSDNode. + if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) + return false; + + Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); + return true; + } + + return false; +} + +static bool isStride64(unsigned Opc) { + switch (Opc) { + case AMDGPU::DS_READ2ST64_B32: + case AMDGPU::DS_READ2ST64_B64: + case AMDGPU::DS_WRITE2ST64_B32: + case AMDGPU::DS_WRITE2ST64_B64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + unsigned Opc = LdSt->getOpcode(); + if (isDS(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (OffsetImm) { + // Normal, single offset LDS instruction. + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. We + // will use this for some partially aligned loads. 
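A worked example of the element-to-byte conversion performed below (values chosen for illustration, not from the patch):

  // DS_READ2_B32 with offset0 = 4 and offset1 = 5 defines a 64-bit register
  // pair, so EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2 = 8 / 2 = 4,
  // and the pair is reported as a single access at Offset = 4 * 4 = 16 bytes
  // from the base (addr) register.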
+ const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset1); + + uint8_t Offset0 = Offset0Imm->getImm(); + uint8_t Offset1 = Offset1Imm->getImm(); + assert(Offset1 > Offset0); + + if (Offset1 - Offset0 == 1) { + // Each of these offsets is in element sized units, so we need to convert + // to bytes of the individual reads. + + unsigned EltSize; + if (LdSt->mayLoad()) + EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + else { + assert(LdSt->mayStore()); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + } + + if (isStride64(Opc)) + EltSize *= 64; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = EltSize * Offset0; + return true; + } + + return false; + } + + if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + return false; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::vaddr); + if (!AddrReg) + return false; + + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + if (isSMRD(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (!OffsetImm) + return false; + + const MachineOperand *SBaseReg = getNamedOperand(*LdSt, + AMDGPU::OpName::sbase); + BaseReg = SBaseReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + return false; +} + +bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + unsigned Opc0 = FirstLdSt->getOpcode(); + unsigned Opc1 = SecondLdSt->getOpcode(); + + // TODO: This needs finer tuning + if (NumLoads > 4) + return false; + + if (isDS(Opc0) && isDS(Opc1)) + return true; + + if (isSMRD(Opc0) && isSMRD(Opc1)) + return true; + + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + return true; + + return false; +} + +void +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + + // If we are trying to copy to or from SCC, there is a bug somewhere else in + // the backend. While it may be theoretically possible to do this, it should + // never be necessary. 
+ assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); + + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + }; + + static const int16_t Sub0_7[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + }; + + static const int16_t Sub0_3[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + }; + + static const int16_t Sub0_2[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + }; + + static const int16_t Sub0_1[] = { + AMDGPU::sub0, AMDGPU::sub1, 0 + }; + + unsigned Opcode; + const int16_t *SubIndices; + + if (AMDGPU::SReg_32RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (DestReg == AMDGPU::VCC) { + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_3; + + } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_7; + + } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_15; + + } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || + AMDGPU::SReg_64RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_1; + + } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_2; + + } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || + AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_3; + + } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || + AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_7; + + } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || + AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_15; + + } else { + llvm_unreachable("Can't copy 
register!"); + } + + while (unsigned SubIdx = *SubIndices++) { + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, SubIdx)); + + Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + + if (*SubIndices) + Builder.addReg(DestReg, RegState::Define | RegState::Implicit); + } +} + +unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + + int NewOpc; + + // Try to map original to commuted opcode + NewOpc = AMDGPU::getCommuteRev(Opcode); + // Check if the commuted (REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) + return NewOpc; + + // Try to map commuted to original opcode + NewOpc = AMDGPU::getCommuteOrig(Opcode); + // Check if the original (non-REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) + return NewOpc; + + return Opcode; +} + +unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + + if (DstRC->getSize() == 4) { + return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + return AMDGPU::S_MOV_B64; + } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + return AMDGPU::V_MOV_B64_PSEUDO; + } + return AMDGPU::COPY; +} + +void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; + + if (RI.isSGPRClass(RC)) { + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for spilling + // SGPRs. + switch (RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; + } + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { + MFI->setHasSpilledVGPRs(); + + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; + } + } + + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. 
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" + " spill register"); + BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) + .addReg(SrcReg); + } +} + +void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; + + if (RI.isSGPRClass(RC)){ + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + } + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; + } + } + + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. 
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + + } else { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" + " restore register"); + BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + } +} + +/// \param @Offset Offset in bytes of the FrameIndex being spilled +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, + unsigned Size) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WavefrontSize = ST.getWavefrontSize(); + + unsigned TIDReg = MFI->getTIDReg(); + if (!MFI->hasCalculatedTID()) { + MachineBasicBlock &Entry = MBB.getParent()->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + DebugLoc DL = Insert->getDebugLoc(); + + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + if (TIDReg == AMDGPU::NoRegister) + return TIDReg; + + + if (MFI->getShaderType() == ShaderType::COMPUTE && + WorkGroupSize > WavefrontSize) { + + unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); + unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); + unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned InputPtrReg = + TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { + if (!Entry.isLiveIn(Reg)) + Entry.addLiveIn(Reg); + } + + RS->enterBasicBlock(&Entry); + unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Z); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Y); + + // NGROUPS.X * NGROUPS.Y + BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) + .addReg(STmp1) + .addReg(STmp0); + // (NGROUPS.X * NGROUPS.Y) * TIDIG.X + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) + .addReg(STmp1) + .addReg(TIDIGXReg); + // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) + .addReg(STmp0) + .addReg(TIDIGYReg) + .addReg(TIDReg); + // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z + BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); + } else { + // Get the wave id + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + TIDReg) + .addImm(-1) + .addImm(0); + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), + TIDReg) + .addImm(-1) + .addReg(TIDReg); + } + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), + TIDReg) + .addImm(2) + .addReg(TIDReg); + MFI->setTIDReg(TIDReg); + } + + // Add FrameIndex to LDS offset + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + 
.addReg(TIDReg); + + return TmpReg; +} + +void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + case AMDGPU::SI_CONSTDATA_PTR: { + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + + BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); + + // Add 32-bit offset from this instruction to the start of the constant data. + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addTargetIndex(AMDGPU::TI_CONSTDATA_START) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) + .addReg(AMDGPU::SCC, RegState::Implicit); + MI->eraseFromParent(); + break; + } + case AMDGPU::SGPR_USE: + // This is just a placeholder for register allocation. + MI->eraseFromParent(); + break; + + case AMDGPU::V_MOV_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + const MachineOperand &SrcOp = MI->getOperand(1); + // FIXME: Will this work for 64-bit floating point immediates? + assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + } else { + assert(SrcOp.isReg()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit); + } + MI->eraseFromParent(); + break; + } + + case AMDGPU::V_CNDMASK_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + unsigned Src0 = MI->getOperand(1).getReg(); + unsigned Src1 = MI->getOperand(2).getReg(); + const MachineOperand &SrcCond = MI->getOperand(3); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addOperand(SrcCond); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addOperand(SrcCond); + MI->eraseFromParent(); + break; + } + } + return true; +} + +MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI) const { + + if (MI->getNumOperands() < 3) + return nullptr; + + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + assert(Src0Idx != -1 && "Should always have src0 operand"); + + MachineOperand &Src0 = MI->getOperand(Src0Idx); + if 
(!Src0.isReg()) + return nullptr; + + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + if (Src1Idx == -1) + return nullptr; + + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // Make sure it's legal to commute operands for VOP2. + if (isVOP2(MI->getOpcode()) && + (!isOperandLegal(MI, Src0Idx, &Src1) || + !isOperandLegal(MI, Src1Idx, &Src0))) { + return nullptr; + } + + if (!Src1.isReg()) { + // Allow commuting instructions with Imm operands. + if (NewMI || !Src1.isImm() || + (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + return nullptr; + } + + // Be sure to copy the source modifiers to the right place. + if (MachineOperand *Src0Mods + = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods + = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); + + int Src0ModsVal = Src0Mods->getImm(); + if (!Src1Mods && Src0ModsVal != 0) + return nullptr; + + // XXX - This assert might be a lie. It might be useful to have a neg + // modifier with 0.0. + int Src1ModsVal = Src1Mods->getImm(); + assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + } + + unsigned Reg = Src0.getReg(); + unsigned SubReg = Src0.getSubReg(); + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else + llvm_unreachable("Should only have immediates"); + + Src1.ChangeToRegister(Reg, false); + Src1.setSubReg(SubReg); + } else { + MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + } + + if (MI) + MI->setDesc(get(commuteOpcode(*MI))); + + return MI; +} + +// This needs to be implemented because the source modifiers may be inserted +// between the true commutable operands, and the base +// TargetInstrInfo::commuteInstruction uses it. +bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isCommutable()) + return false; + + unsigned Opc = MI->getOpcode(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + + // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on + // immediate. + if (!MI->getOperand(Src0Idx).isReg()) + return false; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return false; + + if (!MI->getOperand(Src1Idx).isReg()) + return false; + + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. 
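+  //
+  // An illustrative case (operands are hypothetical, not from this patch):
+  // "v_add_f32 v0, -v1, v2" keeps the negation in src0_modifiers, so a
+  // commute that swapped only the registers would compute v1 + (-v2)
+  // instead of (-v1) + v2. Since the generic helper moves registers but not
+  // the modifier immediates, we refuse to commute when any are set.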
+ if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + + SrcOpIdx1 = Src0Idx; + SrcOpIdx2 = Src1Idx; + return true; +} + +MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, + unsigned SrcReg) const { + return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), + DstReg) .addReg(SrcReg); +} + +bool SIInstrInfo::isMov(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + } +} + +bool +SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + return RC != &AMDGPU::EXECRegRegClass; +} + +static void removeModOperands(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src0_modifiers); + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src1_modifiers); + int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src2_modifiers); + + MI.RemoveOperand(Src2ModIdx); + MI.RemoveOperand(Src1ModIdx); + MI.RemoveOperand(Src0ModIdx); +} + +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned Opc = UseMI->getOpcode(); + if (Opc == AMDGPU::V_MAD_F32) { + // Don't fold if we are using source modifiers. The new VOP2 instructions + // don't have them. + if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { + return false; + } + + MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); + MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); + MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); + + // Multiplied part is the constant: Use v_madmk_f32 + // We should only expect these to be on src0 due to canonicalizations. + if (Src0->isReg() && Src0->getReg() == Reg) { + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + if (!Src2->isReg() || + (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + return false; + + // We need to do some weird looking operand shuffling since the madmk + // operands are out of the normal expected order with the multiplied + // constant as the last operand. + // + // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 + // src0 -> src2 K + // src1 -> src0 + // src2 -> src1 + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. 
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + unsigned Src1Reg = Src1->getReg(); + unsigned Src1SubReg = Src1->getSubReg(); + unsigned Src2Reg = Src2->getReg(); + unsigned Src2SubReg = Src2->getSubReg(); + Src0->setReg(Src1Reg); + Src0->setSubReg(Src1SubReg); + Src0->setIsKill(Src1->isKill()); + + Src1->setReg(Src2Reg); + Src1->setSubReg(Src2SubReg); + Src1->setIsKill(Src2->isKill()); + + Src2->ChangeToImmediate(Imm); + + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + + // Added part is the constant: Use v_madak_f32 + if (Src2->isReg() && Src2->getReg() == Reg) { + // Not allowed to use constant bus for another operand. + // We can however allow an inline immediate as src0. + if (!Src0->isImm() && + (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + return false; + + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + Src2->ChangeToImmediate(Imm); + + // These come before src2. + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + } + + return false; +} + +bool +SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + return MI->getOperand(1).isImm(); + } +} + +static bool offsetsDoNotOverlap(int WidthA, int OffsetA, + int WidthB, int OffsetB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB;
+  return LowOffset + LowWidth <= HighOffset;
+}
+
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
+                                               MachineInstr *MIb) const {
+  unsigned BaseReg0, Offset0;
+  unsigned BaseReg1, Offset1;
+
+  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
+      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
+    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
+           "read2 / write2 not expected here yet");
+    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
+    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
+    if (BaseReg0 == BaseReg1 &&
+        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+                                                  MachineInstr *MIb,
+                                                  AliasAnalysis *AA) const {
+  unsigned Opc0 = MIa->getOpcode();
+  unsigned Opc1 = MIb->getOpcode();
+
+  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+         "MIa must load from or modify a memory location");
+  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+         "MIb must load from or modify a memory location");
+
+  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
+    return false;
+
+  // XXX - Can we relax this between address spaces?
+  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+    return false;
+
+  // TODO: Should we check the address space from the MachineMemOperand? That
+  // would allow us to distinguish objects we know don't alias based on the
+  // underlying address space, even if it was lowered to a different one,
+  // e.g. private accesses lowered to use MUBUF instructions on a scratch
+  // buffer.
+  if (isDS(Opc0)) {
+    if (isDS(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1);
+  }
+
+  if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
+    if (isMUBUF(Opc1) || isMTBUF(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1) && !isSMRD(Opc1);
+  }
+
+  if (isSMRD(Opc0)) {
+    if (isSMRD(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0);
+  }
+
+  if (isFLAT(Opc0)) {
+    if (isFLAT(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return false;
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
+  int64_t SVal = Imm.getSExtValue();
+  if (SVal >= -16 && SVal <= 64)
+    return true;
+
+  if (Imm.getBitWidth() == 64) {
+    uint64_t Val = Imm.getZExtValue();
+    return (DoubleToBits(0.0) == Val) ||
+           (DoubleToBits(1.0) == Val) ||
+           (DoubleToBits(-1.0) == Val) ||
+           (DoubleToBits(0.5) == Val) ||
+           (DoubleToBits(-0.5) == Val) ||
+           (DoubleToBits(2.0) == Val) ||
+           (DoubleToBits(-2.0) == Val) ||
+           (DoubleToBits(4.0) == Val) ||
+           (DoubleToBits(-4.0) == Val);
+  }
+
+  // The actual type of the operand does not seem to matter as long
+  // as the bits match one of the inline immediate values. For example:
+  //
+  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
+  // so it is a legal inline immediate.
+  //
+  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
+  // floating-point, so it is a legal inline immediate.
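+  //
+  // A worked instance of the check below (the bit patterns are facts of
+  // IEEE-754, not taken from this patch): 0x3F800000 equals
+  // FloatToBits(1.0f) and is accepted, while 0x40490FDB (roughly 3.14159f)
+  // matches none of the nine patterns and is outside [-16, 64], so it would
+  // have to be emitted as a literal constant instead.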
+ uint32_t Val = Imm.getZExtValue(); + + return (FloatToBits(0.0f) == Val) || + (FloatToBits(1.0f) == Val) || + (FloatToBits(-1.0f) == Val) || + (FloatToBits(0.5f) == Val) || + (FloatToBits(-0.5f) == Val) || + (FloatToBits(2.0f) == Val) || + (FloatToBits(-2.0f) == Val) || + (FloatToBits(4.0f) == Val) || + (FloatToBits(-4.0f) == Val); +} + +bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, + unsigned OpSize) const { + if (MO.isImm()) { + // MachineOperand provides no way to tell the true operand size, since it + // only records a 64-bit value. We need to know the size to determine if a + // 32-bit floating point immediate bit pattern is legal for an integer + // immediate. It would be for any 32-bit integer operand, but would not be + // for a 64-bit one. + + unsigned BitSize = 8 * OpSize; + return isInlineConstant(APInt(BitSize, MO.getImm(), true)); + } + + return false; +} + +bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, + unsigned OpSize) const { + return MO.isImm() && !isInlineConstant(MO, OpSize); +} + +static bool compareMachineOp(const MachineOperand &Op0, + const MachineOperand &Op1) { + if (Op0.getType() != Op1.getType()) + return false; + + switch (Op0.getType()) { + case MachineOperand::MO_Register: + return Op0.getReg() == Op1.getReg(); + case MachineOperand::MO_Immediate: + return Op0.getImm() == Op1.getImm(); + default: + llvm_unreachable("Didn't expect to be comparing these operand types"); + } +} + +bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; + + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + + if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) + return true; + + if (OpInfo.RegClass < 0) + return false; + + unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); + if (isLiteralConstant(MO, OpSize)) + return RI.opCanUseLiteralConstant(OpInfo.OperandType); + + return RI.opCanUseInlineConstant(OpInfo.OperandType); +} + +bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { + int Op32 = AMDGPU::getVOPe32(Opcode); + if (Op32 == -1) + return false; + + return pseudoToMCOpcode(Op32) != -1; +} + +bool SIInstrInfo::hasModifiers(unsigned Opcode) const { + // The src0_modifier operand is present on all instructions + // that have modifiers. + + return AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::src0_modifiers) != -1; +} + +bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const { + const MachineOperand *Mods = getNamedOperand(MI, OpName); + return Mods && Mods->getImm(); +} + +bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO, + unsigned OpSize) const { + // Literal constants use the constant bus. + if (isLiteralConstant(MO, OpSize)) + return true; + + if (!MO.isReg() || !MO.isUse()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); + + // FLAT_SCR is just an SGPR pair. + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) + return true; + + // EXEC register uses the constant bus. 
+ if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) + return true; + + // SGPRs use the constant bus + if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { + return true; + } + + return false; +} + +bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const { + uint16_t Opcode = MI->getOpcode(); + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + // Make sure the number of operands is correct. + const MCInstrDesc &Desc = get(Opcode); + if (!Desc.isVariadic() && + Desc.getNumOperands() != MI->getNumExplicitOperands()) { + ErrInfo = "Instruction has wrong number of operands."; + return false; + } + + // Make sure the register classes are correct + for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isFPImm()) { + ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " + "all fp values to integers."; + return false; + } + + int RegClass = Desc.OpInfo[i].RegClass; + + switch (Desc.OpInfo[i].OperandType) { + case MCOI::OPERAND_REGISTER: + if (MI->getOperand(i).isImm()) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case AMDGPU::OPERAND_REG_IMM32: + break; + case AMDGPU::OPERAND_REG_INLINE_C: + if (isLiteralConstant(MI->getOperand(i), + RI.getRegClass(RegClass)->getSize())) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case MCOI::OPERAND_IMMEDIATE: + // Check if this operand is an immediate. + // FrameIndex operands will be replaced by immediates, so they are + // allowed. + if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { + ErrInfo = "Expected immediate, but got non-immediate"; + return false; + } + // Fall-through + default: + continue; + } + + if (!MI->getOperand(i).isReg()) + continue; + + if (RegClass != -1) { + unsigned Reg = MI->getOperand(i).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + const TargetRegisterClass *RC = RI.getRegClass(RegClass); + if (!RC->contains(Reg)) { + ErrInfo = "Operand has incorrect register class."; + return false; + } + } + } + + + // Verify VOP* + if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + unsigned ConstantBusCount = 0; + unsigned SGPRUsed = AMDGPU::NoRegister; + for (int OpIdx : OpIndices) { + if (OpIdx == -1) + break; + const MachineOperand &MO = MI->getOperand(OpIdx); + if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { + if (MO.isReg()) { + if (MO.getReg() != SGPRUsed) + ++ConstantBusCount; + SGPRUsed = MO.getReg(); + } else { + ++ConstantBusCount; + } + } + } + if (ConstantBusCount > 1) { + ErrInfo = "VOP* instruction uses the constant bus more than once"; + return false; + } + } + + // Verify misc. restrictions on specific instructions. 
+ if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || + Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { + const MachineOperand &Src0 = MI->getOperand(Src0Idx); + const MachineOperand &Src1 = MI->getOperand(Src1Idx); + const MachineOperand &Src2 = MI->getOperand(Src2Idx); + if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { + if (!compareMachineOp(Src0, Src1) && + !compareMachineOp(Src0, Src2)) { + ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; + return false; + } + } + } + + return true; +} + +unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: return AMDGPU::INSTRUCTION_LIST_END; + case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; + case AMDGPU::COPY: return AMDGPU::COPY; + case AMDGPU::PHI: return AMDGPU::PHI; + case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; + case AMDGPU::S_MOV_B32: + return MI.getOperand(1).isReg() ? + AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; + case AMDGPU::S_SUB_I32: + case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; + case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; + case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; + case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; + case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; + case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; + case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; + case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; + case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; + case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; + case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; + case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; + case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; + case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; + case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; + case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; + case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; + case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; + case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; + case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; + } +} + +bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { + return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 
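+  // For example (illustrative): an "S_AND_B32 s0, s1, s2" maps to
+  // V_AND_B32_e32 in the table above and is therefore supported, while
+  // S_BFM_B64 falls through to INSTRUCTION_LIST_END and is not.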
+} + +const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, + unsigned OpNo) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MCInstrDesc &Desc = get(MI.getOpcode()); + if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || + Desc.OpInfo[OpNo].RegClass == -1) { + unsigned Reg = MI.getOperand(OpNo).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + return RI.getPhysRegClass(Reg); + } + + unsigned RCID = Desc.OpInfo[OpNo].RegClass; + return RI.getRegClass(RCID); +} + +bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: + return RI.hasVGPRs(getOpRegClass(MI, 0)); + default: + return RI.hasVGPRs(getOpRegClass(MI, OpNo)); + } +} + +void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &MO = MI->getOperand(OpIdx); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; + const TargetRegisterClass *RC = RI.getRegClass(RCID); + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (MO.isReg()) + Opcode = AMDGPU::COPY; + else if (RI.isSGPRClass(RC)) + Opcode = AMDGPU::S_MOV_B32; + + + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); + if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) + VRC = &AMDGPU::VReg_64RegClass; + else + VRC = &AMDGPU::VGPR_32RegClass; + + unsigned Reg = MRI.createVirtualRegister(VRC); + DebugLoc DL = MBB->findDebugLoc(I); + BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) + .addOperand(MO); + MO.ChangeToRegister(Reg, false); +} + +unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) + const { + assert(SuperReg.isReg()); + + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + unsigned SubReg = MRI.createVirtualRegister(SubRC); + + // Just in case the super register is itself a sub-register, copy it to a new + // value so we don't need to worry about merging its subreg index with the + // SubIdx passed to this function. The register coalescer should be able to + // eliminate this extra copy. + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) + .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(NewSuperReg, 0, SubIdx); + + return SubReg; +} + +MachineOperand SIInstrInfo::buildExtractSubRegOrImm( + MachineBasicBlock::iterator MII, + MachineRegisterInfo &MRI, + MachineOperand &Op, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const { + if (Op.isImm()) { + // XXX - Is there a better way to do this? 
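+    // Worked example (value chosen for illustration): for
+    // Op.getImm() == 0x0000000100000002, sub0 selects the low dword (2) and
+    // sub1 the high dword (1), mirroring how REG_SEQUENCE reassembles
+    // 64-bit values elsewhere in this file.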
+ if (SubIdx == AMDGPU::sub0) + return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); + if (SubIdx == AMDGPU::sub1) + return MachineOperand::CreateImm(Op.getImm() >> 32); + + llvm_unreachable("Unhandled register index for immediate"); + } + + unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, + SubIdx, SubRC); + return MachineOperand::CreateReg(SubReg, false); +} + +unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, + MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineOperand &Op) const { + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned Dst = MRI.createVirtualRegister(RC); + + MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), + LoDst) + .addImm(Op.getImm() & 0xFFFFFFFF); + MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), + HiDst) + .addImm(Op.getImm() >> 32); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) + .addReg(LoDst) + .addImm(AMDGPU::sub0) + .addReg(HiDst) + .addImm(AMDGPU::sub1); + + Worklist.push_back(Lo); + Worklist.push_back(Hi); + + return Dst; +} + +// Change the order of operands from (0, 1, 2) to (0, 2, 1) +void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { + assert(Inst->getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst->getOperand(1); + Inst->RemoveOperand(1); + Inst->addOperand(Op1); +} + +bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO) const { + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const TargetRegisterClass *DefinedRC = + OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + if (!MO) + MO = &MI->getOperand(OpIdx); + + if (isVALU(InstDesc.Opcode) && + usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + unsigned SGPRUsed = + MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + const MachineOperand &Op = MI->getOperand(i); + if (Op.isReg() && Op.getReg() != SGPRUsed && + usesConstantBus(MRI, Op, getOpSize(*MI, i))) { + return false; + } + } + } + + if (MO->isReg()) { + assert(DefinedRC); + const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + } + + + // Handle non-register types that are treated like immediates. + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + + if (!DefinedRC) { + // This operand expects an immediate. 
+    return true;
+  }
+
+  return isImmOperandLegal(MI, OpIdx, *MO);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src2);
+
+  // Legalize VOP2
+  if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
+    // Legalize src0
+    if (!isOperandLegal(MI, Src0Idx))
+      legalizeOpWithMove(MI, Src0Idx);
+
+    // Legalize src1
+    if (isOperandLegal(MI, Src1Idx))
+      return;
+
+    // Usually src0 of VOP2 instructions allows more types of inputs
+    // than src1, so try to commute the instruction to decrease our
+    // chances of having to insert a MOV instruction to legalize src1.
+    if (MI->isCommutable()) {
+      if (commuteInstruction(MI))
+        // If we are successful in commuting, then we know MI is legal, so
+        // we are done.
+        return;
+    }
+
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  // XXX - Do any VOP3 instructions read VCC?
+  // Legalize VOP3
+  if (isVOP3(MI->getOpcode())) {
+    int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };
+
+    // Find the one SGPR operand we are allowed to use.
+    unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+
+    for (unsigned i = 0; i < 3; ++i) {
+      int Idx = VOP3Idx[i];
+      if (Idx == -1)
+        break;
+      MachineOperand &MO = MI->getOperand(Idx);
+
+      if (MO.isReg()) {
+        if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+          continue; // VGPRs are legal
+
+        assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
+
+        if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+          SGPRReg = MO.getReg();
+          // We can use one SGPR in each VOP3 instruction.
+          continue;
+        }
+      } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
+        // If it is not a register and not a literal constant, then it must be
+        // an inline constant which is always legal.
+        continue;
+      }
+      // If we make it this far, then the operand is not legal and we must
+      // legalize it.
+      legalizeOpWithMove(MI, Idx);
+    }
+  }
+
+  // Legalize REG_SEQUENCE and PHI
+  // The register class of the operands must match the register class of the
+  // output.
+  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
+      MI->getOpcode() == AMDGPU::PHI) {
+    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      const TargetRegisterClass *OpRC =
+          MRI.getRegClass(MI->getOperand(i).getReg());
+      if (RI.hasVGPRs(OpRC)) {
+        VRC = OpRC;
+      } else {
+        SRC = OpRC;
+      }
+    }
+
+    // If any of the operands are VGPR registers, then they all must be;
+    // otherwise we will create illegal VGPR->SGPR copies when legalizing
+    // them.
+    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+      if (!VRC) {
+        assert(SRC);
+        VRC = RI.getEquivalentVGPRClass(SRC);
+      }
+      RC = VRC;
+    } else {
+      RC = SRC;
+    }
+
+    // Update all the operands so they have the same type.
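+    // Sketch of the rewrite performed below (virtual register names are
+    // hypothetical):
+    //   %dst = REG_SEQUENCE %sgpr_a, sub0, %vgpr_b, sub1
+    // mixes register banks, so the SGPR input is first copied into a fresh
+    // VGPR:
+    //   %vgpr_c = COPY %sgpr_a
+    //   %dst = REG_SEQUENCE %vgpr_c, sub0, %vgpr_b, sub1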
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      unsigned DstReg = MRI.createVirtualRegister(RC);
+      MachineBasicBlock *InsertBB;
+      MachineBasicBlock::iterator Insert;
+      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+        InsertBB = MI->getParent();
+        Insert = MI;
+      } else {
+        // MI is a PHI instruction.
+        InsertBB = MI->getOperand(i + 1).getMBB();
+        Insert = InsertBB->getFirstTerminator();
+      }
+      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
+              get(AMDGPU::COPY), DstReg)
+              .addOperand(MI->getOperand(i));
+      MI->getOperand(i).setReg(DstReg);
+    }
+  }
+
+  // Legalize INSERT_SUBREG
+  // src0 must have the same register class as dst
+  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
+    unsigned Dst = MI->getOperand(0).getReg();
+    unsigned Src0 = MI->getOperand(1).getReg();
+    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
+    if (DstRC != Src0RC) {
+      MachineBasicBlock &MBB = *MI->getParent();
+      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
+              .addReg(Src0);
+      MI->getOperand(1).setReg(NewSrc0);
+    }
+    return;
+  }
+
+  // Legalize MUBUF* instructions
+  // FIXME: If we start using the non-addr64 instructions for compute, we
+  // may need to legalize them here.
+  int SRsrcIdx =
+      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
+  if (SRsrcIdx != -1) {
+    // We have a MUBUF instruction
+    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
+    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
+    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
+                             RI.getRegClass(SRsrcRC))) {
+      // The operands are legal.
+      // FIXME: We may need to legalize operands besides srsrc.
+      return;
+    }
+
+    MachineBasicBlock &MBB = *MI->getParent();
+    // Extract the ptr from the resource descriptor.
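+    // Shape of the replacement descriptor built below (field split per the
+    // SI ISA; the concrete constant comes from getDefaultRsrcDataFormat()):
+    //   NewSRsrc = { sub0_sub1 = 0 (base pointer, folded into vaddr),
+    //                sub2      = RSRC_DATA_FORMAT[31:0],
+    //                sub3      = RSRC_DATA_FORMAT[63:32] }
+    // while the old descriptor's pointer halves are moved into the ADDR64
+    // vaddr computed further down.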
+
+    // SRsrcPtrLo = srsrc:sub0
+    unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
+        &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
+
+    // SRsrcPtrHi = srsrc:sub1
+    unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
+        &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+
+    // Create an empty resource descriptor
+    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+
+    // Zero64 = 0
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
+            Zero64)
+            .addImm(0);
+
+    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+            SRsrcFormatLo)
+            .addImm(RsrcDataFormat & 0xFFFFFFFF);
+
+    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+            SRsrcFormatHi)
+            .addImm(RsrcDataFormat >> 32);
+
+    // NewSRsrc = {Zero64, SRsrcFormat}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+            NewSRsrc)
+            .addReg(Zero64)
+            .addImm(AMDGPU::sub0_sub1)
+            .addReg(SRsrcFormatLo)
+            .addImm(AMDGPU::sub2)
+            .addReg(SRsrcFormatHi)
+            .addImm(AMDGPU::sub3);
+
+    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+    unsigned NewVAddrLo;
+    unsigned NewVAddrHi;
+    if (VAddr) {
+      // This is already an ADDR64 instruction so we need to add the pointer
+      // extracted from the resource descriptor to the current value of VAddr.
+      NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+      // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
+              NewVAddrLo)
+              .addReg(SRsrcPtrLo)
+              .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
+              .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
+
+      // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
+              NewVAddrHi)
+              .addReg(SRsrcPtrHi)
+              .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
+              .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
+              .addReg(AMDGPU::VCC, RegState::Implicit);
+
+    } else {
+      // This instruction is the _OFFSET variant, so we need to convert it to
+      // ADDR64.
+      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
+      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
+      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
+
+      // Create the new instruction.
+      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
+      MachineInstr *Addr64 =
+          BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+                  .addOperand(*VData)
+                  .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                              // This will be replaced later
+                                              // with the new value of vaddr.
+ .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0); // tfe + + MI->removeFromParent(); + MI = Addr64; + + NewVAddrLo = SRsrcPtrLo; + NewVAddrHi = SRsrcPtrHi; + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + } + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); + + + // Update the instruction to use NewVaddr + VAddr->setReg(NewVAddr); + // Update the instruction to use NewSRsrc + SRsrc->setReg(NewSRsrc); + } +} + +void SIInstrInfo::splitSMRD(MachineInstr *MI, + const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const { + + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RegLo = MRI.createVirtualRegister(HalfRC); + unsigned RegHi = MRI.createVirtualRegister(HalfRC); + unsigned HalfSize = HalfRC->getSize(); + const MachineOperand *OffOp = + getNamedOperand(*MI, AMDGPU::OpName::offset); + const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + + // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes + // on VI. + + bool IsKill = SBase->isKill(); + if (OffOp) { + bool isVI = + MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS; + unsigned OffScale = isVI ? 1 : 4; + // Handle the _IMM variant + unsigned LoOffset = OffOp->getImm() * OffScale; + unsigned HiOffset = LoOffset + HalfSize; + Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) + // Use addReg instead of addOperand + // to make sure kill flag is cleared. + .addReg(SBase->getReg(), 0, SBase->getSubReg()) + .addImm(LoOffset / OffScale); + + if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { + unsigned OffsetSGPR = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) + .addImm(HiOffset); // The offset in register is in bytes. 
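+      // For scale (values hypothetical): splitting an S_LOAD_DWORDX8 with an
+      // immediate offset of 3 on SI gives OffScale = 4 and HalfSize = 16, so
+      // LoOffset = 12 bytes (encoded as 3) and HiOffset = 28 bytes (encoded
+      // as 7), which still fits the immediate field; this branch handles
+      // offsets that do not, by materializing HiOffset in an SGPR via the
+      // S_MOV above.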
+      Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
+                    .addReg(SBase->getReg(), getKillRegState(IsKill),
+                            SBase->getSubReg())
+                    .addReg(OffsetSGPR);
+    } else {
+      Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
+                    .addReg(SBase->getReg(), getKillRegState(IsKill),
+                            SBase->getSubReg())
+                    .addImm(HiOffset / OffScale);
+    }
+  } else {
+    // Handle the _SGPR variant
+    MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
+    Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
+                  .addReg(SBase->getReg(), 0, SBase->getSubReg())
+                  .addOperand(*SOff);
+    unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
+            .addOperand(*SOff)
+            .addImm(HalfSize);
+    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
+                  .addReg(SBase->getReg(), getKillRegState(IsKill),
+                          SBase->getSubReg())
+                  .addReg(OffsetSGPR);
+  }
+
+  unsigned SubLo, SubHi;
+  switch (HalfSize) {
+  case 4:
+    SubLo = AMDGPU::sub0;
+    SubHi = AMDGPU::sub1;
+    break;
+  case 8:
+    SubLo = AMDGPU::sub0_sub1;
+    SubHi = AMDGPU::sub2_sub3;
+    break;
+  case 16:
+    SubLo = AMDGPU::sub0_sub1_sub2_sub3;
+    SubHi = AMDGPU::sub4_sub5_sub6_sub7;
+    break;
+  case 32:
+    SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
+    SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
+    break;
+  default:
+    llvm_unreachable("Unhandled HalfSize");
+  }
+
+  BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
+          .addOperand(MI->getOperand(0))
+          .addReg(RegLo)
+          .addImm(SubLo)
+          .addReg(RegHi)
+          .addImm(SubHi);
+}
+
+void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  switch (MI->getOpcode()) {
+  case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_SGPR:
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
+    unsigned NewOpcode = getVALUOp(*MI);
+    unsigned RegOffset;
+    unsigned ImmOffset;
+
+    if (MI->getOperand(2).isReg()) {
+      RegOffset = MI->getOperand(2).getReg();
+      ImmOffset = 0;
+    } else {
+      assert(MI->getOperand(2).isImm());
+      // SMRD instructions take a dword offset on SI and a byte offset on VI,
+      // and MUBUF instructions always take a byte offset.
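+      // Worked example (illustrative): an S_LOAD_DWORD with immediate
+      // offset 16 on SI addresses byte 64, so ImmOffset below becomes
+      // 16 << 2 == 64; on VI the operand is already a byte offset and is
+      // used unchanged.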
+ ImmOffset = MI->getOperand(2).getImm(); + if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= + AMDGPUSubtarget::SEA_ISLANDS) + ImmOffset <<= 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(RsrcDataFormat >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(SRsrc); + } else { + MI->getOperand(2).ChangeToRegister(SRsrc, false); + } + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe + + const TargetRegisterClass *NewDstRC = + RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + break; + } + case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, + AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX16_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, + AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + } +} + +void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { + SmallVector<MachineInstr *, 128> Worklist; + Worklist.push_back(&TopInst); + + while (!Worklist.empty()) { + MachineInstr *Inst = Worklist.pop_back_val(); + MachineBasicBlock *MBB = Inst->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + + // Handle some special cases + switch (Opcode) { + default: + if (isSMRD(Inst->getOpcode())) { + moveSMRDToVALU(Inst, MRI); + } + break; + case AMDGPU::S_MOV_B64: { + DebugLoc DL = Inst->getDebugLoc(); + + // If the source operand is a register we can replace this with a + // copy. 
+      if (Inst->getOperand(1).isReg()) {
+        MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
+                 .addOperand(Inst->getOperand(0))
+                 .addOperand(Inst->getOperand(1));
+        Worklist.push_back(Copy);
+      } else {
+        // Otherwise, we need to split this into two movs, because there is
+        // no 64-bit VALU move instruction.
+        unsigned Reg = Inst->getOperand(0).getReg();
+        unsigned Dst = split64BitImm(Worklist,
+                                     Inst,
+                                     MRI,
+                                     MRI.getRegClass(Reg),
+                                     Inst->getOperand(1));
+        MRI.replaceRegWith(Reg, Dst);
+      }
+      Inst->eraseFromParent();
+      continue;
+    }
+    case AMDGPU::S_AND_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_OR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOT_B64:
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BCNT1_I32_B64:
+      splitScalar64BitBCNT(Worklist, Inst);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BFE_I64: {
+      splitScalar64BitBFE(Worklist, Inst);
+      Inst->eraseFromParent();
+      continue;
+    }
+
+    case AMDGPU::S_LSHL_B32:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_ASHR_I32:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHR_B32:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHL_B64:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHLREV_B64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_ASHR_I64:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_ASHRREV_I64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHR_B64:
+      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHRREV_B64;
+        swapOperands(Inst);
+      }
+      break;
+
+    case AMDGPU::S_BFE_U64:
+    case AMDGPU::S_BFM_B64:
+      llvm_unreachable("Moving this op to VALU not implemented");
+    }
+
+    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+      // We cannot move this instruction to the VALU, so we should try to
+      // legalize its operands instead.
+      legalizeOperands(Inst);
+      continue;
+    }
+
+    // Use the new VALU Opcode.
+    const MCInstrDesc &NewDesc = get(NewOpcode);
+    Inst->setDesc(NewDesc);
+
+    // Remove any references to SCC. Vector instructions can't read from it, and
+    // we're just about to add the implicit use / defs of VCC, and we don't want
+    // both.
+    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
+      MachineOperand &Op = Inst->getOperand(i);
+      if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
+        Inst->RemoveOperand(i);
+    }
+
+    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+      // We are converting these to a BFE, so we need to add the missing
+      // operands for the size and offset.
+      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ?
8 : 16; + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + Inst->addOperand(MachineOperand::CreateImm(0)); + } + + addDescImplicitUseDef(NewDesc, Inst); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + Inst->RemoveOperand(2); // Remove old immediate. + Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + } + + // Update the destination register class. + + const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); + + switch (Opcode) { + // For target instructions, getOpRegClass just returns the virtual + // register class associated with the operand, so we need to find an + // equivalent VGPR register class in order to move the instruction to the + // VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + continue; + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + continue; + break; + default: + break; + } + + unsigned DstReg = Inst->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + + // Legalize the operands + legalizeOperands(Inst); + + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); + } + } + } +} + +//===----------------------------------------------------------------------===// +// Indirect addressing callbacks +//===----------------------------------------------------------------------===// + +unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::splitScalar64BitUnaryOp( + SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? 
+ MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBinaryOp( + SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + MachineOperand &Src1 = Inst->getOperand(2); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? + MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1RC = Src1.isReg() ? + MRI.getRegClass(Src1.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub0, Src1SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0) + .addOperand(SrcReg1Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1) + .addOperand(SrcReg1Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. 
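+  // Shape of the expansion (register names hypothetical):
+  //   s[0:1] = S_AND_B64 s[2:3], s[4:5]
+  // arriving from moveToVALU() becomes
+  //   s6 = S_AND_B32 s2, s4        ; low half
+  //   s7 = S_AND_B32 s3, s5        ; high half
+  //   s[0:1] = REG_SEQUENCE s6, sub0, s7, sub1
+  // and both halves are queued below so moveToVALU() can turn them into
+  // V_AND_B32_e32 and legalize their operands.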
+  Worklist.push_back(LoHalf);
+  Worklist.push_back(HiHalf);
+}
+
+void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+                                       MachineInstr *Inst) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  MachineOperand &Src = Inst->getOperand(1);
+
+  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
+  const TargetRegisterClass *SrcRC = Src.isReg() ?
+    MRI.getRegClass(Src.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
+
+  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub0, SrcSubRC);
+  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub1, SrcSubRC);
+
+  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+    .addOperand(SrcRegSub0)
+    .addImm(0);
+
+  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+    .addOperand(SrcRegSub1)
+    .addReg(MidReg);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+
+  Worklist.push_back(First);
+  Worklist.push_back(Second);
+}
+
+void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                                      MachineInstr *Inst) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  uint32_t Imm = Inst->getOperand(2).getImm();
+  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+  (void) Offset;
+
+  // Only sext_inreg cases handled.
+  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
+         BitWidth <= 32 &&
+         Offset == 0 &&
+         "Not implemented");
+
+  if (BitWidth < 32) {
+    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
+      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
+      .addImm(0)
+      .addImm(BitWidth);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
+      .addImm(31)
+      .addReg(MidRegLo);
+
+    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+      .addReg(MidRegLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(MidRegHi)
+      .addImm(AMDGPU::sub1);
+
+    MRI.replaceRegWith(Dest.getReg(), ResultReg);
+    return;
+  }
+
+  MachineOperand &Src = Inst->getOperand(1);
+  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
+    .addImm(31)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(TmpReg)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+}
+
+void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
+                                        MachineInstr *Inst) const {
+  // Add the implicit and explicit register definitions.
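+  // For instance (typical descriptor contents, shown for illustration):
+  // VALU descriptors generally carry an implicit use of EXEC, and carry ops
+  // an implicit def of VCC. setDesc() only swaps the descriptor and does not
+  // materialize such operands, so they are appended explicitly below.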
+ if (NewDesc.ImplicitUses) { + for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { + unsigned Reg = NewDesc.ImplicitUses[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); + } + } + + if (NewDesc.ImplicitDefs) { + for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { + unsigned Reg = NewDesc.ImplicitDefs[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } +} + +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, + int OpIndices[3]) const { + const MCInstrDesc &Desc = get(MI->getOpcode()); + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = AMDGPU::NoRegister; + + // First we need to consider the instruction's operand requirements before + // legalizing. Some operands are required to be SGPRs, such as implicit uses + // of VCC, but we are still bound by the constant bus requirement to only use + // one. + // + // If the operand's class is an SGPR, we can never move it. + + for (const MachineOperand &MO : MI->implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + if (MO.getReg() == AMDGPU::VCC) + return AMDGPU::VCC; + + if (MO.getReg() == AMDGPU::FLAT_SCR) + return AMDGPU::FLAT_SCR; + } + + unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = OpIndices[i]; + if (Idx == -1) + break; + + const MachineOperand &MO = MI->getOperand(Idx); + if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) + SGPRReg = MO.getReg(); + + if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + UsedSGPRs[i] = MO.getReg(); + } + + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; + + // We don't have a required SGPR operand, so we have a bit more freedom in + // selecting operands to move. + + // Try to select the most used SGPR. If an SGPR is equal to one of the + // others, we choose that. + // + // e.g. 
+ // V_FMA_F32 v0, s0, s0, s0 -> No moves + // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + + if (UsedSGPRs[0] != AMDGPU::NoRegister) { + if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[0]; + } + + if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { + if (UsedSGPRs[1] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[1]; + } + + return SGPRReg; +} + +MachineInstrBuilder SIInstrInfo::buildIndirectWrite( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) + .addReg(IndirectBaseReg, RegState::Define) + .addOperand(I->getOperand(0)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0) + .addReg(ValueReg); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectRead( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0); + +} + +void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + int End = getIndirectIndexEnd(MF); + int Begin = getIndirectIndexBegin(MF); + + if (End == -1) + return; + + + for (int Index = Begin; Index <= End; ++Index) + Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); +} + +MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, + unsigned OperandName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); + if (Idx == -1) + return nullptr; + + return &MI.getOperand(Idx); +} + +uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; + if (ST.isAmdHsaOS()) + RsrcDataFormat |= (1ULL << 56); + + return RsrcDataFormat; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h new file mode 100644 index 0000000..6fafb94 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -0,0 +1,391 @@ +//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIInstrInfo. 
+// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "SIDefines.h" +#include "SIRegisterInfo.h" + +namespace llvm { + +class SIInstrInfo : public AMDGPUInstrInfo { +private: + const SIRegisterInfo RI; + + unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + + unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, + MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineOperand &Op) const; + + void swapOperands(MachineBasicBlock::iterator Inst) const; + + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + + void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + + bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const; + + unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + +public: + explicit SIInstrInfo(const AMDGPUSubtarget &st); + + const SIRegisterInfo &getRegisterInfo() const override { + return RI; + } + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const override; + + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const override; + + bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const final; + + bool shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const final; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + // \brief Returns an opcode that can be used to move a value to a \p DstRC + // register. If there is no hardware instruction that can store to \p + // DstRC, then AMDGPU::COPY is returned. 
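+  // (Illustrative assumption about the implementation: a VGPR_32 destination
+  // would yield V_MOV_B32_e32, and a 32-bit scalar destination S_MOV_B32.)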
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + unsigned commuteOpcode(const MachineInstr &MI) const; + + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI = false) const override; + bool findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + bool isTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA = nullptr) const; + + bool areMemAccessesTriviallyDisjoint( + MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + bool isMov(unsigned Opcode) const override; + + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const final; + + unsigned getMachineCSELookAheadLimit() const override { return 500; } + + bool isSALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SALU; + } + + bool isVALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VALU; + } + + bool isSOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP1; + } + + bool isSOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP2; + } + + bool isSOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPC; + } + + bool isSOPK(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPK; + } + + bool isSOPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPP; + } + + bool isVOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP1; + } + + bool isVOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP2; + } + + bool isVOP3(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3; + } + + bool isVOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOPC; + } + + bool isMUBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MUBUF; + } + + bool isMTBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MTBUF; + } + + bool isSMRD(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SMRD; + } + + bool isDS(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DS; + } + + bool isMIMG(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MIMG; + } + + bool isFLAT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FLAT; + } + + bool isWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::WQM; + } + + bool isVGPRSpill(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; + } + + bool isInlineConstant(const APInt &Imm) const; + bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; + bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; + + bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const; + + /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. + /// This function will return false if you pass it a 32-bit instruction. + bool hasVALU32BitEncoding(unsigned Opcode) const; + + /// \brief Returns true if this operand uses the constant bus. 
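+  /// (Illustrative note: SGPRs and literal constants are read over a single
+  /// shared constant bus, so a VALU instruction may use at most one such
+  /// source; this predicate feeds that check.)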
+  bool usesConstantBus(const MachineRegisterInfo &MRI,
+                       const MachineOperand &MO,
+                       unsigned OpSize) const;
+
+  /// \brief Return true if this instruction has any modifiers.
+  /// e.g. src[012]_mod, omod, clamp.
+  bool hasModifiers(unsigned Opcode) const;
+
+  bool hasModifiersSet(const MachineInstr &MI,
+                       unsigned OpName) const;
+
+  bool verifyInstruction(const MachineInstr *MI,
+                         StringRef &ErrInfo) const override;
+
+  static unsigned getVALUOp(const MachineInstr &MI);
+
+  bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+
+  /// \brief Return the correct register class for \p OpNo. For target-specific
+  /// instructions, this will return the register class that has been defined
+  /// in tablegen. For generic instructions, like REG_SEQUENCE, it will return
+  /// the register class of its machine operand, inferred from the classes of
+  /// the other operands.
+  const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
+                                           unsigned OpNo) const;
+
+  /// \brief Return the size in bytes of the operand OpNo on the given
+  /// instruction opcode.
+  unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
+    const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
+
+    if (OpInfo.RegClass == -1) {
+      // If this is an immediate operand, this must be a 32-bit literal.
+      assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE);
+      return 4;
+    }
+
+    return RI.getRegClass(OpInfo.RegClass)->getSize();
+  }
+
+  /// \brief This form should usually be preferred since it handles operands
+  /// with unknown register classes.
+  unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
+    return getOpRegClass(MI, OpNo)->getSize();
+  }
+
+  /// \returns true if it is legal for the operand at index \p OpNo
+  /// to read a VGPR.
+  bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
+
+  /// \brief Legalize the \p OpIdx operand of this instruction by inserting
+  /// a MOV. For example:
+  /// ADD_I32_e32 VGPR0, 15
+  /// to
+  /// MOV VGPR1, 15
+  /// ADD_I32_e32 VGPR0, VGPR1
+  ///
+  /// If the operand being legalized is a register, then a COPY will be used
+  /// instead of MOV.
+  void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+
+  /// \brief Check if \p MO would be a legal operand at index \p OpIdx
+  /// for \p MI.
+  bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+                      const MachineOperand *MO = nullptr) const;
+
+  /// \brief Legalize all operands in this instruction. This function may
+  /// create new instructions and insert them before \p MI.
+  void legalizeOperands(MachineInstr *MI) const;
+
+  /// \brief Split an SMRD instruction into two smaller loads of half the
+  /// size, storing the results in \p Lo and \p Hi.
+  void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
+                 unsigned HalfImmOp, unsigned HalfSGPROp,
+                 MachineInstr *&Lo, MachineInstr *&Hi) const;
+
+  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
+
+  /// \brief Replace this instruction's opcode with the equivalent VALU
+  /// opcode. This function will also move the users of \p MI to the
+  /// VALU if necessary.
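+  /// (For example, sketched: an S_AND_B64 whose result feeds VALU users
+  /// would be replaced by a pair of V_AND_B32s via splitScalar64BitBinaryOp.)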
+ void moveToVALU(MachineInstr &MI) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, + unsigned SavReg, unsigned IndexReg) const; + + void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + + /// \brief Returns the operand named \p Op. If \p MI does not have an + /// operand named \c Op, this function returns nullptr. + MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + + const MachineOperand *getNamedOperand(const MachineInstr &MI, + unsigned OpName) const { + return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); + } + + uint64_t getDefaultRsrcDataFormat() const; + +}; + +namespace AMDGPU { + + int getVOPe64(uint16_t Opcode); + int getVOPe32(uint16_t Opcode); + int getCommuteRev(uint16_t Opcode); + int getCommuteOrig(uint16_t Opcode); + int getAddr64Inst(uint16_t Opcode); + int getAtomicRetOp(uint16_t Opcode); + int getAtomicNoRetOp(uint16_t Opcode); + + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; + const uint64_t RSRC_TID_ENABLE = 1LL << 55; + +} // End namespace AMDGPU + +namespace SI { +namespace KernelInputOffsets { + +/// Offsets in bytes from the start of the input buffer +enum Offsets { + NGROUPS_X = 0, + NGROUPS_Y = 4, + NGROUPS_Z = 8, + GLOBAL_SIZE_X = 12, + GLOBAL_SIZE_Y = 16, + GLOBAL_SIZE_Z = 20, + LOCAL_SIZE_X = 24, + LOCAL_SIZE_Y = 28, + LOCAL_SIZE_Z = 32 +}; + +} // End namespace KernelInputOffsets +} // End namespace SI + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td new file mode 100644 index 0000000..93e4ca7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -0,0 +1,2647 @@ +//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+def isCI : Predicate<"Subtarget->getGeneration() "
+                      ">= AMDGPUSubtarget::SEA_ISLANDS">;
+def isVI : Predicate <
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+  AssemblerPredicate<"FeatureGCN3Encoding">;
+
+def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+
+class vop {
+  field bits<9> SI3;
+  field bits<10> VI3;
+}
+
+class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop {
+  field bits<8> SI = si;
+  field bits<8> VI = vi;
+
+  field bits<9> SI3 = {0, si{7-0}};
+  field bits<10> VI3 = {0, 0, vi{7-0}};
+}
+
+class vop1 <bits<8> si, bits<8> vi = si> : vop {
+  field bits<8> SI = si;
+  field bits<8> VI = vi;
+
+  field bits<9> SI3 = {1, 1, si{6-0}};
+  field bits<10> VI3 = !add(0x140, vi);
+}
+
+class vop2 <bits<6> si, bits<6> vi = si> : vop {
+  field bits<6> SI = si;
+  field bits<6> VI = vi;
+
+  field bits<9> SI3 = {1, 0, 0, si{5-0}};
+  field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
+}
+
+// Specify a VOP2 opcode for SI and VOP3 opcode for VI
+// that doesn't have VOP2 encoding on VI
+class vop23 <bits<6> si, bits<10> vi> : vop2 <si> {
+  let VI3 = vi;
+}
+
+class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
+  let SI3 = si;
+  let VI3 = vi;
+}
+
+class sop1 <bits<8> si, bits<8> vi = si> {
+  field bits<8> SI = si;
+  field bits<8> VI = vi;
+}
+
+class sop2 <bits<7> si, bits<7> vi = si> {
+  field bits<7> SI = si;
+  field bits<7> VI = vi;
+}
+
+class sopk <bits<5> si, bits<5> vi = si> {
+  field bits<5> SI = si;
+  field bits<5> VI = vi;
+}
+
+// Except for the NONE field, this must be kept in sync with the SISubtarget
+// enum in AMDGPUInstrInfo.cpp
+def SISubtarget {
+  int NONE = -1;
+  int SI = 0;
+  int VI = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// SI DAG Nodes
+//===----------------------------------------------------------------------===//
+
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
+  [SDNPMayLoad, SDNPMemOperand]
+>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
+  SDTypeProfile<0, 13,
+    [SDTCisVT<0, v4i32>, // rsrc(SGPR)
+     SDTCisVT<1, iAny>,  // vdata(VGPR)
+     SDTCisVT<2, i32>,   // num_channels(imm)
+     SDTCisVT<3, i32>,   // vaddr(VGPR)
+     SDTCisVT<4, i32>,   // soffset(SGPR)
+     SDTCisVT<5, i32>,   // inst_offset(imm)
+     SDTCisVT<6, i32>,   // dfmt(imm)
+     SDTCisVT<7, i32>,   // nfmt(imm)
+     SDTCisVT<8, i32>,   // offen(imm)
+     SDTCisVT<9, i32>,   // idxen(imm)
+     SDTCisVT<10, i32>,  // glc(imm)
+     SDTCisVT<11, i32>,  // slc(imm)
+     SDTCisVT<12, i32>   // tfe(imm)
+    ]>,
+  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
+                       SDTCisVT<3, i32>]>
+>;
+
+class SDSample<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+                       SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
+>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
+def SIconstdata_ptr : SDNode<
+  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
+>;
+
+//===----------------------------------------------------------------------===//
+// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
+// to be glued to the memory
instructions. +//===----------------------------------------------------------------------===// + +def SIld_local : SDNode <"ISD::LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ + return isLocalLoad(cast<LoadSDNode>(N)); +}]>; + +def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && + cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def si_load_local_align8 : Aligned8Bytes < + (ops node:$ptr), (si_load_local node:$ptr) +>; + +def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def si_az_extload_local : AZExtLoadBase <si_ld_local>; + +multiclass SIExtLoadLocal <PatFrag ld_node> { + + def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}] + >; + + def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}] + >; +} + +defm si_sextload_local : SIExtLoadLocal <si_sextload_local>; +defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>; + +def SIst_local : SDNode <"ISD::STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def si_st_local : PatFrag < + (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ + return isLocalStore(cast<StoreSDNode>(N)); +}]>; + +def si_store_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && + !cast<StoreSDNode>(N)->isTruncatingStore(); +}]>; + +def si_store_local_align8 : Aligned8Bytes < + (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) +>; + +def si_truncstore_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->isTruncatingStore(); +}]>; + +def si_truncstore_local_i8 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def si_truncstore_local_i16 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +multiclass SIAtomicM0Glue2 <string op_name> { + + def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] + >; + + def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; +} + +defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; +defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; +defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; +defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; +defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; +defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; +defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; +defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; + +def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>; + +// Transformation function, extract the lower 32bit of a 64bit 
immediate +def LO32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), + MVT::i32); +}]>; + +def LO32f : SDNodeXForm<fpimm, [{ + APInt V = N->getValueAPF().bitcastToAPInt().trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); +}]>; + +// Transformation function, extract the upper 32bit of a 64bit immediate +def HI32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); +}]>; + +def HI32f : SDNodeXForm<fpimm, [{ + APInt V = N->getValueAPF().bitcastToAPInt().lshr(32).trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), + MVT::f32); +}]>; + +def IMM8bitDWORD : PatLeaf <(imm), + [{return (N->getZExtValue() & ~0x3FC) == 0;}] +>; + +def as_dword_i32imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); +}]>; + +def as_i1imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); +}]>; + +def as_i8imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); +}]>; + +def as_i16imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); +}]>; + +def as_i32imm: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + +def as_i64imm: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); +}]>; + +def IMM8bit : PatLeaf <(imm), + [{return isUInt<8>(N->getZExtValue());}] +>; + +def IMM12bit : PatLeaf <(imm), + [{return isUInt<12>(N->getZExtValue());}] +>; + +def IMM16bit : PatLeaf <(imm), + [{return isUInt<16>(N->getZExtValue());}] +>; + +def IMM20bit : PatLeaf <(imm), + [{return isUInt<20>(N->getZExtValue());}] +>; + +def IMM32bit : PatLeaf <(imm), + [{return isUInt<32>(N->getZExtValue());}] +>; + +def mubuf_vaddr_offset : PatFrag< + (ops node:$ptr, node:$offset, node:$imm_offset), + (add (add node:$ptr, node:$offset), node:$imm_offset) +>; + +class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ + return isInlineImmediate(N); +}]>; + +class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ + return isInlineImmediate(N); +}]>; + +class SGPRImm <dag frag> : PatLeaf<frag, [{ + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return false; + } + const SIRegisterInfo *SIRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + U != E; ++U) { + if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + return true; + } + } + return false; +}]>; + +//===----------------------------------------------------------------------===// +// Custom Operands +//===----------------------------------------------------------------------===// + +def FRAMEri32 : Operand<iPTR> { + let MIOperandInfo = (ops i32:$ptr, i32imm:$index); +} + +def SoppBrTarget : AsmOperandClass { + let Name = "SoppBrTarget"; + let ParserMethod = "parseSOppBrTarget"; +} + +def sopp_brtarget : 
Operand<OtherVT> { + let EncoderMethod = "getSOPPBrEncoding"; + let OperandType = "OPERAND_PCREL"; + let ParserMatchClass = SoppBrTarget; +} + +include "SIInstrFormats.td" +include "VIInstrFormats.td" + +def MubufOffsetMatchClass : AsmOperandClass { + let Name = "MubufOffset"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +class DSOffsetBaseMatchClass <string parser> : AsmOperandClass { + let Name = "DSOffset"#parser; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset"; +} + +def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; +def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; + +def DSOffset01MatchClass : AsmOperandClass { + let Name = "DSOffset1"; + let ParserMethod = "parseDSOff01OptionalOps"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset01"; +} + +class GDSBaseMatchClass <string parser> : AsmOperandClass { + let Name = "GDS"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; +def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; + +class GLCBaseMatchClass <string parser> : AsmOperandClass { + let Name = "GLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; +def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; + +class SLCBaseMatchClass <string parser> : AsmOperandClass { + let Name = "SLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; +def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; +def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; + +class TFEBaseMatchClass <string parser> : AsmOperandClass { + let Name = "TFE"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; +def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; +def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; + +def OModMatchClass : AsmOperandClass { + let Name = "OMod"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def ClampMatchClass : AsmOperandClass { + let Name = "Clamp"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +let OperandType = "OPERAND_IMMEDIATE" in { + +def offen : Operand<i1> { + let PrintMethod = "printOffen"; +} +def idxen : Operand<i1> { + let PrintMethod = "printIdxen"; +} +def addr64 : Operand<i1> { + let PrintMethod = "printAddr64"; +} +def mbuf_offset : Operand<i16> { + let PrintMethod = "printMBUFOffset"; + let ParserMatchClass = MubufOffsetMatchClass; +} +class ds_offset_base <AsmOperandClass mc> : Operand<i16> { + let PrintMethod = "printDSOffset"; + let ParserMatchClass = mc; +} +def ds_offset : ds_offset_base <DSOffsetMatchClass>; +def ds_offset_gds : ds_offset_base <DSOffsetGDSMatchClass>; + +def ds_offset0 : Operand<i8> { + let PrintMethod = "printDSOffset0"; + let ParserMatchClass = DSOffset01MatchClass; +} +def ds_offset1 : 
Operand<i8> { + let PrintMethod = "printDSOffset1"; + let ParserMatchClass = DSOffset01MatchClass; +} +class gds_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printGDS"; + let ParserMatchClass = mc; +} +def gds : gds_base <GDSMatchClass>; + +def gds01 : gds_base <GDS01MatchClass>; + +class glc_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printGLC"; + let ParserMatchClass = mc; +} + +def glc : glc_base <GLCMubufMatchClass>; +def glc_flat : glc_base <GLCFlatMatchClass>; + +class slc_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printSLC"; + let ParserMatchClass = mc; +} + +def slc : slc_base <SLCMubufMatchClass>; +def slc_flat : slc_base <SLCFlatMatchClass>; +def slc_flat_atomic : slc_base <SLCFlatAtomicMatchClass>; + +class tfe_base <AsmOperandClass mc> : Operand <i1> { + let PrintMethod = "printTFE"; + let ParserMatchClass = mc; +} + +def tfe : tfe_base <TFEMubufMatchClass>; +def tfe_flat : tfe_base <TFEFlatMatchClass>; +def tfe_flat_atomic : tfe_base <TFEFlatAtomicMatchClass>; + +def omod : Operand <i32> { + let PrintMethod = "printOModSI"; + let ParserMatchClass = OModMatchClass; +} + +def ClampMod : Operand <i1> { + let PrintMethod = "printClampSI"; + let ParserMatchClass = ClampMatchClass; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +def VOPDstS64 : VOPDstOperand <SReg_64>; + +//===----------------------------------------------------------------------===// +// Complex patterns +//===----------------------------------------------------------------------===// + +def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; + +def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; +def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; +def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; +def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; +def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; + +def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; +def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; +def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; +def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; + +//===----------------------------------------------------------------------===// +// SI assembler operands +//===----------------------------------------------------------------------===// + +def SIOperand { + int ZERO = 0x80; + int VCC = 0x6A; + int FLAT_SCR = 0x68; +} + +def SRCMODS { + int NONE = 0; + int NEG = 1; +} + +def DSTCLAMP { + int NONE = 0; +} + +def DSTOMOD { + int NONE = 0; +} + +//===----------------------------------------------------------------------===// +// +// SI Instruction multiclass helpers. +// +// Instructions with _32 take 32-bit operands. +// Instructions with _64 take 64-bit operands. +// +// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit +// encoding is the standard encoding, but instruction that make use of +// any of the instruction modifiers must use the 64-bit encoding. +// +// Instructions with _e32 use the 32-bit encoding. +// Instructions with _e64 use the 64-bit encoding. 
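+// For example (illustrative): v_add_f32_e32 is the compact VOP2 form, while
+// v_add_f32_e64 is the VOP3 form of the same add, which can carry source
+// modifiers (abs/neg), clamp, and omod.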
+// +//===----------------------------------------------------------------------===// + +class SIMCInstr <string pseudo, int subtarget> { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +//===----------------------------------------------------------------------===// +// EXP classes +//===----------------------------------------------------------------------===// + +class EXPCommon : InstSI< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), + "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + let EXP_CNT = 1; + let Uses = [EXEC]; +} + +multiclass EXP_m { + + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; + } + + def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + + def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; +} + +//===----------------------------------------------------------------------===// +// Scalar classes +//===----------------------------------------------------------------------===// + +class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOP1 <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, + SOP1e <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; +} + +class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, + SOP1e <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; +} + +multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : SOP1_Pseudo <opName, outs, ins, pattern>; + + def _si : SOP1_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern +>; + +multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + +// no input, 64-bit output. +multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>; + + def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst"> { + let ssrc0 = 0; + } + + def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst"> { + let ssrc0 = 0; + } +} + +// 64-bit input, no output +multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>; + + def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } + + def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } +} + +// 64-bit input, 32-bit output. 
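+// (e.g., illustratively, a 64-bit population count returning a 32-bit
+// result, like s_bcnt1_i32_b64.)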
+multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + +class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : + SOP2<outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 4; + + // Pseudo instructions have no encodings, but adding this field here allows + // us to do: + // let sdst = xxx in { + // for multiclasses that include both real and pseudo instructions. + field bits<7> sdst = 0; +} + +class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, + SOP2e<op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, + SOP2e<op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { + def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; + + def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), + opName#" $dst, $src0, $src1 [$scc]">; + + def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), + opName#" $dst, $src0, $src1 [$scc]">; +} + +multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : SOP2_Pseudo <opName, outs, ins, pattern>; + + def _si : SOP2_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, + string opName, PatLeaf cond> : SOPC < + op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []>; + +class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_32, i32, opName, cond>; + +class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_64, i64, opName, cond>; + +class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOPK <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, + SOPKe <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; +} + +class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, + SOPKe <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; +} + +multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm, + string asm = opName#opAsm> { + def "" 
: SOPK_Pseudo <opName, outs, ins, []>; + + def _si : SOPK_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOPK_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0), + pattern>; + + def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), + opName#" $dst, $src0">; + + def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), + opName#" $dst, $src0">; +} + +multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), + (ins SReg_32:$src0, u16imm:$src1), pattern>; + + let DisableEncoding = "$dst" in { + def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + + def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + } +} + +multiclass SOPK_32TIE <sopk op, string opName, list<dag> pattern> : SOPK_m < + op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), + " $sdst, $simm16" +>; + +multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins, + string argAsm, string asm = opName#argAsm> { + + def "" : SOPK_Pseudo <opName, outs, ins, []>; + + def _si : SOPK <outs, ins, asm, []>, + SOPK64e <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; + } + + def _vi : SOPK <outs, ins, asm, []>, + SOPK64e <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; + } +} +//===----------------------------------------------------------------------===// +// SMRD classes +//===----------------------------------------------------------------------===// + +class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SMRD <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMRDe <op, imm>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMEMe_vi <op, imm>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, + string asm, list<dag> pattern> { + + def "" : SMRD_Pseudo <opName, outs, ins, pattern>; + + def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + + // glc is only applicable to scalar stores, which are not yet + // implemented. 
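+  // (As an aside, and an assumption of this note rather than something this
+  // file defines: scalar stores such as s_store_dword only appear on
+  // VI-class hardware.)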
+ let glc = 0 in { + def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + } +} + +multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, + RegisterClass dstClass> { + defm _IMM : SMRD_m < + op, opName#"_IMM", 1, (outs dstClass:$dst), + (ins baseClass:$sbase, u32imm:$offset), + opName#" $dst, $sbase, $offset", [] + >; + + defm _SGPR : SMRD_m < + op, opName#"_SGPR", 0, (outs dstClass:$dst), + (ins baseClass:$sbase, SReg_32:$soff), + opName#" $dst, $sbase, $soff", [] + >; +} + +//===----------------------------------------------------------------------===// +// Vector ALU classes +//===----------------------------------------------------------------------===// + +// This must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printOperandAndMods"; +} + +def InputModsMatchClass : AsmOperandClass { + let Name = "RegWithInputMods"; +} + +def InputModsNoDefault : Operand <i32> { + let PrintMethod = "printOperandAndMods"; + let ParserMatchClass = InputModsMatchClass; +} + +class getNumSrcArgs<ValueType Src1, ValueType Src2> { + int ret = + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 + 3)); // VOP3 +} + +// Returns the register class to use for the destination of VOP[123C] +// instructions for the given VT. +class getVALUDstForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + VOPDstOperand<SReg_64>)); // else VT == i1 +} + +// Returns the register class to use for source 0 of VOP[12C] +// instructions for the given VT. +class getVOPSrc0ForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); +} + +// Returns the register class to use for source 1 of VOP[12C] for the +// given VT. +class getVOPSrc1ForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); +} + +// Returns the register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3SrcForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); +} + +// Returns 1 if the source arguments have modifiers, 0 if they do not. +class hasModifiers<ValueType SrcVT> { + bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, 0)); +} + +// Returns the input arguments for VOP[12C] instructions for the given SrcVT. +class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { + dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 + !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 + (ins))); +} + +// Returns the input arguments for VOP3 instructions for the given SrcVT. 
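+// Illustratively, for a two-source VOP3 with modifiers this yields the dag
+//   (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
+//        InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
+//        ClampMod:$clamp, omod:$omod),
+// as spelled out case by case below.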
+class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, + bit HasModifiers> { + + dag ret = + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP1 without modifiers + (ins Src0RC:$src0) + /* endif */ ), + !if (!eq(NumSrcArgs, 2), + !if (!eq(HasModifiers, 1), + // VOP 2 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP2 without modifiers + (ins Src0RC:$src0, Src1RC:$src1) + /* endif */ ) + /* NumSrcArgs == 3 */, + !if (!eq(HasModifiers, 1), + // VOP3 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + InputModsNoDefault:$src2_modifiers, Src2RC:$src2, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP3 without modifiers + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) + /* endif */ ))); +} + +// Returns the assembly string for the inputs and outputs of a VOP[12C] +// instruction. This does not add the _e32 suffix, so it can be reused +// by getAsm64. +class getAsm32 <int NumSrcArgs> { + string src1 = ", $src1"; + string src2 = ", $src2"; + string ret = "$dst, $src0"# + !if(!eq(NumSrcArgs, 1), "", src1)# + !if(!eq(NumSrcArgs, 3), src2, ""); +} + +// Returns the assembly string for the inputs and outputs of a VOP3 +// instruction. +class getAsm64 <int NumSrcArgs, bit HasModifiers> { + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string ret = + !if(!eq(HasModifiers, 0), + getAsm32<NumSrcArgs>.ret, + "$dst, "#src0#src1#src2#"$clamp"#"$omod"); +} + + +class VOPProfile <list<ValueType> _ArgVT> { + + field list<ValueType> ArgVT = _ArgVT; + + field ValueType DstVT = ArgVT[0]; + field ValueType Src0VT = ArgVT[1]; + field ValueType Src1VT = ArgVT[2]; + field ValueType Src2VT = ArgVT[3]; + field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; + field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; + field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; + field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; + field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; + + field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasModifiers = hasModifiers<Src0VT>.ret; + + field dag Outs = (outs DstRC:$dst); + + field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; + field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, + HasModifiers>.ret; + + field string Asm32 = getAsm32<NumSrcArgs>.ret; + field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; +} + +// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order +// for the instruction patterns to work. 
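+// (Reading the FIXME above: the F16 profiles below deliberately reuse
+// f32/i32 value types as placeholders; that reading is an interpretation,
+// not something stated elsewhere in this patch.)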
+def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; + +def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; + +def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; +def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; +def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; +def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; +def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; +def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; + +def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; +def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; +def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; +def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + let Src0RC32 = VCSrc_32; +} + +def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$dst, $src0_modifiers, $src1"; +} + +def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$dst, $src0_modifiers, $src1"; +} + +def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; +def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); + let Asm64 = "$dst, $src0, $src1, $src2"; +} + +def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); + field string Asm = "$dst, $src0, $vsrc1, $src2"; +} +def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; +def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; + + +class VOP <string opName> { + string OpName = opName; +} + +class VOP2_REV <string revOp, bit isOrig> { + string RevOp = revOp; + bit IsOrig = isOrig; +} + +class AtomicNoRet <string noRetOp, bit isRet> { + string NoRetOp = noRetOp; + bit IsRet = isRet; +} + +class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP1Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr <opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { + let isPseudo = 1; + let isCodeGenOnly = 1; + + field bits<8> vdst; + field bits<9> src0; +} + +class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> : + VOP1<op.SI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class VOP1_Real_vi <string opName, vop1 op, 
dag outs, dag ins, string asm> : + VOP1<op.VI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName> { + def "" : VOP1_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + + def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; +} + +multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName> { + def "" : VOP1_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP1_Real_si <opName, op, outs, ins, asm>; +} + +class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP2Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.SI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : + VOP2 <op.VI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _si : VOP2_Real_si <opName, op, outs, ins, asm>; +} + +multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + + def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + +} + +class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { + + bits<2> src0_modifiers = !if(HasModifiers, ?, 0); + bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); + bits<2> omod = !if(HasModifiers, ?, 0); + bits<1> clamp = !if(HasModifiers, ?, 0); + bits<9> src1 = !if(HasSrc1, ?, 0); + bits<9> src2 = !if(HasSrc2, ?, 0); +} + +class VOP3DisableModFields <bit HasSrc0Mods, + bit HasSrc1Mods = 0, + bit HasSrc2Mods = 0, + bit HasOutputMods = 0> { + bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); + bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); + bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); + bits<2> omod = !if(HasOutputMods, ?, 0); + bits<1> clamp = !if(HasOutputMods, ?, 0); +} + +class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP3Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e64", SISubtarget.NONE>, + MnemonicAlias<opName#"_e64", opName> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3e <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3e_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + 
VOP3be <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; +} + +class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3be_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI> { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; +} + +// VOP3_m without source modifiers +multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + let src0_modifiers = 0, + src1_modifiers = 0, + src2_modifiers = 0, + clamp = 0, + omod = 0 in { + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; + } +} + +multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; +} + +multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; + // No VI instruction. This class is for SI only. +} + +multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; +} + +multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + // No VI instruction. This class is for SI only. +} + +// XXX - Is v_div_scale_{f32|f64} only available in vop3b without +// option of implicit vcc use? +multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + // The VOP2 variant puts the carry out into VCC, the VOP3 variant + // can write it into any SGPR. We currently don't use the carry out, + // so for now hardcode it to VCC as well. 
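+  //
+  // As an illustration (the assembly below is illustrative, not taken from
+  // this patch): the VOP2 form implicitly writes the carry to VCC, e.g.
+  //   v_add_i32_e32 v0, vcc, v1, v2
+  // while the VOP3b form could name an arbitrary SGPR pair instead, e.g.
+  //   v_add_i32_e64 v0, s[2:3], v1, v2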
+ let sdst = SIOperand.VCC, Defs = [VCC] in { + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + } // End sdst = SIOperand.VCC, Defs = [VCC] +} + +multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 1, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 1, HasMods>; +} + +multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, + bit HasMods, bit defExec, string revOp> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + } + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + } +} + +// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. +multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, + string asm, list<dag> pattern = []> { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : VOPAnyCommon <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE>; + } + + def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>, + SIMCInstr <opName, SISubtarget.SI> { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP3Common <outs, ins, asm, []>, + VOP3e_vi <op.VI3>, + VOP3DisableFields <1, 0, 0>, + SIMCInstr <opName, SISubtarget.VI> { + let AssemblerPredicates = [isVI]; + } +} + +multiclass VOP1_Helper <vop1 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + bit HasMods> { + + defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; + + defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; +} + +multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP1_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + P.HasModifiers +>; + +multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> { + + defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; + + defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + opName, P.HasModifiers>; +} + +multiclass VOP2_Helper <vop2 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOp, bit HasMods> { + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; + + defm _e64 : VOP3_2_m <op, + outs, ins64, opName#asm64, pat64, opName, revOp, HasMods + >; +} + +multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2_Helper < + op, opName, P.Outs, + P.Ins32, 
P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> { + defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + opName, revOp, P.HasModifiers>; +} + +multiclass VOP2b_Helper <vop2 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOp, bit HasMods> { + + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; + + defm _e64 : VOP3b_2_m <op, + outs, ins64, opName#asm64, pat64, opName, revOp, HasMods + >; +} + +multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2b_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +// A VOP2 instruction that is VOP3-only on VI. 
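+// (For example, the non-reversed shifts such as v_lshl_b32 fall in this
+// category: they keep a VOP2 encoding on SI but only the VOP3 encoding
+// remains on VI.)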
+multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOp, bit HasMods> { + defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; + + defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName, + revOp, HasMods>; +} + +multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> + : VOP2_VI3_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { + + def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>; + +let isCodeGenOnly = 0 in { + def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>, + VOP2_MADKe <op.SI> { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>, + VOP2_MADKe <op.VI> { + let AssemblerPredicates = [isVI]; + } +} // End isCodeGenOnly = 0 +} + +class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOPCCommon <ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE>, + MnemonicAlias<opName#"_e32", opName> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, bit DefExec, string revOpName = ""> { + def "" : VOPC_Pseudo <outs, ins, pattern, opName>; + + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; + } + + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; + } +} + +multiclass VOPC_Helper <vopc op, string opName, + dag ins32, string asm32, list<dag> pat32, + dag out64, dag ins64, string asm64, list<dag> pat64, + bit HasMods, bit DefExec, string revOp> { + defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; + + defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, + opName, HasMods, DefExec, revOp>; +} + +// Special case for class instructions which only have modifiers on +// the 1st source operand. 
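+// (e.g. for v_cmp_class_f32, src0 is the floating-point value being
+// classified and accepts abs/neg modifiers, while src1 is an integer class
+// mask and therefore has no floating-point modifiers.)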
+multiclass VOPC_Class_Helper <vopc op, string opName, + dag ins32, string asm32, list<dag> pat32, + dag out64, dag ins64, string asm64, list<dag> pat64, + bit HasMods, bit DefExec, string revOp> { + defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; + + defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, + opName, HasMods, DefExec, revOp>, + VOP3DisableModFields<1, 0, 0>; +} + +multiclass VOPCInst <vopc op, string opName, + VOPProfile P, PatLeaf cond = COND_NULL, + string revOp = opName, + bit DefExec = 0> : VOPC_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), + P.HasModifiers, DefExec, revOp +>; + +multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, + bit DefExec = 0> : VOPC_Class_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], + [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + P.HasModifiers, DefExec, opName +>; + + +multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>; + +multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>; + +multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>; + +multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>; + + +multiclass VOPCX <vopc op, string opName, VOPProfile P, + PatLeaf cond = COND_NULL, + string revOp = ""> + : VOPCInst <op, opName, P, cond, revOp, 1>; + +multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>; + +multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>; + +multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>; + +multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : + VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>; + +multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, + list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < + op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods +>; + +multiclass VOPC_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + +multiclass VOPCX_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + +multiclass VOPC_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + +multiclass VOPCX_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + +multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, + !if(!eq(P.NumSrcArgs, 3), + !if(P.HasModifiers, + 
[(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + P.Src2VT:$src2))]), + !if(!eq(P.NumSrcArgs, 2), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + /* P.NumSrcArgs == 1 */, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers +>; + +// Special case for v_div_fmas_{f32|f64}, since it seems to be the +// only VOP instruction that implicitly reads VCC. +multiclass VOP3_VCC_Inst <vop3 op, string opName, + VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, + (outs P.DstRC.RegClass:$dst), + (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, + InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, + InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, + ClampMod:$clamp, + omod:$omod), + " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), + (i1 VCC)))], + 3, 1 +>; + +multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, + string opName, list<dag> pattern> : + VOP3b_3_m < + op, (outs vrc:$vdst, SReg_64:$sdst), + (ins InputModsNoDefault:$src0_modifiers, arc:$src0, + InputModsNoDefault:$src1_modifiers, arc:$src1, + InputModsNoDefault:$src2_modifiers, arc:$src2, + ClampMod:$clamp, omod:$omod), + opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, + opName, opName, 1, 1 +>; + +multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : + VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; + +multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : + VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; + + +class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), + (Inst i32:$src0_modifiers, P.Src0VT:$src0, + i32:$src1_modifiers, P.Src1VT:$src1, + i32:$src2_modifiers, P.Src2VT:$src2, + i1:$clamp, + i32:$omod)>; + +//===----------------------------------------------------------------------===// +// Interpolation opcodes +//===----------------------------------------------------------------------===// + +class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + VINTRPCommon <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon <outs, ins, asm, []>, + VINTRPe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, + string asm> : + 
VINTRPCommon <outs, ins, asm, []>,
+  VINTRPe_vi <op>,
+  SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
+                     list<dag> pattern = []> {
+  def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>;
+
+  def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>;
+
+  def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector I/O classes
+//===----------------------------------------------------------------------===//
+
+class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+  DS <outs, ins, "", pattern>,
+  SIMCInstr <opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS <outs, ins, asm, []>,
+  DSe <op>,
+  SIMCInstr <opName, SISubtarget.SI> {
+  let isCodeGenOnly = 0;
+}
+
+class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS <outs, ins, asm, []>,
+  DSe_vi <op>,
+  SIMCInstr <opName, SISubtarget.VI>;
+
+class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS_Real_si <op,opName, outs, ins, asm> {
+
+  // Single loads interpret the 2 i8imm operands as a single i16 offset,
+  // e.g. offset = 0x1234 is encoded as offset0 = 0x34, offset1 = 0x12.
+  bits<16> offset;
+  let offset0 = offset{7-0};
+  let offset1 = offset{15-8};
+  let isCodeGenOnly = 0;
+}
+
+class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
+  DS_Real_vi <op, opName, outs, ins, asm> {
+
+  // Single loads interpret the 2 i8imm operands as a single i16 offset.
+  bits<16> offset;
+  let offset0 = offset{7-0};
+  let offset1 = offset{15-8};
+}
+
+multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs rc:$vdst),
+  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
+  string asm = opName#" $vdst, $addr"#"$offset$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let data0 = 0, data1 = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs rc:$vdst),
+  dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
+                 gds01:$gds),
+  string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in {
+    def _si : DS_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
+  string asm = opName#" $addr, $data0"#"$offset$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<opName, 0>;
+
+  let data1 = 0, vdst = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+                 ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds),
+  string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in {
+    def _si : DS_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
+                        string noRetOp = "",
+  dag outs = (outs rc:$vdst),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
+  string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<noRetOp, 1>;
+
+  let data1 = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc,
+                          string noRetOp = "", dag ins,
+  dag outs = (outs rc:$vdst),
+  string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<noRetOp, 1>;
+
+  def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+  def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+}
+
+multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
+                        string noRetOp = "", RegisterClass src = rc> :
+  DS_1A2D_RET_m <op, asm, rc, noRetOp,
+                 (ins VGPR_32:$addr, src:$data0, src:$data1,
+                      ds_offset:$offset, gds:$gds)
+>;
+
+multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
+                          string noRetOp = opName,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+                 ds_offset:$offset, gds:$gds),
+  string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>,
+           AtomicNoRet<noRetOp, 0>;
+
+  let vdst = 0 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+  }
+}
+
+multiclass DS_0A_RET <bits<8> op, string opName,
+  dag outs = (outs VGPR_32:$vdst),
+  dag ins = (ins ds_offset:$offset, gds:$gds),
+  string asm = opName#" $vdst"#"$offset"#"$gds"> {
+
+  let mayLoad = 1, mayStore = 1 in {
+    def "" : DS_Pseudo <opName, outs, ins, []>;
+
+    let addr = 0, data0 = 0, data1 = 0 in {
+      def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+    } // end addr = 0, data0 = 0, data1 = 0
+  } // end mayLoad = 1, mayStore = 1
+}
+
+multiclass DS_1A_RET_GDS <bits<8> op, string opName,
+  dag outs = (outs VGPR_32:$vdst),
+  dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset),
+  string asm = opName#" $vdst, $addr"#"$offset gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let data0 = 0, data1 = 0, gds = 1 in {
+    def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+  } // end data0 = 0, data1 = 0, gds = 1
+}
+
+multiclass DS_1A_GDS <bits<8> op, string opName,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr),
+  string asm = opName#" $addr gds"> {
+
+  def "" : DS_Pseudo <opName, outs, ins, []>;
+
+  let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in {
+    def _si : DS_Real_si <op, opName, outs, ins, asm>;
+    def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+  } // end vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1
+}
+
+multiclass DS_1A <bits<8> op, string opName,
+  dag outs = (outs),
+  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
+  string asm = opName#" $addr"#"$offset"#"$gds"> {
+
+  let mayLoad = 1, mayStore = 1 in {
+    def "" : DS_Pseudo <opName, outs, ins, []>;
+
+    let vdst = 0, data0 = 0, data1 = 0 in {
+      def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+      def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+    } // let vdst = 0, data0 = 0, data1 = 0
+  } // end mayLoad = 1, mayStore = 1
+}
+
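+// As an example of how the DS factory multiclasses above expand, a single
+//   defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>;
+// produces the encoding-independent pseudo DS_WRITE_B32 plus the encoded
+// DS_WRITE_B32_si and DS_WRITE_B32_vi variants, which SIMCInstr ties
+// together so the MC layer can pick the correct encoding per subtarget.
+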
+//===----------------------------------------------------------------------===// +// MTBUF classes +//===----------------------------------------------------------------------===// + +class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MTBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, + string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe_vi <op>, + SIMCInstr <opName, SISubtarget.VI>; + +multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MTBUF_Pseudo <opName, outs, ins, pattern>; + + def _si : MTBUF_Real_si <op, opName, outs, ins, asm>; + + def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; + +} + +let mayStore = 1, mayLoad = 0 in { + +multiclass MTBUF_Store_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs), + (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayStore = 1, mayLoad = 0 + +let mayLoad = 1, mayStore = 0 in { + +multiclass MTBUF_Load_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs regClass:$dst), + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayLoad = 1, mayStore = 0 + +//===----------------------------------------------------------------------===// +// MUBUF classes +//===----------------------------------------------------------------------===// + +class mubuf <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +let isCodeGenOnly = 0 in { + +class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { + let lds = 0; +} + +} // End let isCodeGenOnly = 0 + +class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> { + let lds = 0; +} + +class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + +class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MUBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + + // dummy fields, so that we can use let statements around multiclasses + bits<1> offen; + bits<1> idxen; + bits<8> vaddr; + bits<1> glc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let lds = 0; +} + +class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe_vi <op.VI>, + 
SIMCInstr<opName, SISubtarget.VI> { + let lds = 0; +} + +multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0>; + + let addr64 = 0, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; +} + +multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs, + dag ins, string asm, list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1>; + + let addr64 = 1, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. +} + +multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_OFFSET", is_return>; + + let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_ADDR64", is_return>; + + let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. 
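+  // (The addr64 addressing mode was removed from the MUBUF encoding on VI;
+  // 64-bit addresses are expected to go through FLAT instructions instead.)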
+}
+
+multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
+                         ValueType vt, SDPatternOperator atomic> {
+
+  let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
+
+    // No-return variants
+    let glc = 0 in {
+
+      defm _ADDR64 : MUBUFAtomicAddr64_m <
+        op, name#"_addr64", (outs),
+        (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+             SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
+        name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
+      >;
+
+      defm _OFFSET : MUBUFAtomicOffset_m <
+        op, name#"_offset", (outs),
+        (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset,
+             slc:$slc),
+        name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0
+      >;
+    } // glc = 0
+
+    // Variants that return values
+    let glc = 1, Constraints = "$vdata = $vdata_in",
+        DisableEncoding = "$vdata_in" in {
+
+      defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
+        op, name#"_rtn_addr64", (outs rc:$vdata),
+        (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
+             SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
+        name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
+        [(set vt:$vdata,
+         (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                    i16:$offset, i1:$slc), vt:$vdata_in))], 1
+      >;
+
+      defm _RTN_OFFSET : MUBUFAtomicOffset_m <
+        op, name#"_rtn_offset", (outs rc:$vdata),
+        (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
+             mbuf_offset:$offset, slc:$slc),
+        name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc",
+        [(set vt:$vdata,
+         (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
+                                    i1:$slc), vt:$vdata_in))], 1
+      >;
+
+    } // glc = 1
+
+  } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
+}
+
+multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
+                              ValueType load_vt = i32,
+                              SDPatternOperator ld = null_frag> {
+
+  let mayLoad = 1, mayStore = 0 in {
+    let offen = 0, idxen = 0, vaddr = 0 in {
+      defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata),
+                              (ins SReg_128:$srsrc, SCSrc_32:$soffset,
+                              mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+                              name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+                              [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
+                                                        i32:$soffset, i16:$offset,
+                                                        i1:$glc, i1:$slc, i1:$tfe)))]>;
+    }
+
+    let offen = 1, idxen = 0 in {
+      defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata),
+                             (ins VGPR_32:$vaddr, SReg_128:$srsrc,
+                             SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+                             tfe:$tfe),
+                             name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+    }
+
+    let offen = 0, idxen = 1 in {
+      defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata),
+                             (ins VGPR_32:$vaddr, SReg_128:$srsrc,
+                             SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc,
+                             slc:$slc, tfe:$tfe),
+                             name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+    }
+
+    let offen = 1, idxen = 1 in {
+      defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata),
+                              (ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset,
+                              mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+                              name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+    }
+
+    let offen = 0, idxen = 0 in {
+      defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata),
+                                    (ins VReg_64:$vaddr, SReg_128:$srsrc,
+                                    SCSrc_32:$soffset, mbuf_offset:$offset,
+                                    glc:$glc, slc:$slc, tfe:$tfe),
+                                    name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#
+                                         "$glc"#"$slc"#"$tfe",
+                                    [(set load_vt:$vdata, (ld (MUBUFAddr64 
v4i32:$srsrc, + i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, + i1:$tfe)))]>; + } + } +} + +multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, + ValueType store_vt = i32, SDPatternOperator st = null_frag> { + let mayLoad = 0, mayStore = 1 in { + defm : MUBUF_m <op, name, (outs), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; + + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m <op, name#"_offset",(outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>; + } // offen = 0, idxen = 0, vaddr = 0 + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m <op, name#"_offen", (outs), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; + } // end offen = 1, idxen = 0 + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs), + (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs), + (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 0 in { + defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs), + (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"# + "$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe))]>; + } + } // End mayLoad = 0, mayStore = 1 +} + +class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : + FLAT <op, (outs regClass:$vdst), + (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> { + let data = 0; + let mayLoad = 1; +} + +class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : + FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr, + glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + name#" $data, $addr"#"$glc"#"$slc"#"$tfe", + []> { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let vdst = 0; +} + +multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc> { + + let mayLoad = 1, mayStore = 1 in { + def "" : FLAT <op, (outs), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + name#" $addr, $data"#"$slc"#"$tfe", []>, + AtomicNoRet <NAME, 0> { + let glc = 0; + let vdst = 0; + } + + def _RTN : FLAT <op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + 
name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>, + AtomicNoRet <NAME, 1> { + let glc = 1; + } + } +} + +class MIMG_Mask <string op, int channels> { + string Op = op; + int Channels = channels; +} + +class MIMG_NoSampler_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc", + []> { + let ssamp = 0; + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; +} + +multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels> { + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>, + MIMG_Mask<asm#"_V4", channels>; +} + +multiclass MIMG_NoSampler <bits<7> op, string asm> { + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; + defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; + defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; + defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; +} + +class MIMG_Sampler_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; + let WQM = wqm; +} + +multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels, int wqm> { + def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, + MIMG_Mask<asm#"_V4", channels>; + def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, + MIMG_Mask<asm#"_V8", channels>; + def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, + MIMG_Mask<asm#"_V16", channels>; +} + +multiclass MIMG_Sampler <bits<7> op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>; +} + +multiclass MIMG_Sampler_WQM <bits<7> op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>; +} + +class MIMG_Gather_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; 
+  let mayStore = 0;
+
+  // DMASK was repurposed for GATHER4. 4 components are always
+  // returned and DMASK works like a swizzle - it selects
+  // the component to fetch. The only useful DMASK values are
+  // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+  // (red,red,red,red) etc.) The ISA document doesn't mention
+  // this.
+  // Therefore, disable all code which updates DMASK by setting these two:
+  let MIMG = 0;
+  let hasPostISelHook = 0;
+  let WQM = wqm;
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+                                   RegisterClass dst_rc,
+                                   int channels, int wqm> {
+  def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
+             MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm> {
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>;
+}
+
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> {
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector instruction mappings
+//===----------------------------------------------------------------------===//
+
+// Maps an opcode in e32 form to its e64 equivalent
+def getVOPe64 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["Size"];
+  let KeyCol = ["4"];
+  let ValueCols = [["8"]];
+}
+
+// Maps an opcode in e64 form to its e32 equivalent
+def getVOPe32 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["Size"];
+  let KeyCol = ["8"];
+  let ValueCols = [["4"]];
+}
+
+def getMaskedMIMGOp : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["4"];
+  let ValueCols = [["1"], ["2"], ["3"] ];
+}
+
+// Maps a commuted opcode to its original version
+def getCommuteOrig : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an original opcode to its commuted version
+def getCommuteRev : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+// Maps a commuted compare opcode to its original version
+def getCommuteCmpOrig : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an original compare opcode to its commuted version
+def getCommuteCmpRev : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+
+def getMCOpcodeGen : InstrMapping {
+  let FilterClass = "SIMCInstr";
+  let RowFields = ["PseudoInstr"];
+  let ColFields = ["Subtarget"];
+  let KeyCol = [!cast<string>(SISubtarget.NONE)];
+  let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]];
+}
+
+def getAddr64Inst : InstrMapping {
+  let FilterClass = "MUBUFAddr64Table";
+  let RowFields = ["OpName"];
+  let ColFields = ["IsAddr64"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an atomic opcode to its version with a return value.
+def getAtomicRetOp : InstrMapping {
+  let FilterClass = "AtomicNoRet";
+  let RowFields = ["NoRetOp"];
+  let ColFields = ["IsRet"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an atomic opcode to its returnless version.
+def getAtomicNoRetOp : InstrMapping {
+  let FilterClass = "AtomicNoRet";
+  let RowFields = ["NoRetOp"];
+  let ColFields = ["IsRet"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+include "SIInstructions.td"
+include "CIInstructions.td"
+include "VIInstructions.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
new file mode 100644
index 0000000..8c8d836
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -0,0 +1,3327 @@
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file was originally auto-generated from a GPU register header file and
+// all the instruction definitions were originally commented out. Instructions
+// that are not yet supported remain commented out.
+//===----------------------------------------------------------------------===//
+
+class InterpSlots {
+int P0 = 2;
+int P10 = 0;
+int P20 = 1;
+}
+def INTERP : InterpSlots;
+
+def InterpSlot : Operand<i32> {
+  let PrintMethod = "printInterpSlot";
+}
+
+def SendMsgImm : Operand<i32> {
+  let PrintMethod = "printSendMsg";
+}
+
+def isGCN : Predicate<"Subtarget->getGeneration() "
+                      ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+            AssemblerPredicate<"FeatureGCN">;
+def isSI : Predicate<"Subtarget->getGeneration() "
+                     "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+
+def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
+def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
+
+def SWaitMatchClass : AsmOperandClass {
+  let Name = "SWaitCnt";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "parseSWaitCntOps";
+}
+
+def WAIT_FLAG : InstFlag<"printWaitFlag"> {
+  let ParserMatchClass = SWaitMatchClass;
+}
+
+let SubtargetPredicate = isGCN in {
+
+//===----------------------------------------------------------------------===//
+// EXP Instructions
+//===----------------------------------------------------------------------===//
+
+defm EXP : EXP_m;
+
+//===----------------------------------------------------------------------===//
+// SMRD Instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1 in {
+
+// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SGPR_32 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
+  0x08, "s_buffer_load_dword", SReg_128, SGPR_32
+>;
+
+defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
+  0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
+  0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
+  0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
+  0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512
+>;
+
+} // mayLoad = 1
+
+//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let isMoveImm = 1 in {
+  let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+    defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>;
+    defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>;
+  } // End isReMaterializable = 1, isAsCheapAsAMove = 1
+
+  let Uses = [SCC] in {
+    defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>;
+    defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>;
+  } // End Uses = [SCC]
+} // End isMoveImm = 1
+
+let Defs = [SCC] in {
+  defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32",
+    [(set i32:$dst, (not i32:$src0))]
+  >;
+
+  defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
+    [(set i64:$dst, (not i64:$src0))]
+  >;
+  defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
+  defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
+} // End Defs = [SCC]
+
+
+defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
+  [(set i32:$dst, (AMDGPUbrev i32:$src0))]
+>;
+defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
+
+let Defs = [SCC] in {
+  defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;
+  defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
+  defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
+    [(set i32:$dst, (ctpop i32:$src0))]
+  >;
+  defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>;
+} // End Defs = [SCC]
+
+defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>;
+defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>;
+defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
+  [(set i32:$dst, (cttz_zero_undef i32:$src0))]
+>;
+defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
+
+defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
+  [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
+>;
+
+defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
+defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32",
+  [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))]
+>;
+defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
+defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
+  [(set i32:$dst, (sext_inreg i32:$src0, i8))]
+>;
+defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>,
"s_sext_i32_i16", + [(set i32:$dst, (sext_inreg i32:$src0, i16))] +>; + +defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>; +defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; +defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; +defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; +defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>; +defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>; +defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>; +defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { + +defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>; +defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>; +defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>; +defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>; +defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>; +defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>; +defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>; + +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] + +defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; +defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; +defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; +defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; +defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; +defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; +defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; +let Defs = [SCC] in { + defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>; +} // End Defs = [SCC] +defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>; + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>; +defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32", + [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] +>; +} // End isCommutable = 1 + +defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>; +defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32", + [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in { +defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32", + [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End isCommutable = 1 + +defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32", + [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End Uses = [SCC] + +defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", + [(set i32:$dst, (smin i32:$src0, i32:$src1))] +>; +defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32", + [(set i32:$dst, (umin i32:$src0, i32:$src1))] +>; +defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32", + [(set i32:$dst, (smax i32:$src0, i32:$src1))] +>; +defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32", + 
[(set i32:$dst, (umax i32:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + + +let Uses = [SCC] in { + defm S_CSELECT_B32 : SOP2_32 <sop2<0x0a>, "s_cselect_b32", []>; + defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>; +} // End Uses = [SCC] + +let Defs = [SCC] in { +defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32", + [(set i32:$dst, (and i32:$src0, i32:$src1))] +>; + +defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64", + [(set i64:$dst, (and i64:$src0, i64:$src1))] +>; + +defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; + +defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64", + [(set i64:$dst, (or i64:$src0, i64:$src1))] +>; + +defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; + +defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64", + [(set i64:$dst, (xor i64:$src0, i64:$src1))] +>; +defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>; +defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>; +defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>; +defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>; +defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>; +defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>; +defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>; +defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>; +defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>; +defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>; +} // End Defs = [SCC] + +// Use added complexity so these patterns are preferred to the VALU patterns. +let AddedComplexity = 1 in { +let Defs = [SCC] in { + +defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; +defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64", + [(set i64:$dst, (shl i64:$src0, i32:$src1))] +>; +defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64", + [(set i64:$dst, (srl i64:$src0, i32:$src1))] +>; +defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; +defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64", + [(set i64:$dst, (sra i64:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + +defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; +defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>; +defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", + [(set i32:$dst, (mul i32:$src0, i32:$src1))] +>; + +} // End AddedComplexity = 1 + +let Defs = [SCC] in { +defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>; +defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>; +defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>; +defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>; +} // End Defs = [SCC] + +let sdst = 0 in { +defm S_CBRANCH_G_FORK : SOP2_m < + sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), + (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] +>; +} + +let Defs = [SCC] in { +defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>; +} // End Defs = [SCC] + +//===----------------------------------------------------------------------===// +// SOPC Instructions 
+//===----------------------------------------------------------------------===// + +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + +let isReMaterializable = 1 in { +defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; +} // End isReMaterializable = 1 +let Uses = [SCC] in { + defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>; +} + +let isCompare = 1 in { + +/* +This instruction is disabled for now until we can figure out how to teach +the instruction selector to correctly use the S_CMP* vs V_CMP* +instructions. + +When this instruction is enabled the code generator sometimes produces this +invalid sequence: + +SCC = S_CMPK_EQ_I32 SGPR0, imm +VCC = COPY SCC +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 + +defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", + [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] +>; +*/ + +defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", []>; +defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>; +defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>; +defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>; +defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>; +defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>; +defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>; +defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>; +defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>; +defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>; +defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>; +defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>; +} // End isCompare = 1 + +let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", + Constraints = "$sdst = $src0" in { + defm S_ADDK_I32 : SOPK_32TIE <sopk<0x0f, 0x0e>, "s_addk_i32", []>; + defm S_MULK_I32 : SOPK_32TIE <sopk<0x10, 0x0f>, "s_mulk_i32", []>; +} + +defm S_CBRANCH_I_FORK : SOPK_m < + sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), + (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>; +defm S_SETREG_B32 : SOPK_m < + sopk<0x13, 0x12>, "s_setreg_b32", (outs), + (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +// FIXME: Not on SI? 
+//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>; +defm S_SETREG_IMM32_B32 : SOPK_IMM32 < + sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), + (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" +>; + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", + [(IL_retflag)]> { + let simm16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", + [(br bb:$simm16)]> { + let isBarrier = 1; +} + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc0 $simm16" +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc1 $simm16" +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccz $simm16" +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccnz $simm16" +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execz $simm16" +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execnz $simm16" +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", + [(int_AMDGPU_barrier_local)] +> { + let simm16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; +def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; + +let Uses = [EXEC, M0] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", + [(AMDGPUsendmsg (i32 imm:$simm16))] + >; +} // End Uses = [EXEC, M0] + +def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { + let simm16 = 0; +} +def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; +def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; +def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { + let simm16 = 0; +} +} // End hasSideEffects + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + +let isCompare = 1, isCommutable = 1 in { + +defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">; +defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; +defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>; +defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; +defm 
V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>; +defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>; +defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>; +defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>; +defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>; +defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; +defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>; +defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; +defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>; +defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>; +defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>; +defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">; + + +defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32", "v_cmpx_gt_f32">; +defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32", "v_cmpx_ge_f32">; +defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">; +defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">; +defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">; +defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">; +defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">; +defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">; +defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">; +defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">; +defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">; +defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">; +defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">; +defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">; + + +defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">; +defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; +defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>; +defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; +defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>; +defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>; +defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>; +defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>; +defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>; +defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; +defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>; +defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; +defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>; +defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>; +defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>; +defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">; + + +defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64", "v_cmpx_gt_f64">; +defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, 
"v_cmpx_eq_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64", "v_cmpx_ge_f64">; +defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">; +defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">; +defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">; +defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">; +defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64", "v_cmpx_nle_f64">; +defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; +defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">; +defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">; +defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">; +defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">; + + +let SubtargetPredicate = isSICI in { + +defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; +defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; +defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">; +defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">; +defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">; +defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">; +defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; +defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; +defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">; +defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">; +defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">; +defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">; + + +defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; +defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; +defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">; +defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">; +defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">; +defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">; +defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; +defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; +defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">; +defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">; +defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">; +defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">; + + +defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">; +defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; +defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64", COND_NULL, 
"v_cmps_ge_f64">; +defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">; +defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">; +defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">; +defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">; +defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; +defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; +defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">; +defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">; +defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">; +defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">; + + +defm V_CMPSX_F_F64 : VOPCX_F64 <vopc<0x70>, "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPCX_F64 <vopc<0x71>, "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; +defm V_CMPSX_EQ_F64 : VOPCX_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPCX_F64 <vopc<0x73>, "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; +defm V_CMPSX_GT_F64 : VOPCX_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPCX_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPCX_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPCX_F64 <vopc<0x77>, "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPCX_F64 <vopc<0x78>, "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPCX_F64 <vopc<0x79>, "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; +defm V_CMPSX_NLG_F64 : VOPCX_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPCX_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; +defm V_CMPSX_NLE_F64 : VOPCX_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPCX_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPCX_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPCX_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; + +} // End SubtargetPredicate = isSICI + +defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">; +defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; +defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>; +defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; +defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>; +defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>; +defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>; +defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">; + + +defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32", "v_cmpx_gt_i32">; +defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32", "v_cmpx_ge_i32">; +defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">; +defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">; +defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">; +defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">; + + +defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">; +defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; +defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>; +defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; +defm V_CMP_GT_I64 : VOPC_I64 
<vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>; +defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>; +defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>; +defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">; + + +defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64", "v_cmpx_gt_i64">; +defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64", "v_cmpx_ge_i64">; +defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">; +defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">; +defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">; +defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">; + + +defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">; +defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; +defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>; +defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; +defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>; +defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>; +defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>; +defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">; + + +defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32", "v_cmpx_gt_u32">; +defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32", "v_cmpx_le_u32">; +defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">; +defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">; +defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">; +defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">; + + +defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">; +defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; +defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>; +defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; +defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>; +defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>; +defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>; +defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">; + +defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64", "v_cmpx_gt_u64">; +defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64", "v_cmpx_ge_u64">; +defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">; +defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">; +defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">; +defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">; + +} // End isCompare = 1, isCommutable = 1 + +defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">; +defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">; +defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">; +defm 
V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">; + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// + +defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; +defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; +defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; +defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; +defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; +defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; +defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; +defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; +defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; +defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; +defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; +defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +let mayLoad = 0 in { +defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +} +defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; +defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; +defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; +defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; +defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; +defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; +defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; +let mayLoad = 0 in { +defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; +defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; +} +defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < + 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < + 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, 
"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; +let SubtargetPredicate = isCI in { +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; +} // End isCI +defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; +let mayStore = 0 in { +defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; +defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; +defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; +defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; +defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; +defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; +} +defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; +defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; +defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; +defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; +defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; +defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; +defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; +defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; +defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; +defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; +defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; +defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; +defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; +defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; +defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +let mayLoad = 0 in { +defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +} +defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; +defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; +defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; +defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; + +defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; +defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; +defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; +defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; +defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; +defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; +defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; +defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET 
<0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; +defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; +defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; + +let mayStore = 0 in { +defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; +defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; +} + +defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; +defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; +defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; +defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; +defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; +defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; +defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; +defm DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; +defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; +defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; +defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; +defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; + +defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; +defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; + +defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; +defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; +defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; +defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; +defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; +defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; +defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; +defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; +defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; +defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; +defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; +defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; + +defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; +defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; + +//let SubtargetPredicate = isCI in { +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 +//} // End isCI + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < + mubuf<0x00>, "buffer_load_format_x", VGPR_32 +>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < + mubuf<0x01>, "buffer_load_format_xy", VReg_64 +>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < + mubuf<0x02>, "buffer_load_format_xyz", VReg_96 +>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < + mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 +>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < + mubuf<0x04>, "buffer_store_format_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < + mubuf<0x05>, "buffer_store_format_xy", VReg_64 +>; +defm BUFFER_STORE_FORMAT_XYZ : 
MUBUF_Store_Helper < + mubuf<0x06>, "buffer_store_format_xyz", VReg_96 +>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < + mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 +>; +defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < + mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global +>; +defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < + mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global +>; +defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < + mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global +>; +defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < + mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global +>; +defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load +>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load +>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load +>; + +defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < + mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global +>; + +defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < + mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global +>; + +defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < + mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store +>; + +defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store +>; + +defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < + mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store +>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < + mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global +>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < + mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < + mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global +>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < + mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < + mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < + mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < + mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND : MUBUF_Atomic < + mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global +>; +defm BUFFER_ATOMIC_OR : MUBUF_Atomic < + mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < + mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global +>; +//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 
<mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI +//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; + +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// + +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; 
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, 
"image_sample_c_d_o">; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; +defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let vdst = 0, src0 = 0 in { +defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +} + +let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; +} // End isMoveImm = 1 + 
+let Uses = [EXEC] in { + +// FIXME: Specify SchedRW for READFIRSTLANE_B32 + +def V_READFIRSTLANE_B32 : VOP1 < + 0x00000002, + (outs SReg_32:$vdst), + (ins VGPR_32:$src0), + "v_readfirstlane_b32 $vdst, $src0", + [] +>; + +} + +let SchedRW = [WriteQuarterRate32] in { + +defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64", + VOP_I32_F64, fp_to_sint +>; +defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32", + VOP_F64_I32, sint_to_fp +>; +defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32", + VOP_F32_I32, sint_to_fp +>; +defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32", + VOP_F32_I32, uint_to_fp +>; +defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32", + VOP_I32_F32, fp_to_uint +>; +defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32", + VOP_I32_F32, fp_to_sint +>; +defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32", + VOP_I32_F32, fp_to_f16 +>; +defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16", + VOP_F32_I32, f16_to_fp +>; +defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32", + VOP_I32_F32, cvt_rpi_i32_f32>; +defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32", + VOP_I32_F32, cvt_flr_i32_f32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>; +defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64", + VOP_F32_F64, fround +>; +defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32", + VOP_F64_F32, fextend +>; +defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0", + VOP_F32_I32, AMDGPUcvt_f32_ubyte0 +>; +defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1", + VOP_F32_I32, AMDGPUcvt_f32_ubyte1 +>; +defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2", + VOP_F32_I32, AMDGPUcvt_f32_ubyte2 +>; +defm V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3", + VOP_F32_I32, AMDGPUcvt_f32_ubyte3 +>; +defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64", + VOP_I32_F64, fp_to_uint +>; +defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", + VOP_F64_I32, uint_to_fp +>; + +} // let SchedRW = [WriteQuarterRate32] + +defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32", + VOP_F32_F32, AMDGPUfract +>; +defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32", + VOP_F32_F32, ftrunc +>; +defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32", + VOP_F32_F32, fceil +>; +defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32", + VOP_F32_F32, frint +>; +defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32", + VOP_F32_F32, ffloor +>; +defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32", + VOP_F32_F32, fexp2 +>; + +let SchedRW = [WriteQuarterRate32] in { + +defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32", + VOP_F32_F32, flog2 +>; +defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32", + VOP_F32_F32, AMDGPUrcp +>; +defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32", + VOP_F32_F32 +>; +defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32", + VOP_F32_F32, AMDGPUrsq +>; + +} //let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64", + VOP_F64_F64, AMDGPUrcp +>; +defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64", + VOP_F64_F64, AMDGPUrsq +>; + +} // let SchedRW = [WriteDouble]; + +defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32", + VOP_F32_F32, fsqrt +>; + +let SchedRW = [WriteDouble] in { + +defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", + VOP_F64_F64, fsqrt +>; + +} // let SchedRW = 
[WriteDouble] + +defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", + VOP_F32_F32, AMDGPUsin +>; +defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", + VOP_F32_F32, AMDGPUcos +>; +defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", + VOP_I32_F64 +>; +defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", + VOP_F64_F64 +>; +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; +defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", + VOP_I32_F32 +>; +defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", + VOP_F32_F32 +>; +let vdst = 0, src0 = 0 in { +defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], + "v_clrexcp" +>; +} +defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; +defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; +defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; + +// These instruction only exist on SI and CI +let SubtargetPredicate = isSICI in { + +let SchedRW = [WriteQuarterRate32] in { + +defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; +defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; +defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; +defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32", + VOP_F32_F32, AMDGPUrsq_clamped +>; +defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", + VOP_F32_F32, AMDGPUrsq_legacy +>; + +} // End let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", + VOP_F64_F64, AMDGPUrsq_clamped +>; + +} // End SchedRW = [WriteDouble] + +} // End SubtargetPredicate = isSICI + +//===----------------------------------------------------------------------===// +// VINTRP Instructions +//===----------------------------------------------------------------------===// + +let Uses = [M0] in { + +// FIXME: Specify SchedRW for VINTRP insturctions. 
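The VINTRP definitions that follow map the AMDGPUinterp_p1/AMDGPUinterp_p2 nodes onto a two-step plane evaluation done per lane, with M0 carrying the LDS parameter base. A rough scalar model of what the v_interp_p1_f32/v_interp_p2_f32 pair computes, assuming the documented semantics; the function and the names P0/P10/P20 (per-attribute plane constants) and I/J (barycentrics) are only for illustration:

    // Sketch of the documented semantics, not code from this change:
    // v_interp_p1_f32 computes d = p10 * i + p0, and v_interp_p2_f32
    // finishes with d = p20 * j + d.
    static float interpolateAttribute(float P0, float P10, float P20,
                                      float I, float J) {
      float D = P10 * I + P0; // v_interp_p1_f32
      D = P20 * J + D;        // v_interp_p2_f32
      return D;
    }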
+ +multiclass V_INTERP_P1_F32_m : VINTRP_m < + 0x00000000, + (outs VGPR_32:$dst), + (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), + (i32 imm:$attr)))] +>; + +let OtherPredicates = [has32BankLDS] in { + +defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS] + +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { + +defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" + +let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { + +defm V_INTERP_P2_F32 : VINTRP_m < + 0x00000001, + (outs VGPR_32:$dst), + (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" + +defm V_INTERP_MOV_F32 : VINTRP_m < + 0x00000002, + (outs VGPR_32:$dst), + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), + "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End Uses = [M0] + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +multiclass V_CNDMASK <vop2 op, string name> { + defm _e32 : VOP2_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], + name, name>; + + defm _e64 : VOP3_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, + name#!cast<string>(VOP_CNDMASK.Asm64), [], name, 3>; +} + +defm V_CNDMASK_B32 : V_CNDMASK<vop2<0x0>, "v_cndmask_b32">; + +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", + VOP_F32_F32_F32, fadd +>; + +defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32", + VOP_F32_F32_F32, null_frag, "v_sub_f32" +>; +} // End isCommutable = 1 + +let isCommutable = 1 in { + +defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32", + VOP_F32_F32_F32, int_AMDGPU_mul +>; + +defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32", + VOP_F32_F32_F32, fmul +>; + +defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24", + VOP_I32_I32_I32, AMDGPUmul_i24 +>; + +defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa,0x7>, "v_mul_hi_i32_i24", + VOP_I32_I32_I32 +>; + +defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24", + VOP_I32_I32_I32, AMDGPUmul_u24 +>; + +defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc,0x9>, "v_mul_hi_u32_u24", + VOP_I32_I32_I32 +>; + +defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32, + fminnum>; +defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32, + fmaxnum>; +defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>; +defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>; +defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>; +defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>; + +defm V_LSHRREV_B32 : VOP2Inst < + vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshr_b32" +>; + +defm V_ASHRREV_I32 : VOP2Inst < + vop2<0x18, 0x11>, "v_ashrrev_i32", 
VOP_I32_I32_I32, null_frag, + "v_ashr_i32" +>; + +defm V_LSHLREV_B32 : VOP2Inst < + vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshl_b32" +>; + +defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>; +defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>; +defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; + +defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>; +} // End isCommutable = 1 + +defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">; + +let isCommutable = 1 in { +defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; +} // End isCommutable = 1 + +let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +// No patterns so that the scalar instructions are always selected. +// The scalar versions will be replaced with vector when needed later. + +// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, +// but the VI instructions behave the same as the SI versions. +defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", + VOP_I32_I32_I32, add +>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; + +defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", + VOP_I32_I32_I32, null_frag, "v_sub_i32" +>; + +let Uses = [VCC] in { // Carry-in comes from VCC +defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", + VOP_I32_I32_I32_VCC +>; +defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", + VOP_I32_I32_I32_VCC +>; +defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", + VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" +>; + +} // End Uses = [VCC] +} // End isCommutable = 1, Defs = [VCC] + +defm V_READLANE_B32 : VOP2SI_3VI_m < + vop3 <0x001, 0x289>, + "v_readlane_b32", + (outs SReg_32:$vdst), + (ins VGPR_32:$src0, SCSrc_32:$src1), + "v_readlane_b32 $vdst, $src0, $src1" +>; + +defm V_WRITELANE_B32 : VOP2SI_3VI_m < + vop3 <0x002, 0x28a>, + "v_writelane_b32", + (outs VGPR_32:$vdst), + (ins SReg_32:$src0, SCSrc_32:$src1), + "v_writelane_b32 $vdst, $src0, $src1" +>; + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmin_legacy +>; +defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmax_legacy +>; + +let isCommutable = 1 in { +defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>; +} // End isCommutable = 1 +} // End let SubtargetPredicate = SICI + +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32", + VOP_F32_F32_F32 +>; +} // End isCommutable = 1 + +defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", + VOP_I32_I32_I32 +>; +defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32 +>; +defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", + VOP_F32_F32_I32, AMDGPUldexp +>; + +defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32", + VOP_I32_F32_I32>; // TODO: set "Uses = dst" + +defm 
V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 +>; +defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32", + VOP_I32_I32_I32 +>; +defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32", + VOP_I32_I32_I32 +>; + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { +defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32", + VOP_F32_F32_F32_F32, fmad +>; + +defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24", + VOP_I32_I32_I32_I32, AMDGPUmad_i24 +>; +defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24", + VOP_I32_I32_I32_I32, AMDGPUmad_u24 +>; +} // End isCommutable = 1 + +defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", + VOP_I32_I32_I32_I32, AMDGPUbfe_u32 +>; +defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32", + VOP_I32_I32_I32_I32, AMDGPUbfe_i32 +>; + +defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32", + VOP_I32_I32_I32_I32, AMDGPUbfi +>; + +let isCommutable = 1 in { +defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32", + VOP_F32_F32_F32_F32, fma +>; +defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64", + VOP_F64_F64_F64_F64, fma +>; +} // End isCommutable = 1 + +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; +defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32", + VOP_I32_I32_I32_I32 +>; +defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32", + VOP_I32_I32_I32_I32 +>; + +defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmin3>; + +defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmin3 +>; +defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32", + VOP_I32_I32_I32_I32, AMDGPUumin3 +>; +defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmax3 +>; +defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmax3 +>; +defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32", + VOP_I32_I32_I32_I32, AMDGPUumax3 +>; +defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32", + VOP_F32_F32_F32_F32 +>; +defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32", + VOP_I32_I32_I32_I32 +>; +defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", + VOP_I32_I32_I32_I32 +>; + +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; +defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32", + VOP_I32_I32_I32_I32 +>; +////def V_CVT_PK_U8_F32 : 
VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; +defm V_DIV_FIXUP_F32 : VOP3Inst < + vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup +>; + +let SchedRW = [WriteDouble] in { + +defm V_DIV_FIXUP_F64 : VOP3Inst < + vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup +>; + +} // let SchedRW = [WriteDouble] + +let SchedRW = [WriteDouble] in { +let isCommutable = 1 in { + +defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", + VOP_F64_F64_F64, fadd +>; +defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64", + VOP_F64_F64_F64, fmul +>; + +defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64", + VOP_F64_F64_F64, fminnum +>; +defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64", + VOP_F64_F64_F64, fmaxnum +>; + +} // isCommutable = 1 + +defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", + VOP_F64_F64_I32, AMDGPUldexp +>; + +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { + +defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32", + VOP_I32_I32_I32 +>; + +defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", + VOP_I32_I32_I32 +>; + +} // isCommutable = 1, SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteFloatFMA, WriteSALU] in { +defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +} + +let SchedRW = [WriteDouble, WriteSALU] in { +// Double precision division pre-scale. +defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, Uses = [VCC] in { + +// v_div_fmas_f32: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^32 +// +defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", + VOP_F32_F32_F32_F32, AMDGPUdiv_fmas +>; + +let SchedRW = [WriteDouble] in { +// v_div_fmas_f64: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^64 +// +defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", + VOP_F64_F64_F64_F64, AMDGPUdiv_fmas +>; + +} // End SchedRW = [WriteDouble] +} // End isCommutable = 1 + +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; + +let SchedRW = [WriteDouble] in { +defm V_TRIG_PREOP_F64 : VOP3Inst < + vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop +>; + +} // let SchedRW = [WriteDouble] + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>; +defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>; +defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>; + +defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", + VOP_F32_F32_F32_F32>; + +} // End SubtargetPredicate = isSICI + +let SubtargetPredicate = isVI in { + +defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64", + VOP_I64_I32_I64 +>; +defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64", + VOP_I64_I32_I64 +>; +defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", + VOP_I64_I32_I64 +>; + +} // End SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// Pseudo 
Instructions +//===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, isPseudo = 1 in { + +// For use in patterns +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] +>; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +// 64-bit vector move instruction. This is mainly used by the SIFoldOperands +// pass to enable folding of inline immediates. +def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 + +let hasSideEffects = 1 in { +def SGPR_USE : InstSI <(outs),(ins), "", []>; +} + +// SI pseudo instructions. These are used by the CFG structurizer pass +// and should be lowered to ISA instructions prior to codegen. + +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { +let Uses = [EXEC], Defs = [EXEC] in { + +let isBranch = 1, isTerminator = 1 in { + +def SI_IF: InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, brtarget:$target), + "", + [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target), + "", + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] +> { + let Constraints = "$src = $dst"; +} + +def SI_LOOP : InstSI < + (outs), + (ins SReg_64:$saved, brtarget:$target), + "si_loop $saved, $target", + [(int_SI_loop i64:$saved, bb:$target)] +>; + +} // end isBranch = 1, isTerminator = 1 + +def SI_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src), + "si_else $dst, $src", + [(set i64:$dst, (int_SI_break i64:$src))] +>; + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, SReg_64:$src), + "si_if_break $dst, $vcc, $src", + [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src0, SReg_64:$src1), + "si_else_break $dst, $src0, $src1", + [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] +>; + +def SI_END_CF : InstSI < + (outs), + (ins SReg_64:$saved), + "si_end_cf $saved", + [(int_SI_end_cf i64:$saved)] +>; + +} // End Uses = [EXEC], Defs = [EXEC] + +let Uses = [EXEC], Defs = [EXEC,VCC] in { +def SI_KILL : InstSI < + (outs), + (ins VSrc_32:$src), + "si_kill $src", + [(int_AMDGPU_kill f32:$src)] +>; +} // End Uses = [EXEC], Defs = [EXEC,VCC] + +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 + +let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { + +//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; + +let UseNamedOperandTable = 1 in { + +def SI_RegisterLoad : InstSI < + (outs VGPR_32:$dst, SReg_64:$temp), + (ins FRAMEri32:$addr, i32imm:$chan), + "", [] +> { + let isRegisterLoad = 1; + let mayLoad = 1; +} + +class SIRegStore<dag outs> : InstSI < + outs, + (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), + "", [] +> { + let isRegisterStore = 1; + let mayStore = 1; +} + +let usesCustomInserter = 1 in { +def SI_RegisterStorePseudo : SIRegStore<(outs)>; +} // End usesCustomInserter = 1 +def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; + + +} // End UseNamedOperandTable = 1 + +def SI_INDIRECT_SRC : InstSI < + (outs VGPR_32:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + "si_indirect_src $dst, $temp, $src, $idx, $off", + [] +>; + +class SI_INDIRECT_DST<RegisterClass rc> : InstSI < + (outs rc:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), + "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", + [] +> { 
+ let Constraints = "$src = $dst"; +} + +def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; +def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; +def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; +def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; +def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; + +} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] + +multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { + + let UseNamedOperandTable = 1 in { + def _SAVE : InstSI < + (outs), + (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1 +} + +// It's unclear whether you can use M0 as the output of v_readlane_b32 +// instructions, so use SGPR_32 register class for spills to prevent +// this from happening. +defm SI_SPILL_S32 : SI_SPILL_SGPR <SGPR_32>; +defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; +defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; +defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; +defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; + +multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { + let UseNamedOperandTable = 1, VGPRSpill = 1 in { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1, VGPRSpill = 1 +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; + +let Defs = [SCC] in { + +def SI_CONSTDATA_PTR : InstSI < + (outs SReg_64:$dst), + (ins), + "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] +>; + +} // End Defs = [SCC] + +} // end IsCodeGenOnly, isPseudo + +} // end SubtargetPredicate = isGCN + +let Predicates = [isGCN] in { + +def : Pat< + (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), + (V_CNDMASK_B32_e64 $src2, $src1, + (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, + DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : Pat < + (int_AMDGPU_kilp), + (SI_KILL 0xbf800000) +>; + +/* int_SI_vs_load_input */ +def : Pat< + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) +>; + +/* int_SI_export */ +def : Pat < + (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, + f32:$src0, f32:$src1, f32:$src2, f32:$src3), + (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, + $src0, $src1, $src2, $src3) +>; + +//===----------------------------------------------------------------------===// +// SMRD Patterns +//===----------------------------------------------------------------------===// + +multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + + // 1. SI-CI: Offset as 8bit DWORD immediate + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), + (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. 
No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + + // 1. VI: Offset as 20bit immediate in bytes + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), + (vt (Instr_IMM $sbase, (as_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +let Predicates = [isSICI] in { +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; +} // End Predicates = [isSICI] + +let Predicates = [isVI] in { +defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; +} // End Predicates = [isVI] + +let Predicates = [isSICI] in { + +// 1. Offset as 8bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) +>; + +} // End Predicates = [isSICI] + +// 2. Offset loaded in an 32bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) +>; + +//===----------------------------------------------------------------------===// +// SOP1 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i64 (ctpop i64:$src)), + (i64 (REG_SEQUENCE SReg_64, + (S_BCNT1_I32_B64 $src), sub0, + (S_MOV_B32 0), sub1)) +>; + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector +// case, the sgpr-copies pass will fix this to use the vector version. 
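As a point of reference for the addc pattern that follows, the carry-out these adds produce (SCC for S_ADD_U32, VCC for V_ADD_I32) is the ordinary unsigned-overflow bit. A minimal host-side sketch of that behaviour, assuming nothing beyond standard wrapping arithmetic; it is not the backend code itself:

    #include <cstdint>
    #include <utility>

    // Models a GCN 32-bit add with carry-out: the sum wraps modulo 2^32 and
    // the carry bit is set exactly when the true sum does not fit in 32 bits.
    static std::pair<uint32_t, bool> addCarryOut(uint32_t a, uint32_t b) {
      uint32_t sum = a + b;   // wraps on overflow
      bool carry = sum < a;   // unsigned overflow is the carry-out
      return {sum, carry};
    }

For example, addCarryOut(0xffffffffu, 1u) yields {0, true}, which is the case the selected S_ADD_U32 reports through SCC.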
+def : Pat < + (i32 (addc i32:$src0, i32:$src1)), + (S_ADD_U32 $src0, $src1) +>; + +//===----------------------------------------------------------------------===// +// SOPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_AMDGPU_barrier_global), + (S_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [UnsafeFPMath] in { + +//def : RcpPat<V_RCP_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F32_e32, f32>; + +def : RsqPat<V_RSQ_F32_e32, f32>; +def : RsqPat<V_RSQ_F64_e32, f64>; +} + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; + +def : Pat < + (i32 (select i1:$src0, i32:$src1, i32:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + +/********** ======================= **********/ +/********** Image sampling patterns **********/ +/********** ======================= **********/ + +// Image + sampler +class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc, $sampler) +>; + +multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>; +} + +// Image only +class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc) +>; + +multiclass ImagePatterns<SDPatternOperator name, string opcode> { + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; +} + +// Basic sample +defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; +defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; +defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">; +defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">; +defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison +defm : 
SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">; +defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets +defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">; +defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets +defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; + +// Gather opcodes +// Only the variants which make sense are defined. 
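All of these sample and gather patterns forward the intrinsic's dmask operand unchanged via as_i32imm. dmask is a 4-bit per-channel mask; for ordinary sample/load operations the instruction returns one dword per set bit, while gather4 (below) is documented to take a single-bit dmask selecting which component is gathered from the four texels. A small illustrative helper, a sketch only and not part of the backend:

    #include <bitset>
    #include <cstdint>

    // Number of result dwords an ordinary MIMG sample/load writes for a given
    // 4-bit dmask (bit i enables channel i: x, y, z, w); tfe is ignored here.
    static unsigned mimgResultDwords(uint32_t dmask) {
      return std::bitset<4>(dmask & 0xFu).count();
    }
    // e.g. dmask = 0xf -> 4 dwords (xyzw), dmask = 0x1 -> 1 dword (x only)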
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>; + +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; + +def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; +defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; +defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; + +/* SIsample for simple 1D texture lookup */ +def : Pat < + (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleRectPattern<SDNode name, MIMG 
opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), + (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowPattern<SDNode name, MIMG opcode, + ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowArrayPattern<SDNode name, MIMG opcode, + ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +/* SIsample* for texture lookups consuming more address parameters */ +multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l, + MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b, +MIMG sample_d, MIMG sample_c_d, ValueType addr_type> { + def : SamplePattern <SIsample, sample, addr_type>; + def : SampleRectPattern <SIsample, sample, addr_type>; + def : SampleArrayPattern <SIsample, sample, addr_type>; + def : SampleShadowPattern <SIsample, sample_c, addr_type>; + def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>; + + def : SamplePattern <SIsamplel, sample_l, addr_type>; + def : SampleArrayPattern <SIsamplel, sample_l, addr_type>; + def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>; + def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>; + + def : SamplePattern <SIsampleb, sample_b, addr_type>; + def : SampleArrayPattern <SIsampleb, sample_b, addr_type>; + def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>; + def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>; + + def : SamplePattern <SIsampled, sample_d, addr_type>; + def : SampleArrayPattern <SIsampled, sample_d, addr_type>; + def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>; + def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>; +} + +defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2, + IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2, + IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2, + IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2, + v2i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4, + IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4, + IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4, + IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4, + v4i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8, + IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8, + IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8, + IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8, + v8i32>; +defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16, + IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16, + IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16, + IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, + v16i32>; + +/* int_SI_imageload for texture fetches consuming varying address parameters */ +class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), + 
(opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> { + def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>; + def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>; +} + +multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> { + def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>; + def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>; +} + +defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>; +defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>; + +defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>; +defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>; + +/* Image resource information */ +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +/********** ============================================ **********/ +/********** Extraction, Insertion, Building and Casting **********/ +/********** ============================================ **********/ + +foreach Index = 0-2 in { + def Extract_Element_v2i32_#Index : Extract_Element < + i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v2i32_#Index : Insert_Element < + i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v2f32_#Index : Extract_Element < + f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v2f32_#Index : Insert_Element < + f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-3 in { + def Extract_Element_v4i32_#Index : Extract_Element < + i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v4i32_#Index : Insert_Element < + i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v4f32_#Index : Extract_Element < + f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v4f32_#Index : Insert_Element < + f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-7 in { + def Extract_Element_v8i32_#Index : Extract_Element < + i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v8i32_#Index : Insert_Element < + i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v8f32_#Index : Extract_Element < + f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v8f32_#Index : Insert_Element < + f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-15 in { + def Extract_Element_v16i32_#Index : Extract_Element < + i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v16i32_#Index : Insert_Element < + i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v16f32_#Index : Extract_Element < + f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v16f32_#Index : Insert_Element < + f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +def : BitConvert <i32, f32, 
SReg_32>; +def : BitConvert <i32, f32, VGPR_32>; + +def : BitConvert <f32, i32, SReg_32>; +def : BitConvert <f32, i32, VGPR_32>; + +def : BitConvert <i64, f64, VReg_64>; + +def : BitConvert <f64, i64, VReg_64>; + +def : BitConvert <v2f32, v2i32, VReg_64>; +def : BitConvert <v2i32, v2f32, VReg_64>; +def : BitConvert <v2i32, i64, VReg_64>; +def : BitConvert <i64, v2i32, VReg_64>; +def : BitConvert <v2f32, i64, VReg_64>; +def : BitConvert <i64, v2f32, VReg_64>; +def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <f64, v2i32, VReg_64>; +def : BitConvert <v4f32, v4i32, VReg_128>; +def : BitConvert <v4i32, v4f32, VReg_128>; + +def : BitConvert <v8f32, v8i32, SReg_256>; +def : BitConvert <v8i32, v8f32, SReg_256>; +def : BitConvert <v8i32, v32i8, SReg_256>; +def : BitConvert <v32i8, v8i32, SReg_256>; +def : BitConvert <v8i32, v32i8, VReg_256>; +def : BitConvert <v8i32, v8f32, VReg_256>; +def : BitConvert <v8f32, v8i32, VReg_256>; +def : BitConvert <v32i8, v8i32, VReg_256>; + +def : BitConvert <v16i32, v16f32, VReg_512>; +def : BitConvert <v16f32, v16i32, VReg_512>; + +/********** =================== **********/ +/********** Src & Dst modifiers **********/ +/********** =================== **********/ + +def : Pat < + (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), + (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) +>; + +/********** ================================ **********/ +/********** Floating point absolute/negative **********/ +/********** ================================ **********/ + +// Prevent expanding both fneg and fabs. + +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f32:$src)), + (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ +>; + +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f64:$src)), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), // Set sign bit. + sub1) +>; + +def : Pat < + (fabs f32:$src), + (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) +>; + +def : Pat < + (fneg f32:$src), + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) +>; + +def : Pat < + (fabs f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. + sub1) +>; + +def : Pat < + (fneg f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), + sub1) +>; + +/********** ================== **********/ +/********** Immediate Patterns **********/ +/********** ================== **********/ + +def : Pat < + (SGPRImm<(i32 imm)>:$imm), + (S_MOV_B32 imm:$imm) +>; + +def : Pat < + (SGPRImm<(f32 fpimm)>:$imm), + (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i32 imm:$imm), + (V_MOV_B32_e32 imm:$imm) +>; + +def : Pat < + (f32 fpimm:$imm), + (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i64 InlineImm<i64>:$imm), + (S_MOV_B64 InlineImm<i64>:$imm) +>; + +// XXX - Should this use a s_cmp to set SCC? 
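For context on the i1 immediate pattern just below: in this backend an i1 value lives in a 64-bit SGPR pair and is treated as a per-lane mask, so a constant true is the all-ones mask and false is zero. A one-line model of that choice, assuming the usual 64-wide wavefront mask representation:

    #include <cstdint>

    // Materialize a uniform i1 as the wave-wide lane mask held in an SReg_64:
    // all lanes true -> all bits set (-1), all lanes false -> 0.
    static uint64_t laneMaskForBool(bool b) {
      return b ? ~UINT64_C(0) : UINT64_C(0);
    }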
+ +// Set to sign-extended 64-bit value (true = -1, false = 0) +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 (i64 (as_i64imm $imm))) +>; + +def : Pat < + (f64 InlineFPImm<f64>:$imm), + (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm))) +>; + +/********** ================== **********/ +/********** Intrinsic Patterns **********/ +/********** ================== **********/ + +/* llvm.AMDGPU.pow */ +def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; + +def : Pat < + (int_AMDGPU_div f32:$src0, f32:$src1), + (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) +>; + +def : Pat < + (int_AMDGPU_cube v4f32:$src), + (REG_SEQUENCE VReg_128, + (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub0, + (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub1, + (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub2, + (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub3) +>; + +def : Pat < + (i32 (sext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) +>; + +class Ext32Pat <SDNode ext> : Pat < + (i32 (ext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) +>; + +def : Ext32Pat <zext>; +def : Ext32Pat <anyext>; + +// Offset in an 32Bit VGPR +def : Pat < + (SIload_constant v4i32:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) +>; + +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (V_CVT_U32_F32_e32 + (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) +>; + +def : Pat < + (int_SI_tid), + (V_MBCNT_HI_U32_B32_e64 0xffffffff, + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) +>; + +//===----------------------------------------------------------------------===// +// VOP3 Patterns +//===----------------------------------------------------------------------===// + +def : IMad24Pat<V_MAD_I32_I24>; +def : UMad24Pat<V_MAD_U32_U24>; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1) +>; + +def : Pat < + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1) +>; + +defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; +def : ROTRPattern <V_ALIGNBIT_B32>; + +/********** ======================= **********/ +/********** Load/Store Patterns **********/ +/********** ======================= **********/ + +class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst $ptr, (as_i16imm $offset), (i1 0)) +>; + +def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>; +def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>; +def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; +def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>; +def : DSReadPat <DS_READ_B32, i32, si_load_local>; + +let AddedComplexity = 100 in { + +def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>; + +} // End AddedComplexity = 100 + +def : Pat < + 
(v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1))), + (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) +>; + +class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>; +def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>; +def : DSWritePat <DS_WRITE_B32, i32, si_store_local>; + +let AddedComplexity = 100 in { + +def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>; +} // End AddedComplexity = 100 + +def : Pat < + (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (i1 0)) +>; + +class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec +// +// We need to use something for the data0, so we set a register to +// -1. For the non-rtn variants, the manual says it does +// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max +// will always do the increment so I'm assuming it's the same. +// +// We also load this -1 with s_mov_b32 / s_mov_b64 even though this +// needs to be a VGPR. The SGPR copy pass will fix this, and it's +// easier since there is no v_mov_b64. +class DSAtomicIncRetPat<DS inst, ValueType vt, + Instruction LoadImm, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), + (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) +>; + + +class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) +>; + + +// 32-bit atomics. +def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, + S_MOV_B32, si_atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, + S_MOV_B32, si_atomic_load_sub_local>; + +def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>; + +def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; + +// 64-bit atomics. 
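Before the 64-bit variants, it may help to spell out the increment trick described above in host code; this is a sketch of the documented DS_INC semantics under the stated data0 = uint_max assumption, not the hardware implementation:

    #include <cstdint>

    // DS_INC_U32 per the comment above: mem = (mem >= data0) ? 0 : mem + 1.
    // With data0 = 0xFFFFFFFF the compare only fires when mem is already the
    // maximum value, so the op degenerates to a plain wrapping increment,
    // which is why the patterns load -1 into the data register.
    static uint32_t dsIncU32(uint32_t &mem, uint32_t data0) {
      uint32_t old = mem;
      mem = (old >= data0) ? 0u : old + 1u;
      return old;  // the RTN forms return the pre-op value
    }

The 64-bit DS_INC_RTN_U64 pattern below relies on the same reasoning with a 64-bit -1 loaded via S_MOV_B64.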
+def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, + S_MOV_B64, si_atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, + S_MOV_B64, si_atomic_load_sub_local>; + +def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>; + +def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>; + + +//===----------------------------------------------------------------------===// +// MUBUF Patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, + PatFrag constant_ld> { + def : Pat < + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +let Predicates = [isSICI] in { +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; +} // End Predicates = [isSICI] + +class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < + (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; + +// BUFFER_LOAD_DWORD*, addr64=0 +multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen, + MUBUF bothen> { + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, + imm:$offset, 0, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 1, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, 
+ imm:$offset, 0, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, + imm:$offset, 1, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; +} + +defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, + BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; +defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, + BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; +defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, + BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; + +class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset)), + (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; + +/* +class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $offset) +>; + +let Predicates = [isSICI] in { +def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>; +} // End Predicates = [isSICI] + +*/ + +//===----------------------------------------------------------------------===// +// MTBUF Patterns +//===----------------------------------------------------------------------===// + +// TBUFFER_STORE_FORMAT_*, addr64=0 +class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat< + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + i32:$soffset, imm:$inst_offset, imm:$dfmt, + imm:$nfmt, imm:$offen, imm:$idxen, + imm:$glc, imm:$slc, imm:$tfe), + (opcode + $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), + (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, + (as_i1imm $slc), (as_i1imm $tfe), $soffset) +>; + +def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; +def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; +def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; +def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; + +let SubtargetPredicate = isCI in { + +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - 
Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + +// Remaining instructions: +// FLAT_* +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// S_DCACHE_INV_VOL +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 + +} // End isCI + +/********** ====================== **********/ +/********** Indirect adressing **********/ +/********** ====================== **********/ + +multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> { + + // 1. Extract with offset + def : Pat< + (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), + (SI_INDIRECT_SRC $vec, $idx, imm:$off) + >; + + // 2. Extract without offset + def : Pat< + (eltvt (vector_extract vt:$vec, i32:$idx)), + (SI_INDIRECT_SRC $vec, $idx, 0) + >; + + // 3. Insert with offset + def : Pat< + (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), + (IndDst $vec, $idx, imm:$off, $val) + >; + + // 4. Insert without offset + def : Pat< + (vector_insert vt:$vec, eltvt:$val, i32:$idx), + (IndDst $vec, $idx, 0, $val) + >; +} + +defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>; +defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>; +defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>; +defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>; + +defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>; +defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>; +defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>; +defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; + +//===----------------------------------------------------------------------===// +// Conversion Patterns +//===----------------------------------------------------------------------===// + +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + +// Handle sext_inreg in i64 +def : Pat < + (i64 (sext_inreg i64:$src, i1)), + (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i8)), + (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i16)), + (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i32)), + (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 +>; + +class ZExt_i64_i32_Pat <SDNode ext> : Pat < + (i64 (ext i32:$src)), + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) +>; + +class ZExt_i64_i1_Pat <SDNode ext> : Pat < + (i64 (ext i1:$src)), + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, + (S_MOV_B32 0), sub1) +>; + + +def : ZExt_i64_i32_Pat<zext>; +def : ZExt_i64_i32_Pat<anyext>; +def : ZExt_i64_i1_Pat<zext>; +def : ZExt_i64_i1_Pat<anyext>; + +def : Pat < + (i64 (sext i32:$src)), + (REG_SEQUENCE SReg_64, $src, sub0, + (S_ASHR_I32 $src, 31), sub1) +>; + +def : Pat < + (i64 (sext i1:$src)), + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 0, -1, $src), sub0, + (V_CNDMASK_B32_e64 0, -1, $src), sub1) +>; + +// If we need to perform a logical operation on i1 values, we need to +// use vector comparisons since there is only one SCC register. Vector +// comparisions still write to a pair of SGPRs, so treat these as +// 64-bit comparisons. 
When legalizing SGPR copies, instructions +// resulting in the copies from SCC to these instructions will be +// moved to the VALU. +def : Pat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) +>; + +def : Pat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) +>; + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +def : Pat < + (f32 (sint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) +>; + +def : Pat < + (f32 (uint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) +>; + +def : Pat < + (f64 (sint_to_fp i1:$src)), + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) +>; + +def : Pat < + (f64 (uint_to_fp i1:$src)), + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) +>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (trunc i64:$a)), + (EXTRACT_SUBREG $a, sub0) +>; + +def : Pat < + (i1 (trunc i32:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) +>; + +def : Pat < + (i1 (trunc i64:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (EXTRACT_SUBREG $a, sub0)), 1) +>; + +def : Pat < + (i32 (bswap i32:$a)), + (V_BFI_B32 (S_MOV_B32 0x00ff00ff), + (V_ALIGNBIT_B32 $a, $a, 24), + (V_ALIGNBIT_B32 $a, $a, 8)) +>; + +def : Pat < + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) +>; + +multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { + def : Pat < + (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (BFM $a, $b) + >; + + def : Pat < + (vt (add (vt (shl 1, vt:$a)), -1)), + (BFM $a, (MOV 0)) + >; +} + +defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; +// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; + +def : BFEPattern <V_BFE_U32, S_MOV_B32>; + +//===----------------------------------------------------------------------===// +// Fract Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isSI] in { + +// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is +// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient +// way to implement it is using V_FRACT_F64. +// The workaround for the V_FRACT bug is: +// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) + +// Convert (x + (-floor(x)) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) +>; + +// Convert floor(x) to (x - fract(x)) +def : Pat < + (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), + (V_ADD_F64 + $mods, + $x, + SRCMODS.NEG, + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isSI] + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + +//============================================================================// +// Miscellaneous Optimization Patterns +//============================================================================// + +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; + +//============================================================================// +// Assembler aliases +//============================================================================// + +def : MnemonicAlias<"v_add_u32", "v_add_i32">; +def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; +def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; + +} // End isGCN predicate diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td new file mode 100644 index 0000000..027a0a2 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -0,0 +1,199 @@ +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// SI Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + + +let TargetPrefix = "SI", isTarget = 1 in { + + def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; + def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; + + // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed + def int_SI_tbuffer_store : Intrinsic < + [], + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW + llvm_i32_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + []>; + + // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed + def int_SI_buffer_load_dword : Intrinsic < + [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + [IntrReadArgMem]>; + + def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Fully-flexible SAMPLE instruction. + class SampleRaw : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_v4i32_ty, // sampler(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Image instruction without a sampler. 
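+  // Compared to SampleRaw, it simply drops the sampler descriptor argument.
+  // Rough IR-level usage sketch (illustrative only; the mangled suffix
+  // follows the overloaded vaddr type):
+  //   %v = call <4 x float> @llvm.SI.image.load.v4i32(
+  //            <4 x i32> %vaddr, <8 x i32> %rsrc, i32 15 /*dmask*/,
+  //            i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)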
+  class Image : Intrinsic <
+    [llvm_v4f32_ty],   // vdata(VGPR)
+    [llvm_anyint_ty,   // vaddr(VGPR)
+     llvm_v8i32_ty,    // rsrc(SGPR)
+     llvm_i32_ty,      // dmask(imm)
+     llvm_i32_ty,      // unorm(imm)
+     llvm_i32_ty,      // r128(imm)
+     llvm_i32_ty,      // da(imm)
+     llvm_i32_ty,      // glc(imm)
+     llvm_i32_ty,      // slc(imm)
+     llvm_i32_ty,      // tfe(imm)
+     llvm_i32_ty],     // lwe(imm)
+    [IntrNoMem]>;
+
+  // Basic sample
+  def int_SI_image_sample : SampleRaw;
+  def int_SI_image_sample_cl : SampleRaw;
+  def int_SI_image_sample_d : SampleRaw;
+  def int_SI_image_sample_d_cl : SampleRaw;
+  def int_SI_image_sample_l : SampleRaw;
+  def int_SI_image_sample_b : SampleRaw;
+  def int_SI_image_sample_b_cl : SampleRaw;
+  def int_SI_image_sample_lz : SampleRaw;
+  def int_SI_image_sample_cd : SampleRaw;
+  def int_SI_image_sample_cd_cl : SampleRaw;
+
+  // Sample with comparison
+  def int_SI_image_sample_c : SampleRaw;
+  def int_SI_image_sample_c_cl : SampleRaw;
+  def int_SI_image_sample_c_d : SampleRaw;
+  def int_SI_image_sample_c_d_cl : SampleRaw;
+  def int_SI_image_sample_c_l : SampleRaw;
+  def int_SI_image_sample_c_b : SampleRaw;
+  def int_SI_image_sample_c_b_cl : SampleRaw;
+  def int_SI_image_sample_c_lz : SampleRaw;
+  def int_SI_image_sample_c_cd : SampleRaw;
+  def int_SI_image_sample_c_cd_cl : SampleRaw;
+
+  // Sample with offsets
+  def int_SI_image_sample_o : SampleRaw;
+  def int_SI_image_sample_cl_o : SampleRaw;
+  def int_SI_image_sample_d_o : SampleRaw;
+  def int_SI_image_sample_d_cl_o : SampleRaw;
+  def int_SI_image_sample_l_o : SampleRaw;
+  def int_SI_image_sample_b_o : SampleRaw;
+  def int_SI_image_sample_b_cl_o : SampleRaw;
+  def int_SI_image_sample_lz_o : SampleRaw;
+  def int_SI_image_sample_cd_o : SampleRaw;
+  def int_SI_image_sample_cd_cl_o : SampleRaw;
+
+  // Sample with comparison and offsets
+  def int_SI_image_sample_c_o : SampleRaw;
+  def int_SI_image_sample_c_cl_o : SampleRaw;
+  def int_SI_image_sample_c_d_o : SampleRaw;
+  def int_SI_image_sample_c_d_cl_o : SampleRaw;
+  def int_SI_image_sample_c_l_o : SampleRaw;
+  def int_SI_image_sample_c_b_o : SampleRaw;
+  def int_SI_image_sample_c_b_cl_o : SampleRaw;
+  def int_SI_image_sample_c_lz_o : SampleRaw;
+  def int_SI_image_sample_c_cd_o : SampleRaw;
+  def int_SI_image_sample_c_cd_cl_o : SampleRaw;
+
+  // Basic gather4
+  def int_SI_gather4 : SampleRaw;
+  def int_SI_gather4_cl : SampleRaw;
+  def int_SI_gather4_l : SampleRaw;
+  def int_SI_gather4_b : SampleRaw;
+  def int_SI_gather4_b_cl : SampleRaw;
+  def int_SI_gather4_lz : SampleRaw;
+
+  // Gather4 with comparison
+  def int_SI_gather4_c : SampleRaw;
+  def int_SI_gather4_c_cl : SampleRaw;
+  def int_SI_gather4_c_l : SampleRaw;
+  def int_SI_gather4_c_b : SampleRaw;
+  def int_SI_gather4_c_b_cl : SampleRaw;
+  def int_SI_gather4_c_lz : SampleRaw;
+
+  // Gather4 with offsets
+  def int_SI_gather4_o : SampleRaw;
+  def int_SI_gather4_cl_o : SampleRaw;
+  def int_SI_gather4_l_o : SampleRaw;
+  def int_SI_gather4_b_o : SampleRaw;
+  def int_SI_gather4_b_cl_o : SampleRaw;
+  def int_SI_gather4_lz_o : SampleRaw;
+
+  // Gather4 with comparison and offsets
+  def int_SI_gather4_c_o : SampleRaw;
+  def int_SI_gather4_c_cl_o : SampleRaw;
+  def int_SI_gather4_c_l_o : SampleRaw;
+  def int_SI_gather4_c_b_o : SampleRaw;
+  def int_SI_gather4_c_b_cl_o : SampleRaw;
+  def int_SI_gather4_c_lz_o : SampleRaw;
+
+  def int_SI_getlod : SampleRaw;
+
+  // Image intrinsics.
+  def int_SI_image_load : Image;
+  def int_SI_image_load_mip : Image;
+  def int_SI_getresinfo : Image;
+
+  // Deprecated image and sample intrinsics.
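+  // These take the resource as an opaque v32i8 value rather than the typed
+  // v8i32/v4i32 descriptors used by the SampleRaw/Image forms above.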
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_SI_sample : Sample;
+  def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
+  def int_SI_samplel : Sample;
+  def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+
+  /* Control flow Intrinsics */
+
+  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
new file mode 100644
index 0000000..9b1d256
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -0,0 +1,421 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with nearby immediate offsets.
+// This will fuse operations such as
+//  ds_read_b32 v0, v2 offset:16
+//  ds_read_b32 v1, v2 offset:32
+// ==>
+//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+//   each other, and then only merges adjacent pairs of instructions. It would
+//   be good to be more flexible with interleaved instructions, and possibly
+//   run before scheduling. It is currently missing stores of constants
+//   because the load of the constant into the data register is placed between
+//   the stores, although this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+//   one pair, and recomputes live intervals and moves on to the next pair. It
+//   would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+//   cluster of loads have offsets that are too large to fit in the 8-bit
+//   offsets, but are close enough together to fit in 8 bits, we can add to
+//   the base pointer and use the new reduced offsets.
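+//   For example (EltSize == 4): byte offsets 1028 and 1032 are element
+//   offsets 257 and 258, too big for 8 bits, but adding 1024 to the base
+//   pointer reduces them to 1 and 2, which fit.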
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-load-store-opt" + +namespace { + +class SILoadStoreOptimizer : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + + static bool offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned EltSize); + + MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize); + + void updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx); + + MachineBasicBlock::iterator mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + + MachineBasicBlock::iterator mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + +public: + static char ID; + + SILoadStoreOptimizer() + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), + LIS(nullptr) {} + + SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { + initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + } + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Load / Store Optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<LiveIntervals>(); + + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) + +char SILoadStoreOptimizer::ID = 0; + +char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; + +FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { + return new SILoadStoreOptimizer(TM); +} + +bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned Size) { + // XXX - Would the same offset be OK? Is there any reason this would happen or + // be useful? + if (Offset0 == Offset1) + return false; + + // This won't be valid if the offset isn't aligned. + if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + return false; + + unsigned EltOffset0 = Offset0 / Size; + unsigned EltOffset1 = Offset1 / Size; + + // Check if the new offsets fit in the reduced 8-bit range. + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + return true; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. 
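+  // Worked example (Size == 4): byte offsets 1024 and 2048 are element
+  // offsets 256 and 512. Neither fits in 8 bits, but both are multiples of
+  // 64, so the st64 forms encode them as 4 and 8.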
+ if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) + return false; + + return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize){ + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + ++MBBI; + + if (MBBI->getOpcode() != I->getOpcode()) + return E; + + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) + return MBBI; + } + + return E; +} + +void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), + E = MRI->reg_end(); I != E; ) { + MachineOperand &O = *I; + ++I; + O.substVirtReg(DstReg, SubIdx, *TRI); + } +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be careful, since the addresses could be subregisters themselves in weird + // cases, like vectors of pointers. + const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + + unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); + unsigned DestReg1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Read2Desc = TII->get(Opc); + + const TargetRegisterClass *SuperRC + = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + + DebugLoc DL = I->getDebugLoc(); + MachineInstrBuilder Read2 + = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + .addOperand(*AddrReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); + updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); + + LIS->RemoveMachineInstrFromMaps(I); + // Replacing Paired in the maps with Read2 allows us to avoid updating the + // live range for the m0 register. + LIS->ReplaceMachineInstrInMaps(Paired, Read2); + I->eraseFromParent(); + Paired->eraseFromParent(); + + LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); + LIS->shrinkToUses(&AddrRegLI); + + LIS->getInterval(DestReg); // Create new LI + + DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + return Read2.getInstr(); +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be sure to use .addOperand(), and not .addReg() with these. We want to be + // sure we preserve the subregister index and any register flags set on them. + const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Data1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Write2Desc = TII->get(Opc); + DebugLoc DL = I->getDebugLoc(); + + // repairLiveintervalsInRange() doesn't handle physical register, so we have + // to update the M0 range manually. + SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); + LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); + bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); + + MachineInstrBuilder Write2 + = BuildMI(*MBB, I, DL, Write2Desc) + .addOperand(*Addr) // addr + .addOperand(*Data0) // data0 + .addOperand(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + // XXX - How do we express subregisters here? 
+  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+
+  LIS->RemoveMachineInstrFromMaps(I);
+  LIS->RemoveMachineInstrFromMaps(Paired);
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  // This doesn't handle physical registers like M0
+  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
+
+  if (UpdateM0Range) {
+    SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
+    M0Segment->end = Write2Index.getRegSlot();
+  }
+
+  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+  return Write2.getInstr();
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+    MachineInstr &MI = *I;
+
+    // Don't combine if volatile.
+    if (MI.hasOrderedMemoryRef()) {
+      ++I;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
+      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      if (Match != E) {
+        Modified = true;
+        I = mergeRead2Pair(I, Match, Size);
+      } else {
+        ++I;
+      }
+
+      continue;
+    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      if (Match != E) {
+        Modified = true;
+        I = mergeWrite2Pair(I, Match, Size);
+      } else {
+        ++I;
+      }
+
+      continue;
+    }
+
+    ++I;
+  }
+
+  return Modified;
+}
+
+bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  const TargetSubtargetInfo &STM = MF.getSubtarget();
+  TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
+  TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo());
+  MRI = &MF.getRegInfo();
+
+  LIS = &getAnalysis<LiveIntervals>();
+
+  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+
+  assert(!MRI->isSSA());
+
+  bool Modified = false;
+
+  for (MachineBasicBlock &MBB : MF)
+    Modified |= optimizeBlock(MBB);
+
+  return Modified;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
new file mode 100644
index 0000000..c319b32
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -0,0 +1,605 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions to real
+/// machine instructions.
+///
+/// All control flow is handled using predicated instructions and
+/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
+/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
+/// by writing to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU).
+/// Typically, for predicates, a vector ALU will write to its bit of the VCC
+/// register (like EXEC, VCC is 64 bits, one bit per vector ALU) and then the
+/// Scalar ALU will AND the VCC register with EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// %SGPR0 = SI_IF %VCC
+///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// %SGPR0 = SI_ELSE %SGPR0
+///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// SI_END_CF %SGPR0
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0            // This instruction is an optional
+///                                   // optimization which allows us to
+///                                   // branch if all the bits of
+///                                   // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the Then block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label1            // Use our branch optimization
+///                                   // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerControlFlowPass : public MachineFunctionPass {
+
+private:
+  static const unsigned SkipThreshold = 12;
+
+  static char ID;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
+  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
+
+  void Skip(MachineInstr &From, MachineOperand &To);
+  void SkipIfDead(MachineInstr &MI);
+
+  void If(MachineInstr &MI);
+  void Else(MachineInstr &MI);
+  void Break(MachineInstr &MI);
+  void IfBreak(MachineInstr &MI);
+  void ElseBreak(MachineInstr &MI);
+  void Loop(MachineInstr &MI);
+  void EndCf(MachineInstr &MI);
+
+  void Kill(MachineInstr &MI);
+  void Branch(MachineInstr &MI);
+
+  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+  void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
+  void IndirectSrc(MachineInstr &MI);
+  void IndirectDst(MachineInstr &MI);
+
+public:
+  SILowerControlFlowPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Lower control flow instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlowPass::ID = 0;
+
+FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
+  return new SILowerControlFlowPass(tm);
+}
+
+bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
+                                        MachineBasicBlock *To) {
+
+  unsigned NumInstr = 0;
+
+  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
+       MBB = *MBB->succ_begin()) {
+
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         NumInstr < SkipThreshold && I != E; ++I) {
+
+      if (I->isBundle() || !I->isBundled())
+        if (++NumInstr >= SkipThreshold)
+          return true;
+    }
+  }
+
+ 
return false; +} + +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { + + if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) + return; + + DebugLoc DL = From.getDebugLoc(); + BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addOperand(To) + .addReg(AMDGPU::EXEC); +} + +void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() != + ShaderType::PIXEL || + !shouldSkip(&MBB, &MBB.getParent()->back())) + return; + + MachineBasicBlock::iterator Insert = &MI; + ++Insert; + + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0); + + // ... and terminate wavefront + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); +} + +void SILowerControlFlowPass::If(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) + .addReg(Vcc); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Else(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + .addReg(Src); // Saved EXEC + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Break(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Vcc) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Saved = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Saved) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Loop(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Src = MI.getOperand(0).getReg(); + + 
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)) + .addReg(AMDGPU::EXEC); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::EndCf(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Branch(MachineInstr &MI) { + if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) + MI.eraseFromParent(); + + // If these aren't equal, this is probably an infinite loop. +} + +void SILowerControlFlowPass::Kill(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Op = MI.getOperand(0); + +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI + = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + // Kill is only allowed in pixel / geometry shaders. + assert(MFI->getShaderType() == ShaderType::PIXEL || + MFI->getShaderType() == ShaderType::GEOMETRY); +#endif + + // Clear this thread from the exec mask if the operand is negative + if ((Op.isImm())) { + // Constant operand: Set exec mask to 0 or do nothing + if (Op.getImm() & 0x80000000) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(0); + } + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + .addImm(0) + .addOperand(Op); + } + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I = MI; + + unsigned Save = MI.getOperand(1).getReg(); + unsigned Idx = MI.getOperand(3).getReg(); + + if (AMDGPU::SReg_32RegClass.contains(Idx)) { + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(Idx) + .addImm(Offset); + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx); + } + MBB.insert(I, MovRel); + } else { + + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VGPR_32RegClass.contains(Idx)); + + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); + + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + AMDGPU::VCC_LO) + .addReg(Idx); + + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); + + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) + .addReg(AMDGPU::M0) + .addReg(Idx); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(AMDGPU::M0) + .addImm(Offset); + } + // Do the actual move + MBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to 
cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7) + .addReg(AMDGPU::EXEC); + + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + + } + MI.eraseFromParent(); +} + +/// \param @VecReg The register which holds element zero of the vector +/// being addressed into. +/// \param[out] @Reg The base register to use in the indirect addressing instruction. +/// \param[in,out] @Offset As an input, this is the constant offset part of the +// indirect Index. e.g. v0 = v[VecReg + Offset] +// As an output, this is a constant value that needs +// to be added to the value stored in M0. +void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, + unsigned &Reg, + int &Offset) { + unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); + if (!SubReg) + SubReg = VecReg; + + const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); + int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; + + if (RegIdx < 0) { + Offset = RegIdx; + RegIdx = 0; + } else { + Offset = 0; + } + + Reg = RC->getRegister(RegIdx); +} + +void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vec = MI.getOperand(2).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Reg; + + computeIndirectRegAndOffset(Vec, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(Reg) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Vec, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Val = MI.getOperand(5).getReg(); + unsigned Reg; + + computeIndirectRegAndOffset(Dst, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) + .addReg(Reg, RegState::Define) + .addReg(Val) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Dst, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + bool HaveKill = false; + bool NeedWQM = false; + bool NeedFlat = false; + unsigned Depth = 0; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + + MachineInstr &MI = *I; + if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + NeedWQM = true; + + // Flat uses m0 in case it needs to access LDS. 
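+      // (If any FLAT instruction is seen, the NeedFlat block at the end of
+      // this function emits the FLAT_SCR_LO/HI setup prologue.)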
+      if (TII->isFLAT(MI.getOpcode()))
+        NeedFlat = true;
+
+      switch (MI.getOpcode()) {
+        default: break;
+        case AMDGPU::SI_IF:
+          ++Depth;
+          If(MI);
+          break;
+
+        case AMDGPU::SI_ELSE:
+          Else(MI);
+          break;
+
+        case AMDGPU::SI_BREAK:
+          Break(MI);
+          break;
+
+        case AMDGPU::SI_IF_BREAK:
+          IfBreak(MI);
+          break;
+
+        case AMDGPU::SI_ELSE_BREAK:
+          ElseBreak(MI);
+          break;
+
+        case AMDGPU::SI_LOOP:
+          ++Depth;
+          Loop(MI);
+          break;
+
+        case AMDGPU::SI_END_CF:
+          if (--Depth == 0 && HaveKill) {
+            SkipIfDead(MI);
+            HaveKill = false;
+          }
+          EndCf(MI);
+          break;
+
+        case AMDGPU::SI_KILL:
+          if (Depth == 0)
+            SkipIfDead(MI);
+          else
+            HaveKill = true;
+          Kill(MI);
+          break;
+
+        case AMDGPU::S_BRANCH:
+          Branch(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_SRC:
+          IndirectSrc(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_DST_V1:
+        case AMDGPU::SI_INDIRECT_DST_V2:
+        case AMDGPU::SI_INDIRECT_DST_V4:
+        case AMDGPU::SI_INDIRECT_DST_V8:
+        case AMDGPU::SI_INDIRECT_DST_V16:
+          IndirectDst(MI);
+          break;
+      }
+    }
+  }
+
+  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
+    MachineBasicBlock &MBB = MF.front();
+    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+  }
+
+  // FIXME: This seems inappropriate to do here.
+  if (NeedFlat && MFI->IsKernel) {
+    // Insert the prologue initializing the SGPRs pointing to the scratch space
+    // for flat accesses.
+    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+    // TODO: What to use with function calls?
+
+    // FIXME: This is reporting stack size that is used in a scratch buffer
+    // rather than registers as well.
+    uint64_t StackSizeBytes = FrameInfo->getStackSize();
+
+    int IndirectBegin
+      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
+    // Convert register index to 256-byte unit.
+    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
+
+    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
+           "Stack limits should be smaller than 16-bits");
+
+    // Initialize the flat scratch register pair.
+    // TODO: Can we use one s_mov_b64 here?
+
+    // Offset is in units of 256-bytes.
+    MachineBasicBlock &MBB = MF.front();
+    DebugLoc NoDL;
+    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
+    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
+
+    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
+
+    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
+      .addImm(StackOffset);
+
+    // Documentation says size is "per-thread scratch size in bytes"
+    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
+      .addImm(StackSizeBytes);
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
new file mode 100644
index 0000000..67421e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -0,0 +1,151 @@
+//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// i1 values are usually inserted by the CFG Structurize pass and they are
+/// unique in that they can be copied from VALU to SALU registers.
+/// This is not possible for any other value type. Since there are no
+/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move
+/// the i1.
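+/// A rough sketch of both directions, matching the lowering below: an
+/// SALU-to-VALU copy becomes
+///   V_CNDMASK_B32_e64 %vdst, 0, -1, %ssrc
+/// and a VALU-to-SALU copy becomes
+///   V_CMP_NE_I32_e64 %sdst, %vsrc, 0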
+/// +//===----------------------------------------------------------------------===// +// + +#define DEBUG_TYPE "si-i1-copies" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace { + +class SILowerI1Copies : public MachineFunctionPass { +public: + static char ID; + +public: + SILowerI1Copies() : MachineFunctionPass(ID) { + initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Lower i1 Copies"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) + +char SILowerI1Copies::ID = 0; + +char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; + +FunctionPass *llvm::createSILowerI1CopiesPass() { + return new SILowerI1Copies(); +} + +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + std::vector<unsigned> I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + if (RC == &AMDGPU::VReg_1RegClass) + MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); + continue; + } + + if (MI.getOpcode() != AMDGPU::COPY) + continue; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); + + if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; + + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); + + if (DstRC == &AMDGPU::VReg_1RegClass && + TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { + I1Defs.push_back(Dst.getReg()); + DebugLoc DL = MI.getDebugLoc(); + + MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); + if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { + if (DefInst->getOperand(1).isImm()) { + I1Defs.push_back(Dst.getReg()); + + int64_t Val = DefInst->getOperand(1).getImm(); + assert(Val == 0 || Val == -1); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) + .addOperand(Dst) + .addImm(Val); + MI.eraseFromParent(); + continue; + } + } + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(Dst) + .addImm(0) + .addImm(-1) + .addOperand(Src); 
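+        // Per lane: the VGPR receives -1 where the scalar mask bit is set
+        // and 0 where it is clear.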
+        MI.eraseFromParent();
+      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
+                 SrcRC == &AMDGPU::VReg_1RegClass) {
+        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
+          .addOperand(Dst)
+          .addOperand(Src)
+          .addImm(0);
+        MI.eraseFromParent();
+      }
+    }
+  }
+
+  for (unsigned Reg : I1Defs)
+    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
new file mode 100644
index 0000000..587ea63
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -0,0 +1,77 @@
+//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+#define MAX_LANES 64
+
+using namespace llvm;
+
+
+// Pin the vtable to this file.
+void SIMachineFunctionInfo::anchor() {}
+
+SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF),
+    TIDReg(AMDGPU::NoRegister),
+    HasSpilledVGPRs(false),
+    PSInputAddr(0),
+    NumUserSGPRs(0),
+    LDSWaveSpillSize(0) { }
+
+SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
+                                                       MachineFunction *MF,
+                                                       unsigned FrameIndex,
+                                                       unsigned SubIdx) {
+  const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+      MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
+  Offset += SubIdx * 4;
+
+  unsigned LaneVGPRIdx = Offset / (64 * 4);
+  unsigned Lane = (Offset / 4) % 64;
+
+  struct SpilledReg Spill;
+
+  if (!LaneVGPRs.count(LaneVGPRIdx)) {
+    unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
+    LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
+    MRI.setPhysRegUsed(LaneVGPR);
+
+    // Add this register as live-in to all blocks to avoid machine verifier
+    // complaining about use of an undefined physical register.
+    for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
+         BI != BE; ++BI) {
+      BI->addLiveIn(LaneVGPR);
+    }
+  }
+
+  Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
+  Spill.Lane = Lane;
+  return Spill;
+}
+
+unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
+                                              const MachineFunction &MF) const {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  // FIXME: We should get this information from kernel attributes if it
+  // is available.
+  return getShaderType() == ShaderType::COMPUTE ?
+    256 : ST.getWavefrontSize();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
new file mode 100644
index 0000000..667da4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -0,0 +1,66 @@
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
+
+#include "AMDGPUMachineFunction.h"
+#include "SIRegisterInfo.h"
+#include <map>
+
+namespace llvm {
+
+class MachineRegisterInfo;
+
+/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
+/// tells the hardware which interpolation parameters to load.
+class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+  void anchor() override;
+
+  unsigned TIDReg;
+  bool HasSpilledVGPRs;
+
+public:
+
+  struct SpilledReg {
+    unsigned VGPR;
+    int Lane;
+    SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
+    SpilledReg() : VGPR(0), Lane(-1) { }
+    bool hasLane() { return Lane != -1; }
+  };
+
+  // SIMachineFunctionInfo definition
+
+  SIMachineFunctionInfo(const MachineFunction &MF);
+  SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
+                           unsigned SubIdx);
+  unsigned PSInputAddr;
+  unsigned NumUserSGPRs;
+  std::map<unsigned, unsigned> LaneVGPRs;
+  unsigned LDSWaveSpillSize;
+  unsigned ScratchOffsetReg;
+  bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }
+  unsigned getTIDReg() const { return TIDReg; }
+  void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+  bool hasSpilledVGPRs() const { return HasSpilledVGPRs; }
+  void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; }
+
+  unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
+};
+
+} // End namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
new file mode 100644
index 0000000..0a7f684
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -0,0 +1,194 @@
+//===-- SIPrepareScratchRegs.cpp - Prepare scratch registers for spills ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This pass loads the scratch pointer and scratch offset into a register or
+/// a frame index which can be used anywhere in the program. These values will
+/// be used for spilling VGPRs.
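+/// (The scratch resource descriptor itself is rebuilt before each spill from
+/// the SCRATCH_RSRC_DWORD0/1 external symbols; see runOnMachineFunction.)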
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +namespace { + +class SIPrepareScratchRegs : public MachineFunctionPass { + +private: + static char ID; + +public: + SIPrepareScratchRegs() : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI prepare scratch registers"; + } + +}; + +} // End anonymous namespace + +char SIPrepareScratchRegs::ID = 0; + +FunctionPass *llvm::createSIPrepareScratchRegs() { + return new SIPrepareScratchRegs(); +} + +bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock::iterator I = Entry->begin(); + DebugLoc DL = I->getDebugLoc(); + + // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to + // run this pass. + if (!MFI->hasSpilledVGPRs()) + return false; + + unsigned ScratchPtrPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchOffsetPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + + if (!Entry->isLiveIn(ScratchPtrPreloadReg)) + Entry->addLiveIn(ScratchPtrPreloadReg); + + if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) + Entry->addLiveIn(ScratchOffsetPreloadReg); + + // Load the scratch offset. + unsigned ScratchOffsetReg = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); + int ScratchOffsetFI = -1; + + if (ScratchOffsetReg != AMDGPU::NoRegister) { + // Found an SGPR to use + MRI.setPhysRegUsed(ScratchOffsetReg); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) + .addReg(ScratchOffsetPreloadReg); + } else { + // No SGPR is available, we must spill. + ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) + .addReg(ScratchOffsetPreloadReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } + + + // Now that we have the scratch pointer and offset values, we need to + // add them to all the SI_SPILL_V* instructions. + + RegScavenger RS; + unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); + RS.addScavengingFrameIndex(ScratchRsrcFI); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + // Add the scratch offset reg as a live-in so that the register scavenger + // doesn't re-use it. 
+ if (!MBB.isLiveIn(ScratchOffsetReg) && + ScratchOffsetReg != AMDGPU::NoRegister) + MBB.addLiveIn(ScratchOffsetReg); + RS.enterBasicBlock(&MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + RS.forward(I); + DebugLoc DL = MI.getDebugLoc(); + if (!TII->isVGPRSpill(MI.getOpcode())) + continue; + + // Scratch resource + unsigned ScratchRsrcReg = + RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) + .addImm(Rsrc & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) + .addImm(Rsrc >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + // Scratch Offset + if (ScratchOffsetReg == AMDGPU::NoRegister) { + ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), + ScratchOffsetReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else if (!MBB.isLiveIn(ScratchOffsetReg)) { + MBB.addLiveIn(ScratchOffsetReg); + } + + if (ScratchRsrcReg == AMDGPU::NoRegister || + ScratchOffsetReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("ran out of SGPRs for spilling VGPRs"); + ScratchRsrcReg = AMDGPU::SGPR0; + ScratchOffsetReg = AMDGPU::SGPR0; + } + MI.getOperand(2).setReg(ScratchRsrcReg); + MI.getOperand(2).setIsKill(true); + MI.getOperand(2).setIsUndef(false); + MI.getOperand(3).setReg(ScratchOffsetReg); + MI.getOperand(3).setIsUndef(false); + MI.getOperand(3).setIsKill(false); + MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); + } + } + return true; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp new file mode 100644 index 0000000..db2ff0b --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -0,0 +1,543 @@ +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI implementation of the TargetRegisterInfo class. 
+// +//===----------------------------------------------------------------------===// + + +#include "SIRegisterInfo.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} + +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(AMDGPU::EXEC); + + // EXEC_LO and EXEC_HI could be allocated and used as regular register, + // but this seems likely to result in bugs, so I'm marking them as reserved. + Reserved.set(AMDGPU::EXEC_LO); + Reserved.set(AMDGPU::EXEC_HI); + + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + Reserved.set(AMDGPU::FLAT_SCR); + Reserved.set(AMDGPU::FLAT_SCR_LO); + Reserved.set(AMDGPU::FLAT_SCR_HI); + + // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs + Reserved.set(AMDGPU::VGPR255); + Reserved.set(AMDGPU::VGPR254); + + // Tonga and Iceland can only allocate a fixed number of SGPRs due + // to a hw bug. + if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). + // Assume XNACK_MASK is unused. + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + + for (unsigned i = Limit; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); + } + } + + return Reserved; +} + +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + + const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); + // FIXME: We should adjust the max number of waves based on LDS size. 
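+  // Illustrative reading of the loop below: a class covering N 32-bit
+  // subregisters (e.g. VReg_128, getSize() == 16, N == 4) gets a pressure
+  // limit of the corresponding SGPR/VGPR limit divided by N.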
+ unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), + STI.getMaxWavesPerCU()); + unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + + for (regclass_iterator I = regclass_begin(), E = regclass_end(); + I != E; ++I) { + + unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned Limit; + + if (isSGPRClass(*I)) { + Limit = SGPRLimit / NumSubRegs; + } else { + Limit = VGPRLimit / NumSubRegs; + } + + const int *Sets = getRegClassPressureSets(*I); + assert(Sets); + for (unsigned i = 0; Sets[i] != -1; ++i) { + if (Sets[i] == (int)Idx) + return Limit; + } + } + return 256; +} + +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_RESTORE: + return 3; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: + return 2; + case AMDGPU::SI_SPILL_S32_SAVE: + case AMDGPU::SI_SPILL_S32_RESTORE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + return 1; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + unsigned Value, + unsigned ScratchRsrcReg, + unsigned ScratchOffset, + int64_t Offset, + RegScavenger *RS) const { + + MachineBasicBlock *MBB = MI->getParent(); + const MachineFunction *MF = MI->getParent()->getParent(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); + LLVMContext &Ctx = MF->getFunction()->getContext(); + DebugLoc DL = MI->getDebugLoc(); + bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + + bool RanOutOfSGPRs = false; + unsigned SOffset = ScratchOffset; + + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned Size = NumSubRegs * 4; + + if (!isUInt<12>(Offset + Size)) { + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + if (SOffset == AMDGPU::NoRegister) { + RanOutOfSGPRs = true; + SOffset = AMDGPU::SGPR0; + } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffset) + .addImm(Offset); + Offset = 0; + } + + if (RanOutOfSGPRs) + Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { + unsigned SubReg = NumSubRegs > 1 ? 
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : + Value; + bool IsKill = (i == e - 1); + + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + } +} + +void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineBasicBlock *MBB = MI->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + MachineOperand &FIOp = MI->getOperand(FIOperandNum); + int Index = MI->getOperand(FIOperandNum).getIndex(); + + switch (MI->getOpcode()) { + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg) + .addImm(Spill.Lane); + + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } + + // TODO: only do this when it is needed + switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI + TII->insertNOPs(MI, 3); + break; + case AMDGPUSubtarget::SEA_ISLANDS: + break; + default: // VOLCANIC_ISLANDS and later + // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI + // and later. This also applies to VALUs which write VCC, but we're + // unlikely to see VMEM use VCC. 
+ TII->insertNOPs(MI, 4); + } + + MI->eraseFromParent(); + break; + } + + // VGPR register spill + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: + buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V96_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + } + + default: { + int64_t Offset = FrameInfo->getObjectOffset(Index); + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false, false, true); + } + } + } +} + +const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::VGPR_32RegClass; + } +} + +unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { + return getEncodingValue(Reg) & 0xff; +} + +const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + static const TargetRegisterClass *BaseClasses[] = { + &AMDGPU::VGPR_32RegClass, + &AMDGPU::SReg_32RegClass, + &AMDGPU::VReg_64RegClass, + &AMDGPU::SReg_64RegClass, + &AMDGPU::VReg_96RegClass, + &AMDGPU::VReg_128RegClass, + &AMDGPU::SReg_128RegClass, + &AMDGPU::VReg_256RegClass, + &AMDGPU::SReg_256RegClass, + &AMDGPU::VReg_512RegClass + }; + + for (const TargetRegisterClass *BaseClass : BaseClasses) { + if (BaseClass->contains(Reg)) { + return BaseClass; + } + } + return nullptr; +} + +bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const { + if (hasVGPRs(SRC)) { + return SRC; + } else if (SRC == &AMDGPU::SCCRegRegClass) { + return &AMDGPU::VCCRegRegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { + return &AMDGPU::VGPR_32RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { + return &AMDGPU::VReg_64RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { + return &AMDGPU::VReg_128RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { + return 
&AMDGPU::VReg_256RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { + return &AMDGPU::VReg_512RegClass; + } + return nullptr; +} + +const TargetRegisterClass *SIRegisterInfo::getSubRegClass( + const TargetRegisterClass *RC, unsigned SubIdx) const { + if (SubIdx == AMDGPU::NoSubRegister) + return RC; + + // If this register has a sub-register, we can safely assume it is a 32-bit + // register, because all of SI's sub-registers are 32-bit. + if (isSGPRClass(RC)) { + return &AMDGPU::SGPR_32RegClass; + } else { + return &AMDGPU::VGPR_32RegClass; + } +} + +unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, + const TargetRegisterClass *SubRC, + unsigned Channel) const { + + switch (Reg) { + case AMDGPU::VCC: + switch(Channel) { + case 0: return AMDGPU::VCC_LO; + case 1: return AMDGPU::VCC_HI; + default: llvm_unreachable("Invalid SubIdx for VCC"); + } + + case AMDGPU::FLAT_SCR: + switch (Channel) { + case 0: + return AMDGPU::FLAT_SCR_LO; + case 1: + return AMDGPU::FLAT_SCR_HI; + default: + llvm_unreachable("Invalid SubIdx for FLAT_SCR"); + } + break; + + case AMDGPU::EXEC: + switch (Channel) { + case 0: + return AMDGPU::EXEC_LO; + case 1: + return AMDGPU::EXEC_HI; + default: + llvm_unreachable("Invalid SubIdx for EXEC"); + } + break; + } + + const TargetRegisterClass *RC = getPhysRegClass(Reg); + // 32-bit registers don't have sub-registers, so we can just return the + // Reg. We need to have this check here, because the calculation below + // using getHWRegIndex() will fail with special 32-bit registers like + // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. + if (RC->getSize() == 4) { + assert(Channel == 0); + return Reg; + } + + unsigned Index = getHWRegIndex(Reg); + return SubRC->getRegister(Index + Channel); +} + +bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { + return OpType == AMDGPU::OPERAND_REG_IMM32; +} + +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (opCanUseLiteralConstant(OpType)) + return true; + + return OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const { + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + switch (Value) { + case SIRegisterInfo::TGID_X: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + case SIRegisterInfo::TGID_Y: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + case SIRegisterInfo::TGID_Z: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + if (MFI->getShaderType() != ShaderType::COMPUTE) + return MFI->ScratchOffsetReg; + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + case SIRegisterInfo::SCRATCH_PTR: + return AMDGPU::SGPR2_SGPR3; + case SIRegisterInfo::INPUT_PTR: + return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::TIDIG_X: + return AMDGPU::VGPR0; + case SIRegisterInfo::TIDIG_Y: + return AMDGPU::VGPR1; + case SIRegisterInfo::TIDIG_Z: + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected preloaded value type"); +} + +/// \brief Returns a register that is not used at any point in the function. +/// If all registers are used, then this function will return +// AMDGPU::NoRegister. 
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const { + + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); + I != E; ++I) { + if (!MRI.isPhysRegUsed(*I)) + return *I; + } + return AMDGPU::NoRegister; +} + +unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return 256; + } +} + +unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const { + if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WaveCount) { + case 10: return 80; + case 9: return 80; + case 8: return 96; + default: return 102; + } + } else { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h new file mode 100644 index 0000000..bfdb67c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -0,0 +1,131 @@ +//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIRegisterInfo +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Support/Debug.h" + +namespace llvm { + +struct SIRegisterInfo : public AMDGPURegisterInfo { + + SIRegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; + + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief Return the 'base' register class for this register. + /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. + const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; + + /// \returns true if this class contains only SGPR registers + bool isSGPRClass(const TargetRegisterClass *RC) const { + if (!RC) + return false; + + return !hasVGPRs(RC); + } + + /// \returns true if this class ID contains only SGPR registers + bool isSGPRClassID(unsigned RCID) const { + if (static_cast<int>(RCID) == -1) + return false; + + return isSGPRClass(getRegClass(RCID)); + } + + /// \returns true if this class contains VGPR registers. 
+ bool hasVGPRs(const TargetRegisterClass *RC) const; + + /// \returns A VGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const; + + /// \returns The register class that is used for a sub-register of \p RC for + /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will + /// be returned. + const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, + unsigned SubIdx) const; + + /// \p Channel This is the register channel (e.g. a value from 0-16), not the + /// SubReg index. + /// \returns The sub-register of Reg that is in Channel. + unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, + unsigned Channel) const; + + /// \returns True if operands defined with this operand type can accept + /// a literal constant (i.e. any 32-bit immediate). + bool opCanUseLiteralConstant(unsigned OpType) const; + + /// \returns True if operands defined with this operand type can accept + /// an inline constant. i.e. An integer value in the range (-16, 64) or + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + bool opCanUseInlineConstant(unsigned OpType) const; + + enum PreloadedValue { + TGID_X, + TGID_Y, + TGID_Z, + SCRATCH_WAVE_OFFSET, + SCRATCH_PTR, + INPUT_PTR, + TIDIG_X, + TIDIG_Y, + TIDIG_Z + }; + + /// \brief Returns the physical register that \p Value is stored in. + unsigned getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const; + + /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumVGPRsAllowed(unsigned WaveCount) const; + + /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const; + + unsigned findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const; + +private: + void buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, unsigned Value, + unsigned ScratchRsrcReg, unsigned ScratchOffset, + int64_t Offset, RegScavenger *RS) const; +}; + +} // End namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td new file mode 100644 index 0000000..2a9017f --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -0,0 +1,284 @@ +//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the SI registers +//===----------------------------------------------------------------------===// + +class SIReg <string n, bits<16> encoding = 0> : Register<n> { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +// Special Registers +def VCC_LO : SIReg<"vcc_lo", 106>; +def VCC_HI : SIReg<"vcc_hi", 107>; + +// VCC for 64-bit instructions +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 106; +} + +def EXEC_LO : SIReg<"exec_lo", 126>; +def EXEC_HI : SIReg<"exec_hi", 127>; + +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 126; +} + +def SCC : SIReg<"scc", 253>; +def M0 : SIReg <"m0", 124>; + +def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. +def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. + +// Pair to indicate location of scratch space for flat accesses. +def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + +// SGPR registers +foreach Index = 0-101 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index>; +} + +// VGPR registers +foreach Index = 0-255 in { + def VGPR#Index : SIReg <"VGPR"#Index, Index> { + let HWEncoding{8} = 1; + } +} + +//===----------------------------------------------------------------------===// +// Groupings using register classes and tuples +//===----------------------------------------------------------------------===// + +// SGPR 32-bit registers +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "SGPR%u", 0, 101))>; + +// SGPR 64-bit registers +def SGPR_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate (trunc SGPR_32, 101), 2)), + (add (decimate (shl SGPR_32, 1), 2))]>; + +// SGPR 128-bit registers +def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate (trunc SGPR_32, 99), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4))]>; + +// SGPR 256-bit registers +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (decimate (trunc SGPR_32, 95), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4))]>; + +// SGPR 512-bit registers +def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (decimate (trunc SGPR_32, 87), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4)), + (add (decimate (shl SGPR_32, 8), 4)), + (add (decimate (shl SGPR_32, 9), 4)), + (add (decimate (shl SGPR_32, 10), 4)), + (add (decimate (shl SGPR_32, 11), 4)), + (add (decimate (shl SGPR_32, 12), 4)), + (add (decimate (shl SGPR_32, 
13), 4)), + (add (decimate (shl SGPR_32, 14), 4)), + (add (decimate (shl SGPR_32, 15), 4))]>; + +// VGPR 32-bit registers +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "VGPR%u", 0, 255))>; + +// VGPR 64-bit registers +def VGPR_64 : RegisterTuples<[sub0, sub1], + [(add (trunc VGPR_32, 255)), + (add (shl VGPR_32, 1))]>; + +// VGPR 96-bit registers +def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], + [(add (trunc VGPR_32, 254)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2))]>; + +// VGPR 128-bit registers +def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (trunc VGPR_32, 253)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3))]>; + +// VGPR 256-bit registers +def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (trunc VGPR_32, 249)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7))]>; + +// VGPR 512-bit registers +def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (trunc VGPR_32, 241)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7)), + (add (shl VGPR_32, 8)), + (add (shl VGPR_32, 9)), + (add (shl VGPR_32, 10)), + (add (shl VGPR_32, 11)), + (add (shl VGPR_32, 12)), + (add (shl VGPR_32, 13)), + (add (shl VGPR_32, 14)), + (add (shl VGPR_32, 15))]>; + +//===----------------------------------------------------------------------===// +// Register classes used as source and destination +//===----------------------------------------------------------------------===// + +class RegImmMatcher<string name> : AsmOperandClass { + let Name = name; + let RenderMethod = "addRegOrImmOperands"; +} + +// Special register classes for predicates and the M0 register +def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { + let CopyCost = -1; // Theoretically it is possible to read from SCC, + // but it should never be necessary. 
+} + +def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; +def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; + +// Register class for all scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) +>; + +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; + +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, + (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) +>; + +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; + +def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; + +def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; + +// Register class for all vector registers (VGPRs + Interploation Registers) +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; + +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { + let Size = 96; +} + +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; + +def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; + +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; + +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 32; +} + +class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; +} + +class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; +} + +//===----------------------------------------------------------------------===// +// SSrc_* Operands with an SGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def SSrc_32 : RegImmOperand<SReg_32> { + let ParserMatchClass = RegImmMatcher<"SSrc32">; +} + +def SSrc_64 : RegImmOperand<SReg_64> { + let ParserMatchClass = RegImmMatcher<"SSrc64">; +} + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or a inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_32 : RegInlineOperand<SReg_32> { + let ParserMatchClass = RegImmMatcher<"SCSrc32">; +} + +//===----------------------------------------------------------------------===// +// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; + +def VSrc_32 : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc32">; +} + +def VSrc_64 : RegisterOperand<VS_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc64">; +} + +//===----------------------------------------------------------------------===// +// VCSrc_* Operands with an SGPR, VGPR or an inline constant +//===----------------------------------------------------------------------===// + +def VCSrc_32 : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc32">; +} + 
+def VCSrc_64 : RegisterOperand<VS_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc64">; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td new file mode 100644 index 0000000..9b1f676 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td @@ -0,0 +1,91 @@ +//===-- SISchedule.td - SI Scheduling definitions ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// MachineModel definitions for Southern Islands (SI) +// +//===----------------------------------------------------------------------===// + +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; + +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteQuarterRate32 : SchedWrite; + +def WriteFloatFMA : SchedWrite; + +def WriteDouble : SchedWrite; +def WriteDoubleAdd : SchedWrite; + +def SIFullSpeedModel : SchedMachineModel; +def SIQuarterSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? +def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, + int latency> : WriteRes<write, resources> { + let Latency = latency; +} + +class HWVALUWriteRes<SchedWrite write, int latency> : + HWWriteRes<write, [HWVALU], latency>; + + +// The latency numbers are taken from the AMD Accelerated Parallel Processing +// guide. They may not be accurate. + +// The latency values are 1 / (operations / cycle) / 4. +multiclass SICommonWriteRes { + + def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ??? + def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ??? + def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64 + def : HWWriteRes<WriteSALU, [HWSALU], 1>; + def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + + def : HWVALUWriteRes<Write32Bit, 1>; + def : HWVALUWriteRes<WriteQuarterRate32, 4>; +} + + +let SchedModel = SIFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 4>; +def : HWVALUWriteRes<WriteDoubleAdd, 2>; + +} // End SchedModel = SIFullSpeedModel + +let SchedModel = SIQuarterSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 16>; +def : HWVALUWriteRes<WriteDouble, 16>; +def : HWVALUWriteRes<WriteDoubleAdd, 8>; + +} // End SchedModel = SIQuarterSpeedModel diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp new file mode 100644 index 0000000..51e72cd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -0,0 +1,272 @@ +//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// The pass tries to use the 32-bit encoding for instructions when possible. +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUMCInstLower.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-shrink-instructions" + +STATISTIC(NumInstructionsShrunk, + "Number of 64-bit instruction reduced to 32-bit."); +STATISTIC(NumLiteralConstantsFolded, + "Number of literal constants folded into 32-bit instructions."); + +namespace llvm { + void initializeSIShrinkInstructionsPass(PassRegistry&); +} + +using namespace llvm; + +namespace { + +class SIShrinkInstructions : public MachineFunctionPass { +public: + static char ID; + +public: + SIShrinkInstructions() : MachineFunctionPass(ID) { + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Shrink Instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
+ + INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) + INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) + + char SIShrinkInstructions::ID = 0; + + FunctionPass *llvm::createSIShrinkInstructionsPass() { + return new SIShrinkInstructions(); + } + + static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + if (!MO->isReg()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) + return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); + + return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); + } + + static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + + const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + // Can't shrink instruction with three operands. + // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add + // a special case for it. It can only be shrunk if the third operand + // is vcc. We should handle this the same way we handle vopc, by adding + // a register allocation hint pre-regalloc and then do the shrinking + // post-regalloc. + if (Src2) + return false; + + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mod = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + + if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) + return false; + + // We don't need to check src0, all input types are legal, so just make sure + // src0 isn't using any modifiers. + if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) + return false; + + // Check output modifiers + if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; + + if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + return true; + } + + /// \brief This function checks \p MI for operands defined by a move immediate + /// instruction and then folds the literal constant into the instruction if it + /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction + /// and will only fold literal constants if we are still in SSA. + static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, + MachineRegisterInfo &MRI, bool TryToCommute = true) { + + if (!MRI.isSSA()) + return; + + assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || + TII->isVOPC(MI.getOpcode())); + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + + // Only one literal constant is allowed per instruction, so if src0 is a + // literal constant then we can't do any folding. + if (Src0.isImm() && + TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) + return; + + // Literal constants and SGPRs can only be used in Src0, so if Src0 is an + // SGPR, we cannot commute the instruction, so we can't fold any literal + // constants.
+ if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) + return; + + // Try to fold Src0 + if (Src0.isReg()) { + unsigned Reg = Src0.getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Def && Def->isMoveImmediate()) { + MachineOperand &MovSrc = Def->getOperand(1); + bool ConstantFolded = false; + + if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { + Src0.ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } + if (ConstantFolded) { + if (MRI.use_empty(Reg)) + Def->eraseFromParent(); + ++NumLiteralConstantsFolded; + return; + } + } + } + + // We have failed to fold src0, so commute the instruction and try again. + if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) + foldImmediates(MI, TII, MRI, false); + +} + +bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + std::vector<unsigned> I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. + if (MI.getOpcode() == AMDGPU::S_MOV_B32) { + const MachineOperand &Src = MI.getOperand(1); + + if (Src.isImm()) { + if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + } + + continue; + } + + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + if (!canShrink(MI, TII, TRI, MRI)) { + // Try commuting the instruction and see if that enables us to shrink + // it. + if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + !canShrink(MI, TII, TRI, MRI)) + continue; + } + + // getVOPe32 could be -1 here if we started with an instruction that had + // a 32-bit encoding and then commuted it to an instruction that did not. + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); + + if (TII->isVOPC(Op32)) { + unsigned DstReg = MI.getOperand(0).getReg(); + if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because the register allocator has + // trouble with sequences like this, which cause the allocator to run + // out of registers if vreg0 and vreg1 belong to the VCCReg register + // class: + // vreg0 = VOPC; + // vreg1 = VOPC; + // S_AND_B64 vreg0, vreg1 + // + // So, instead of forcing the instruction to write to VCC, we provide + // a hint to the register allocator to use VCC and then we we will run + // this pass again after RA and shrink it if it outputs to VCC. 
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + continue; + } + if (DstReg != AMDGPU::VCC) + continue; + } + + // We can shrink this instruction + DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); + + MachineInstrBuilder Inst32 = + BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); + + // dst + Inst32.addOperand(MI.getOperand(0)); + + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + + const MachineOperand *Src1 = + TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) + Inst32.addOperand(*Src1); + + ++NumInstructionsShrunk; + MI.eraseFromParent(); + + foldImmediates(*Inst32, TII, MRI); + DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); + + + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp new file mode 100644 index 0000000..591ce85 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -0,0 +1,161 @@ +//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass performs the following type substitution on all +/// non-compute shaders: +/// +/// v16i8 => i128 +/// - v16i8 is used for constant memory resource descriptors. This type is +/// legal for some compute APIs, and we don't want to declare it as legal +/// in the backend, because we want the legalizer to expand all v16i8 +/// operations. +/// v1* => * +/// - Having v1* types complicates the legalizer and we can easily replace +/// them with the element type.
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { + +class SITypeRewriter : public FunctionPass, + public InstVisitor<SITypeRewriter> { + + static char ID; + Module *Mod; + Type *v16i8; + Type *v4i32; + +public: + SITypeRewriter() : FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { + return "SI Type Rewriter"; + } + void visitLoadInst(LoadInst &I); + void visitCallInst(CallInst &I); + void visitBitCast(BitCastInst &I); +}; + +} // End anonymous namespace + +char SITypeRewriter::ID = 0; + +bool SITypeRewriter::doInitialization(Module &M) { + Mod = &M; + v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); + v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); + return false; +} + +bool SITypeRewriter::runOnFunction(Function &F) { + Attribute A = F.getFnAttribute("ShaderType"); + + unsigned ShaderType = ShaderType::COMPUTE; + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + Str.getAsInteger(0, ShaderType); + } + if (ShaderType == ShaderType::COMPUTE) + return false; + + visit(F); + visit(F); + + return false; +} + +void SITypeRewriter::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + Type *PtrTy = Ptr->getType(); + Type *ElemTy = PtrTy->getPointerElementType(); + IRBuilder<> Builder(&I); + if (ElemTy == v16i8) { + Value *BitCast = Builder.CreateBitCast(Ptr, + PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); + LoadInst *Load = Builder.CreateLoad(BitCast); + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; + I.getAllMetadataOtherThanDebugLoc(MD); + for (unsigned i = 0, e = MD.size(); i != e; ++i) { + Load->setMetadata(MD[i].first, MD[i].second); + } + Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); + I.replaceAllUsesWith(BitCastLoad); + I.eraseFromParent(); + } +} + +void SITypeRewriter::visitCallInst(CallInst &I) { + IRBuilder<> Builder(&I); + + SmallVector <Value*, 8> Args; + SmallVector <Type*, 8> Types; + bool NeedToReplace = false; + Function *F = I.getCalledFunction(); + std::string Name = F->getName(); + for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { + Value *Arg = I.getArgOperand(i); + if (Arg->getType() == v16i8) { + Args.push_back(Builder.CreateBitCast(Arg, v4i32)); + Types.push_back(v4i32); + NeedToReplace = true; + Name = Name + ".v4i32"; + } else if (Arg->getType()->isVectorTy() && + Arg->getType()->getVectorNumElements() == 1 && + Arg->getType()->getVectorElementType() == + Type::getInt32Ty(I.getContext())){ + Type *ElementTy = Arg->getType()->getVectorElementType(); + std::string TypeName = "i32"; + InsertElementInst *Def = cast<InsertElementInst>(Arg); + Args.push_back(Def->getOperand(1)); + Types.push_back(ElementTy); + std::string VecTypeName = "v1" + TypeName; + Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); + NeedToReplace = true; + } else { + Args.push_back(Arg); + Types.push_back(Arg->getType()); + } + } + + if (!NeedToReplace) { + return; + } + Function *NewF = Mod->getFunction(Name); + if (!NewF) { + NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); + NewF->setAttributes(F->getAttributes()); + } + I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); + I.eraseFromParent(); +} + +void 
SITypeRewriter::visitBitCast(BitCastInst &I) { + IRBuilder<> Builder(&I); + if (I.getDestTy() != v4i32) { + return; + } + + if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) { + if (Op->getSrcTy() == v4i32) { + I.replaceAllUsesWith(Op->getOperand(0)); + I.eraseFromParent(); + } + } +} + +FunctionPass *llvm::createSITypeRewriter() { + return new SITypeRewriter(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp new file mode 100644 index 0000000..2112135 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -0,0 +1,30 @@ +//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +/// \brief The target which supports all AMD GPUs. This will eventually +/// be deprecated and there will be an R600 target and a GCN target. +Target llvm::TheAMDGPUTarget; +/// \brief The target for GCN GPUs +Target llvm::TheGCNTarget; + +/// \brief Extern function to initialize the targets for the AMDGPU backend +extern "C" void LLVMInitializeAMDGPUTargetInfo() { + RegisterTarget<Triple::r600, false> + R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); + RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td new file mode 100644 index 0000000..d8738f9 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td @@ -0,0 +1,166 @@ +//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// VI Instruction format definitions.
+// +//===----------------------------------------------------------------------===// + +class DSe_vi <bits<8> op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{16} = gds; + let Inst{24-17} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe_vi <bits<7> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{16} = lds; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe_vi <bits<4> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{18-15} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class SMEMe_vi <bits<8> op, bit imm> : Enc64 { + bits<7> sbase; + bits<7> sdata; + bits<1> glc; + bits<20> offset; + + let Inst{5-0} = sbase{6-1}; + let Inst{12-6} = sdata; + let Inst{16} = glc; + let Inst{17} = imm; + let Inst{25-18} = op; + let Inst{31-26} = 0x30; //encoding + let Inst{51-32} = offset; +} + +class VOP3e_vi <bits<10> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be_vi <bits<10> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class EXPe_vi : EXPe { + let Inst{31-26} = 0x31; //encoding +} + +class VINTRPe_vi <bits<2> op> : VINTRPe <op> { + let Inst{31-26} = 0x35; // encoding +} diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td new file mode 100644 index 0000000..5bf86e6 --- /dev/null 
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -0,0 +1,106 @@ +//===-- VIInstructions.td - VI Instruction Definitions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for VI and newer. +//===----------------------------------------------------------------------===// + +let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +defm V_CVT_F16_U16 : VOP1Inst <vop1<0, 0x39>, "v_cvt_f16_u16", VOP_F16_I16>; +defm V_CVT_F16_I16 : VOP1Inst <vop1<0, 0x3a>, "v_cvt_f16_i16", VOP_F16_I16>; +defm V_CVT_U16_F16 : VOP1Inst <vop1<0, 0x3b>, "v_cvt_u16_f16", VOP_I16_F16>; +defm V_CVT_I16_F16 : VOP1Inst <vop1<0, 0x3c>, "v_cvt_i16_f16", VOP_I16_F16>; +defm V_RCP_F16 : VOP1Inst <vop1<0, 0x3d>, "v_rcp_f16", VOP_F16_F16>; +defm V_SQRT_F16 : VOP1Inst <vop1<0, 0x3e>, "v_sqrt_f16", VOP_F16_F16>; +defm V_RSQ_F16 : VOP1Inst <vop1<0, 0x3f>, "v_rsq_f16", VOP_F16_F16>; +defm V_LOG_F16 : VOP1Inst <vop1<0, 0x40>, "v_log_f16", VOP_F16_F16>; +defm V_EXP_F16 : VOP1Inst <vop1<0, 0x41>, "v_exp_f16", VOP_F16_F16>; +defm V_FREXP_MANT_F16 : VOP1Inst <vop1<0, 0x42>, "v_frexp_mant_f16", + VOP_F16_F16 +>; +defm V_FREXP_EXP_I16_F16 : VOP1Inst <vop1<0, 0x43>, "v_frexp_exp_i16_f16", + VOP_I16_F16 +>; +defm V_FLOOR_F16 : VOP1Inst <vop1<0, 0x44>, "v_floor_f16", VOP_F16_F16>; +defm V_CEIL_F16 : VOP1Inst <vop1<0, 0x45>, "v_ceil_f16", VOP_F16_F16>; +defm V_TRUNC_F16 : VOP1Inst <vop1<0, 0x46>, "v_trunc_f16", VOP_F16_F16>; +defm V_RNDNE_F16 : VOP1Inst <vop1<0, 0x47>, "v_rndne_f16", VOP_F16_F16>; +defm V_FRACT_F16 : VOP1Inst <vop1<0, 0x48>, "v_fract_f16", VOP_F16_F16>; +defm V_SIN_F16 : VOP1Inst <vop1<0, 0x49>, "v_sin_f16", VOP_F16_F16>; +defm V_COS_F16 : VOP1Inst <vop1<0, 0x4a>, "v_cos_f16", VOP_F16_F16>; + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { + +defm V_ADD_F16 : VOP2Inst <vop2<0, 0x1f>, "v_add_f16", VOP_F16_F16_F16>; +defm V_SUB_F16 : VOP2Inst <vop2<0, 0x20>, "v_sub_f16", VOP_F16_F16_F16>; +defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16, + null_frag, "v_sub_f16" +>; +defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>; +defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>; +} // End isCommutable = 1 +defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16">; +let isCommutable = 1 in { +defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16">; +defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>; +defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>; +defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>; +defm V_MUL_LO_U16 : VOP2Inst <vop2<0,0x29>, "v_mul_lo_u16", VOP_I16_I16_I16>; +} // End isCommutable = 1 +defm V_LSHLREV_B16 : VOP2Inst <vop2<0,0x2a>, "v_lshlrev_b16", VOP_I16_I16_I16>; +defm V_LSHRREV_B16 : VOP2Inst <vop2<0,0x2b>, "v_lshrrev_b16", VOP_I16_I16_I16>; +defm V_ASHRREV_B16 : VOP2Inst <vop2<0,0x2c>, "v_ashrrev_b16", VOP_I16_I16_I16>; +let isCommutable = 1 in { +defm
V_MAX_F16 : VOP2Inst <vop2<0,0x2d>, "v_max_f16", VOP_F16_F16_F16>; +defm V_MIN_F16 : VOP2Inst <vop2<0,0x2e>, "v_min_f16", VOP_F16_F16_F16>; +defm V_MAX_U16 : VOP2Inst <vop2<0,0x2f>, "v_max_u16", VOP_I16_I16_I16>; +defm V_MAX_I16 : VOP2Inst <vop2<0,0x30>, "v_max_i16", VOP_I16_I16_I16>; +defm V_MIN_U16 : VOP2Inst <vop2<0,0x31>, "v_min_u16", VOP_I16_I16_I16>; +defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>; +} // End isCommutable = 1 +defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>; + +// Aliases to simplify matching of floating-pint instructions that are VOP2 on +// SI and VOP3 on VI. + +class SI2_VI3Alias <string name, Instruction inst> : InstAlias < + name#" $dst, $src0, $src1", + (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0) +>, PredicateControl { + let UseInstAsmMatchConverter = 0; +} + +def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; + +} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// SMEM Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isVI] in { + +// 1. Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +} // End Predicates = [isVI] diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h index 9550a3a..d554fe5 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -46,6 +46,6 @@ FunctionPass *createThumb2SizeReductionPass( void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); -} // end namespace llvm; +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index d84f296..4530e41 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -429,7 +429,7 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, } void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { - Triple TT(TM.getTargetTriple()); + const Triple &TT = TM.getTargetTriple(); // Use unified assembler syntax. OutStreamer->EmitAssemblerFlag(MCAF_SyntaxUnified); @@ -473,7 +473,7 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { - Triple TT(TM.getTargetTriple()); + const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatMachO()) { // All darwin targets use mach-o. const TargetLoweringObjectFileMachO &TLOFMacho = @@ -564,7 +564,7 @@ void ARMAsmPrinter::emitAttributes() { // anyhow. // FIXME: For ifunc related functions we could iterate over and look // for a feature string that doesn't match the default one. 
- StringRef TT = TM.getTargetTriple(); + const Triple &TT = TM.getTargetTriple(); StringRef CPU = TM.getTargetCPU(); StringRef FS = TM.getTargetFeatureString(); std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU); diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h index a6bc368..3d25121 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -105,7 +105,7 @@ private: public: unsigned getISAEncoding() override { // ARM/Darwin adds ISA to the DWARF info for each function. - Triple TT(TM.getTargetTriple()); + const Triple &TT = TM.getTargetTriple(); if (!TT.isOSBinFormatMachO()) return 0; bool isThumb = TT.getArch() == Triple::thumb || diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 9c4b496..f2b7a64 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -396,7 +397,7 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>(); int BOpc = !AFI->isThumbFunction() @@ -458,8 +459,7 @@ bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const { } bool ARMBaseInstrInfo:: -PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const { +PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const { unsigned Opc = MI->getOpcode(); if (isUncondBranchOpcode(Opc)) { MI->setDesc(get(getMatchingCondBranchOpcode(Opc))); @@ -479,9 +479,8 @@ PredicateInstruction(MachineInstr *MI, return false; } -bool ARMBaseInstrInfo:: -SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const { +bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { if (Pred1.size() > 2 || Pred2.size() > 2) return false; @@ -595,7 +594,7 @@ template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) { // all definitions of CPSR are dead return true; } -} +} // namespace llvm /// GetInstSize - Return the size of the specified MachineInstr. /// @@ -3995,7 +3994,7 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, } bool ARMBaseInstrInfo:: -hasHighOperandLatency(const InstrItineraryData *ItinData, +hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const { @@ -4007,9 +4006,8 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, return true; // Hoist VFP / NEON instructions with 4 or higher latency. 
- int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); - if (Latency < 0) - Latency = getInstrLatency(ItinData, DefMI); + unsigned Latency + = SchedModel.computeOperandLatency(DefMI, DefIdx, UseMI, UseIdx); if (Latency <= 3) return false; return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || @@ -4017,8 +4015,9 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, } bool ARMBaseInstrInfo:: -hasLowDefLatency(const InstrItineraryData *ItinData, +hasLowDefLatency(const TargetSchedModel &SchedModel, const MachineInstr *DefMI, unsigned DefIdx) const { + const InstrItineraryData *ItinData = SchedModel.getInstrItineraries(); if (!ItinData || ItinData->isEmpty()) return false; diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index c7185fe..6fc0edd 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -116,8 +116,7 @@ public: bool AllowModify = false) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; bool @@ -133,10 +132,10 @@ public: } bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const override; + ArrayRef<MachineOperand> Pred) const override; - bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const override; + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; @@ -328,12 +327,12 @@ private: int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override; - bool hasHighOperandLatency(const InstrItineraryData *ItinData, + bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const override; - bool hasLowDefLatency(const InstrItineraryData *ItinData, + bool hasLowDefLatency(const TargetSchedModel &SchedModel, const MachineInstr *DefMI, unsigned DefIdx) const override; @@ -494,6 +493,6 @@ bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, const ARMBaseInstrInfo &TII); -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h index d687568..2edb96a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -281,6 +281,6 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, return true; } -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index f4ec8c6..cb4eeb5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -335,7 +335,7 @@ namespace { } }; char ARMConstantIslands::ID = 0; -} +} // namespace /// verify - check BBOffsets, BBSizes, alignment of islands void ARMConstantIslands::verify() { diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h 
b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 36f63e2..b429bed 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -44,7 +44,7 @@ namespace ARMCP { GOTTPOFF, TPOFF }; -} +} // namespace ARMCP /// ARMConstantPoolValue - ARM specific constantpool value. This is used to /// represent PC-relative displacement between the address of the load @@ -254,6 +254,6 @@ public: } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 4438f50..963b46c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -69,7 +69,7 @@ namespace { MachineBasicBlock::iterator &MBBI); }; char ARMExpandPseudo::ID = 0; -} +} // namespace /// TransferImpOps - Transfer implicit operands on the pseudo instruction to /// the instructions created from the expansion. @@ -129,7 +129,7 @@ namespace { return PseudoOpc < TE.PseudoOpc; } }; -} +} // namespace static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, false, EvenDblSpc, 1, 4 ,true}, diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index 4175b4a..cead18f 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -2898,7 +2898,7 @@ const struct FoldableLoadExtendsStruct { { { ARM::SXTB, ARM::t2SXTB }, 0, 0, MVT::i8 }, { { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 } }; -} +} // namespace /// \brief The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, diff --git a/contrib/llvm/lib/Target/ARM/ARMFeatures.h b/contrib/llvm/lib/Target/ARM/ARMFeatures.h index 0c910ab..5b4a44c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFeatures.h +++ b/contrib/llvm/lib/Target/ARM/ARMFeatures.h @@ -92,6 +92,6 @@ inline bool isV8EligibleForIT(InstrType *Instr) { } } -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp index a52e497..091086d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -221,7 +221,7 @@ struct StackAdjustingInsts { } } }; -} +} // namespace /// Emit an instruction sequence that will align the address in /// register Reg by zero-ing out the lower bits. For versions of the diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h index d763d17..98313e6 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -78,6 +78,6 @@ public: MachineBasicBlock::iterator MI) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 50afb19..575a9d9 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -279,7 +279,7 @@ private: SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs, bool is64BitVector); }; -} +} // namespace /// isInt32Immediate - This method tests to see if the node is a 32-bit constant /// operand. If so Imm will receive the 32-bit value. 
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 47c8400..94a026b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -83,7 +83,7 @@ namespace { CallOrPrologue = PC; } }; -} +} // namespace // The APCS parameter registers. static const MCPhysReg GPRArgRegs[] = { @@ -1483,9 +1483,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; bool isSibCall = false; + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); // Disable tail calls if they're not supported. - if (!Subtarget->supportsTailCall() || MF.getTarget().Options.DisableTailCalls) + if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; if (isTailCall) { @@ -2042,7 +2043,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - const Triple TT(getTargetMachine().getTargetTriple()); + const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; @@ -2375,7 +2376,9 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; - if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; return !Subtarget->isThumb1Only(); @@ -5060,6 +5063,30 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return true; } +/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), +/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. +static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, + unsigned &WhichResult, + bool &isV_UNDEF) { + isV_UNDEF = false; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return ARMISD::VTRN; + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return ARMISD::VUZP; + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return ARMISD::VZIP; + + isV_UNDEF = true; + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return ARMISD::VTRN; + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return ARMISD::VUZP; + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return ARMISD::VZIP; + + return 0; +} + /// \return true if this is a reverse operation on an vector. 
static bool isReverseMask(ArrayRef<int> M, EVT VT) { unsigned NumElts = VT.getVectorNumElements(); @@ -5476,7 +5503,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, return true; } - bool ReverseVEXT; + bool ReverseVEXT, isV_UNDEF; unsigned Imm, WhichResult; unsigned EltSize = VT.getVectorElementType().getSizeInBits(); @@ -5487,12 +5514,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isVREVMask(M, VT, 16) || isVEXTMask(M, VT, ReverseVEXT, Imm) || isVTBLMask(M, VT) || - isVTRNMask(M, VT, WhichResult) || - isVUZPMask(M, VT, WhichResult) || - isVZIPMask(M, VT, WhichResult) || - isVTRN_v_undef_Mask(M, VT, WhichResult) || - isVUZP_v_undef_Mask(M, VT, WhichResult) || - isVZIP_v_undef_Mask(M, VT, WhichResult) || + isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); } @@ -5684,25 +5706,53 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // these operations, DAG memoization will ensure that a single node is // used for both shuffles. unsigned WhichResult; - if (isVTRNMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVUZPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVZIPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - - if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); + bool isV_UNDEF; + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, VT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + V2 = V1; + return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) + .getValue(WhichResult); + } + + // Also check for these shuffles through CONCAT_VECTORS: we canonicalize + // shuffles that produce a result larger than their operands with: + // shuffle(concat(v1, undef), concat(v2, undef)) + // -> + // shuffle(concat(v1, v2), undef) + // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). + // + // This is useful in the general case, but there are special cases where + // native shuffles produce larger results: the two-result ops. + // + // Look through the concat when lowering them: + // shuffle(concat(v1, v2), undef) + // -> + // concat(VZIP(v1, v2):0, :1) + // + if (V1->getOpcode() == ISD::CONCAT_VECTORS && + V2->getOpcode() == ISD::UNDEF) { + SDValue SubV1 = V1->getOperand(0); + SDValue SubV2 = V1->getOperand(1); + EVT SubVT = SubV1.getValueType(); + + // We expect these to have been canonicalized to -1. 
+ assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) { + return i < (int)VT.getVectorNumElements(); + }) && "Unexpected shuffle index into UNDEF operand!"); + + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + SubV2 = SubV1; + assert((WhichResult == 0) && + "In-place shuffle of concat can only have one result!"); + SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), + SubV1, SubV2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), + Res.getValue(1)); + } + } } // If the shuffle is not directly supported and it has 4 elements, use diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index c0b329c..71a47a2 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -215,7 +215,7 @@ namespace llvm { VST3LN_UPD, VST4LN_UPD }; - } + } // namespace ARMISD /// Define some predicates that are used for node matching. namespace ARM { @@ -638,6 +638,6 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } -} +} // namespace llvm #endif // ARMISELLOWERING_H diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp index 84f95be..59e1535 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -198,7 +198,7 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace char ARMCGBR::ID = 0; FunctionPass* diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h index 90f34ea..9e5700a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -43,6 +43,6 @@ private: Reloc::Model RM) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 46ff326..50e2292 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -142,7 +142,7 @@ namespace { bool MergeReturnIntoLDM(MachineBasicBlock &MBB); }; char ARMLoadStoreOpt::ID = 0; -} +} // namespace static bool definesCPSR(const MachineInstr *MI) { for (const auto &MO : MI->operands()) { @@ -1859,7 +1859,7 @@ namespace { bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); }; char ARMPreAllocLoadStoreOpt::ID = 0; -} +} // namespace bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { TD = Fn.getTarget().getDataLayout(); diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 14dd9ef..8b12102 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -229,6 +229,6 @@ public: return It; } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp index 30baf42..1c8e1f8 100644 --- a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp @@ -32,7 +32,7 @@ public: } }; char ARMOptimizeBarriersPass::ID = 0; -} +} // namespace // Returns whether the instruction can safely move past a DMB instruction // The current implementation allows 
this iif MI does not have any possible diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h index 1db190f..4563caa 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -70,6 +70,6 @@ public: RTLIB::Libcall LC) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp index f20318d..55808df 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -106,7 +106,7 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, return new ARMFrameLowering(STI); } -ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, +ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle) : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), @@ -187,8 +187,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Insert the architecture feature derived from the target triple into the // feature string. This is important for setting features that are implied // based on the architecture version. - std::string ArchFS = - ARM_MC::ParseARMTriple(TargetTriple.getTriple(), CPUString); + std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple, CPUString); if (!FS.empty()) { if (!ArchFS.empty()) ArchFS = (Twine(ArchFS) + "," + FS).str(); @@ -338,7 +337,7 @@ bool ARMSubtarget::hasSinCos() const { } // This overrides the PostRAScheduler bit in the SchedModel for any CPU. -bool ARMSubtarget::enablePostMachineScheduler() const { +bool ARMSubtarget::enablePostRAScheduler() const { return (!isThumb() || hasThumb2()); } diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h index 77ceb08..f00594f 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -237,8 +237,8 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. /// - ARMSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle); + ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const ARMBaseTargetMachine &TM, bool IsLittle); /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -430,7 +430,7 @@ public: bool hasSinCos() const; /// True for some subtargets at > -O0. - bool enablePostMachineScheduler() const override; + bool enablePostRAScheduler() const override; // enableAtomicExpand- True if we need to expand our atomics. bool enableAtomicExpand() const override; @@ -453,6 +453,6 @@ public: /// True if fast-isel is used. 
bool useFastISel() const; }; -} // End llvm namespace +} // namespace llvm #endif // ARMSUBTARGET_H diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 0aceaed..104a34f 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -115,11 +115,10 @@ computeTargetABI(const Triple &TT, StringRef CPU, return TargetABI; } -static std::string computeDataLayout(StringRef TT, StringRef CPU, +static std::string computeDataLayout(const Triple &TT, StringRef CPU, const TargetOptions &Options, bool isLittle) { - const Triple Triple(TT); - auto ABI = computeTargetABI(Triple, CPU, Options); + auto ABI = computeTargetABI(TT, CPU, Options); std::string Ret = ""; if (isLittle) @@ -129,7 +128,7 @@ static std::string computeDataLayout(StringRef TT, StringRef CPU, // Big endian. Ret += "E"; - Ret += DataLayout::getManglingComponent(Triple); + Ret += DataLayout::getManglingComponent(TT); // Pointers are 32 bits and aligned to 32 bits. Ret += "-p:32:32"; @@ -159,7 +158,7 @@ static std::string computeDataLayout(StringRef TT, StringRef CPU, // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit // aligned everywhere else. - if (Triple.isOSNaCl()) + if (TT.isOSNaCl()) Ret += "-S128"; else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS) Ret += "-S64"; @@ -171,15 +170,15 @@ static std::string computeDataLayout(StringRef TT, StringRef CPU, /// TargetMachine ctor - Create an ARM architecture model. /// -ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, +ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle) : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, RM, CM, OL), - TargetABI(computeTargetABI(Triple(TT), CPU, Options)), - TLOF(createTLOF(Triple(getTargetTriple()))), + TargetABI(computeTargetABI(TT, CPU, Options)), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) { // Default to triple-appropriate float ABI @@ -234,8 +233,9 @@ TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { void ARMTargetMachine::anchor() { } -ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, +ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle) : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { @@ -247,7 +247,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, void ARMLETargetMachine::anchor() { } -ARMLETargetMachine::ARMLETargetMachine(const Target &T, StringRef TT, +ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -256,7 +256,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, StringRef TT, void ARMBETargetMachine::anchor() { } -ARMBETargetMachine::ARMBETargetMachine(const Target &T, StringRef TT, +ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -265,19 +265,18 @@ 
ARMBETargetMachine::ARMBETargetMachine(const Target &T, StringRef TT, void ThumbTargetMachine::anchor() { } -ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, +ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, - isLittle) { + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { initAsmInfo(); } void ThumbLETargetMachine::anchor() { } -ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, StringRef TT, +ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -286,7 +285,7 @@ ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, StringRef TT, void ThumbBETargetMachine::anchor() { } -ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, StringRef TT, +ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -355,8 +354,7 @@ bool ARMPassConfig::addPreISel() { bool ARMPassConfig::addInstSelector() { addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); - if (Triple(TM->getTargetTriple()).isOSBinFormatELF() && - TM->Options.EnableFastISel) + if (TM->getTargetTriple().isOSBinFormatELF() && TM->Options.EnableFastISel) addPass(createARMGlobalBaseRegPass()); return false; } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h index 20ca97b..8c98e08 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -36,12 +36,10 @@ protected: mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap; public: - ARMBaseTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, + ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle); + CodeGenOpt::Level OL, bool isLittle); ~ARMBaseTargetMachine() override; const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } @@ -64,8 +62,8 @@ public: class ARMTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); public: - ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, + ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); }; @@ -74,8 +72,8 @@ class ARMTargetMachine : public ARMBaseTargetMachine { class ARMLETargetMachine : public ARMTargetMachine { void anchor() override; public: - ARMLETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -85,9 +83,10 @@ public: class ARMBETargetMachine : public ARMTargetMachine { void anchor() override; public: - ARMBETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, - 
CodeModel::Model CM, CodeGenOpt::Level OL); + ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// ThumbTargetMachine - Thumb target machine. @@ -97,9 +96,10 @@ public: class ThumbTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); public: - ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); + ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, + bool isLittle); }; /// ThumbLETargetMachine - Thumb little endian target machine. @@ -107,7 +107,7 @@ public: class ThumbLETargetMachine : public ThumbTargetMachine { void anchor() override; public: - ThumbLETargetMachine(const Target &T, StringRef TT, StringRef CPU, + ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); @@ -118,7 +118,7 @@ public: class ThumbBETargetMachine : public ThumbTargetMachine { void anchor() override; public: - ThumbBETargetMachine(const Target &T, StringRef TT, StringRef CPU, + ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 8bcbb11..35387d3 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -5841,7 +5841,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // do and don't have a cc_out optional-def operand. With some spot-checks // of the operand list, we can figure out which variant we're trying to // parse and adjust accordingly before actually matching. We shouldn't ever - // try to remove a cc_out operand that was explicitly set on the the + // try to remove a cc_out operand that was explicitly set on the // mnemonic, of course (CarrySetting == true). Reason number #317 the // table driven matcher doesn't fit well with the ARM instruction set. if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 097ec04..f973a8d 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -81,7 +81,7 @@ namespace { private: std::vector<unsigned char> ITStates; }; -} +} // namespace namespace { /// ARM disassembler for all ARM platforms. 
@@ -118,7 +118,7 @@ private: DecodeStatus AddThumbPredicate(MCInst&) const; void UpdateThumbVFPPredicate(MCInst&) const; }; -} +} // namespace static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index be23e90..1114635 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -744,10 +744,9 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } MCAsmBackend *llvm::createARMAsmBackend(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU, bool isLittle) { - Triple TheTriple(TT); - + const MCRegisterInfo &MRI, + const Triple &TheTriple, StringRef CPU, + bool isLittle) { switch (TheTriple.getObjectFormat()) { default: llvm_unreachable("unsupported object format"); @@ -764,38 +763,38 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S) .Default(MachO::CPU_SUBTYPE_ARM_V7); - return new ARMAsmBackendDarwin(T, TT, CS); + return new ARMAsmBackendDarwin(T, TheTriple, CS); } case Triple::COFF: assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); - return new ARMAsmBackendWinCOFF(T, TT); + return new ARMAsmBackendWinCOFF(T, TheTriple); case Triple::ELF: assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target"); - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); - return new ARMAsmBackendELF(T, TT, OSABI, isLittle); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ARMAsmBackendELF(T, TheTriple, OSABI, isLittle); } } MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { + const Triple &TT, StringRef CPU) { return createARMAsmBackend(T, MRI, TT, CPU, true); } MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { + const Triple &TT, StringRef CPU) { return createARMAsmBackend(T, MRI, TT, CPU, false); } MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { + const Triple &TT, StringRef CPU) { return createARMAsmBackend(T, MRI, TT, CPU, true); } MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { + const Triple &TT, StringRef CPU) { return createARMAsmBackend(T, MRI, TT, CPU, false); } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 4e60372..6b4abd5 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -23,9 +23,10 @@ class ARMAsmBackend : public MCAsmBackend { bool isThumbMode; // Currently emitting Thumb code. bool IsLittleEndian; // Big or little endian. 
public: - ARMAsmBackend(const Target &T, StringRef TT, bool IsLittle) + ARMAsmBackend(const Target &T, const Triple &TT, bool IsLittle) : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")), - isThumbMode(TT.startswith("thumb")), IsLittleEndian(IsLittle) {} + isThumbMode(TT.getArchName().startswith("thumb")), + IsLittleEndian(IsLittle) {} ~ARMAsmBackend() override { delete STI; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h index ebef789..e28f6e0 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h @@ -18,7 +18,8 @@ namespace { class ARMAsmBackendDarwin : public ARMAsmBackend { public: const MachO::CPUSubTypeARM Subtype; - ARMAsmBackendDarwin(const Target &T, StringRef TT, MachO::CPUSubTypeARM st) + ARMAsmBackendDarwin(const Target &T, const Triple &TT, + MachO::CPUSubTypeARM st) : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) { HasDataInCodeSupport = true; } @@ -28,6 +29,6 @@ public: Subtype); } }; -} +} // namespace #endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h index 263c4c4..412feb8 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h @@ -15,13 +15,14 @@ namespace { class ARMAsmBackendELF : public ARMAsmBackend { public: uint8_t OSABI; - ARMAsmBackendELF(const Target &T, StringRef TT, uint8_t OSABI, bool IsLittle) + ARMAsmBackendELF(const Target &T, const Triple &TT, uint8_t OSABI, + bool IsLittle) : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createARMELFObjectWriter(OS, OSABI, isLittle()); } }; -} +} // namespace #endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h index f2c4358..170f59a 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h @@ -15,8 +15,8 @@ using namespace llvm; namespace { class ARMAsmBackendWinCOFF : public ARMAsmBackend { public: - ARMAsmBackendWinCOFF(const Target &T, StringRef Triple) - : ARMAsmBackend(T, Triple, true) {} + ARMAsmBackendWinCOFF(const Target &T, const Triple &TheTriple) + : ARMAsmBackend(T, TheTriple, true) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false); } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 4289a73..1975bca 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -114,7 +114,7 @@ namespace ARM_PROC { case ID: return "id"; } } -} +} // namespace ARM_PROC namespace ARM_MB { // The Memory Barrier Option constants map directly to the 4-bit encoding of @@ -459,6 +459,6 @@ namespace ARMII { } // end namespace ARMII -} // end namespace llvm; +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 804d353..9fe27fb 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ 
b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -40,7 +40,7 @@ namespace { bool needsRelocateWithSymbol(const MCSymbol &Sym, unsigned Type) const override; }; -} +} // namespace ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 6e3af73..bbc0b37 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1324,7 +1324,7 @@ MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) { MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { - Triple TT(STI.getTargetTriple()); + const Triple &TT = STI.getTargetTriple(); if (TT.getObjectFormat() == Triple::ELF) return new ARMTargetELFStreamer(S); return new ARMTargetStreamer(S); @@ -1345,6 +1345,6 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, return S; } -} +} // namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h index 46ba571..23ef501 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h @@ -104,7 +104,7 @@ enum Fixups { LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; -} -} +} // namespace ARM +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 84bb092..b885783 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -57,7 +57,7 @@ public: return isThumb(STI) && STI.getFeatureBits()[ARM::FeatureThumb2]; } bool isTargetMachO(const MCSubtargetInfo &STI) const { - Triple TT(STI.getTargetTriple()); + const Triple &TT = STI.getTargetTriple(); return TT.isOSBinFormatMachO(); } @@ -1065,7 +1065,7 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, // it's just a plain immediate expression, previously those evaluated to // the lower 16 bits of the expression regardless of whether // we have a movt or a movw, but that led to misleadingly results. - // This is now disallowed in the the AsmParser in validateInstruction() + // This is disallowed in the AsmParser in validateInstruction() // so this should never happen. 
llvm_unreachable("expression without :upper16: or :lower16:"); } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 92c4d6a..0fb395e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -130,16 +130,13 @@ static bool getARMLoadDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, #define GET_SUBTARGETINFO_MC_DESC #include "ARMGenSubtargetInfo.inc" - -std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { - Triple triple(TT); - - bool isThumb = triple.getArch() == Triple::thumb || - triple.getArch() == Triple::thumbeb; +std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { + bool isThumb = + TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb; bool NoCPU = CPU == "generic" || CPU.empty(); std::string ARMArchFeature; - switch (triple.getSubArch()) { + switch (TT.getSubArch()) { default: llvm_unreachable("invalid sub-architecture for ARM"); case Triple::ARMSubArch_v8: @@ -240,7 +237,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { ARMArchFeature += ",+thumb-mode"; } - if (triple.isOSNaCl()) { + if (TT.isOSNaCl()) { if (ARMArchFeature.empty()) ARMArchFeature = "+nacl-trap"; else @@ -250,8 +247,8 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { return ARMArchFeature; } -MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { +MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU); if (!FS.empty()) { if (!ArchFS.empty()) @@ -332,10 +329,9 @@ static MCInstPrinter *createARMMCInstPrinter(const Triple &T, return nullptr; } -static MCRelocationInfo *createARMMCRelocationInfo(StringRef TT, +static MCRelocationInfo *createARMMCRelocationInfo(const Triple &TT, MCContext &Ctx) { - Triple TheTriple(TT); - if (TheTriple.isOSBinFormatMachO()) + if (TT.isOSBinFormatMachO()) return createARMMachORelocationInfo(Ctx); // Default to the stock relocation info. return llvm::createMCRelocationInfo(TT, Ctx); @@ -374,7 +370,7 @@ public: } }; -} +} // namespace static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) { return new ARMMCInstrAnalysis(Info); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 24ca567..c6f2d13 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -40,12 +40,12 @@ extern Target TheARMLETarget, TheThumbLETarget; extern Target TheARMBETarget, TheThumbBETarget; namespace ARM_MC { - std::string ParseARMTriple(StringRef TT, StringRef CPU); +std::string ParseARMTriple(const Triple &TT, StringRef CPU); - /// Create a ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc. - /// do not need to go through TargetRegistry. - MCSubtargetInfo *createARMMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS); +/// Create a ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc. +/// do not need to go through TargetRegistry. 
+MCSubtargetInfo *createARMMCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS); } MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S); @@ -65,20 +65,22 @@ MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU, + const Triple &TT, StringRef CPU, bool IsLittleEndian); MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); -MCAsmBackend *createThumbLEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); +MCAsmBackend *createThumbLEAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); -MCAsmBackend *createThumbBEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); +MCAsmBackend *createThumbBEAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); // Construct a PE/COFF machine code streamer which will generate a PE/COFF // object file. @@ -101,7 +103,7 @@ MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, /// Construct ARM Mach-O relocation info. MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx); -} // End llvm namespace +} // namespace llvm // Defines symbolic names for ARM registers. This defines a mapping from // register name to register number. diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 95d7ea7..6ac778e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -56,7 +56,7 @@ public: const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; }; -} +} // namespace static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType, unsigned &Log2Size) { diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp index 173cc93..32481e2 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -60,7 +60,7 @@ namespace { EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH); } }; -} +} // namespace void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) { if (RegSave == 0u) diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 166c04b..34b552f 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -79,7 +79,7 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16; } -} +} // namespace namespace llvm { MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index b993b1b..6515a65 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp 
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -35,7 +35,7 @@ void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { getAssembler().setIsThumbFunc(Symbol); } -} +} // namespace MCStreamer *llvm::createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp index ed2deea..ca98f69 100644 --- a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp +++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -71,7 +71,7 @@ namespace { bool ExpandFPMLxInstructions(MachineBasicBlock &MBB); }; char MLxExpansion::ID = 0; -} +} // namespace void MLxExpansion::clearStack() { std::fill(LastMIs, LastMIs + 4, nullptr); diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h index 31d5732..e5e89fa 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -47,6 +47,6 @@ public: MachineBasicBlock::iterator MI) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h index f3f493d..31b4df2 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -58,6 +58,6 @@ private: void expandLoadStackGuard(MachineBasicBlock::iterator MI, Reloc::Model RM) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp index 68736bc..7ce602d 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -48,7 +48,7 @@ namespace { bool InsertITInstructions(MachineBasicBlock &MBB); }; char Thumb2ITBlockPass::ID = 0; -} +} // namespace /// TrackDefUses - Tracking what registers are being defined and used by /// instructions in the IT block. This also tracks "dependencies", i.e. 
uses diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 916ab06..d186dfb 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -73,6 +73,6 @@ private: ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg); -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index d9ab824..0dd1b4c 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -202,7 +202,7 @@ namespace { std::function<bool(const Function &)> PredicateFtor; }; char Thumb2SizeReduce::ID = 0; -} +} // namespace Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor) : MachineFunctionPass(ID), PredicateFtor(Ftor) { diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h index 23aaff3..e55f88f 100644 --- a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h @@ -60,6 +60,6 @@ public: int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 10ec658..9d0aa7a9 100644 --- a/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -44,7 +44,7 @@ public: const char *Modifier = nullptr); void EmitInstruction(const MachineInstr *MI) override; }; -} +} // namespace void BPFAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, const char *Modifier) { diff --git a/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h index 3b9fc44..a6fe7c9 100644 --- a/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h +++ b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h @@ -37,5 +37,5 @@ public: MBB.erase(MI); } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp index d9e654c..b49de3a 100644 --- a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -51,7 +51,7 @@ private: // Complex Pattern for address selection. 
bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset); }; -} +} // namespace // ComplexPattern used on BPF Load/Store instructions bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) { diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp index 38c56bb..21d160d 100644 --- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -86,7 +86,7 @@ public: }; int DiagnosticInfoUnsupported::KindID = 0; -} +} // namespace BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, const BPFSubtarget &STI) diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h index ec71dca..b56bb39 100644 --- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h @@ -85,6 +85,6 @@ private: return true; } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 28bd0ec..83d14ef 100644 --- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -133,7 +133,7 @@ bool BPFInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, unsigned BPFInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h index 4056c2e..bd96f76 100644 --- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -51,10 +51,9 @@ public: unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h index 054e894..ba91897 100644 --- a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h +++ b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h @@ -38,6 +38,6 @@ public: MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h index 7072dd0..44977a2 100644 --- a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h +++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h @@ -35,6 +35,6 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo { unsigned getFrameRegister(const MachineFunction &MF) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp index 7f7a262..65acd58 100644 --- a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp @@ -25,7 +25,7 @@ using namespace llvm; void BPFSubtarget::anchor() {} -BPFSubtarget::BPFSubtarget(const std::string &TT, const std::string &CPU, +BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) : BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this), TLInfo(TM, *this), 
TSInfo(TM.getDataLayout()) {} diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h index 347cffd..701ac57 100644 --- a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h +++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h @@ -38,8 +38,8 @@ class BPFSubtarget : public BPFGenSubtargetInfo { public: // This constructor initializes the data members to match that // of the specified triple. - BPFSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, const TargetMachine &TM); + BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const TargetMachine &TM); // ParseSubtargetFeatures - Parses features string setting specified // subtarget options. Definition of function is auto generated by tblgen. @@ -59,6 +59,6 @@ public: return &InstrInfo.getRegisterInfo(); } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 3329d5f..5a888a9 100644 --- a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -29,19 +29,20 @@ extern "C" void LLVMInitializeBPFTarget() { } // DataLayout: little or big endian -static std::string computeDataLayout(StringRef TT) { - if (Triple(TT).getArch() == Triple::bpfeb) +static std::string computeDataLayout(const Triple &TT) { + if (TT.getArch() == Triple::bpfeb) return "E-m:e-p:64:64-i64:64-n32:64-S128"; else return "e-m:e-p:64:64-i64:64-n32:64-S128"; } -BPFTargetMachine::BPFTargetMachine(const Target &T, StringRef TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, +BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, - Options, RM, CM, OL), + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); @@ -59,7 +60,7 @@ public: bool addInstSelector() override; }; -} +} // namespace TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { return new BPFPassConfig(this, PM); diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h index 6aeafb9..c715fd5 100644 --- a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h +++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h @@ -23,8 +23,8 @@ class BPFTargetMachine : public LLVMTargetMachine { BPFSubtarget Subtarget; public: - BPFTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, + BPFTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); const BPFSubtarget *getSubtargetImpl() const { return &Subtarget; } @@ -38,6 +38,6 @@ public: return TLOF.get(); } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h index adcaff6..cb07471 100644 --- a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h +++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h @@ -37,6 +37,6 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); 
}; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 7b1d925..33aecb7 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -84,16 +84,16 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { return createBPFELFObjectWriter(OS, 0, IsLittleEndian); } -} +} // namespace MCAsmBackend *llvm::createBPFAsmBackend(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU) { + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { return new BPFAsmBackend(/*IsLittleEndian=*/true); } MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU) { + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { return new BPFAsmBackend(/*IsLittleEndian=*/false); } diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 05ba618..ef4f05f 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -25,7 +25,7 @@ protected: unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} +} // namespace BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_NONE, diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index d63bbf4..2237654 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -36,6 +36,6 @@ public: HasDotTypeDotSizeDirective = false; } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index dc4ede3..b579afd 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -58,7 +58,7 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; }; -} +} // namespace MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 7cedba9..3e928fc 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -46,8 +46,8 @@ static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) { return X; } -static MCSubtargetInfo *createBPFMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { +static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); InitBPFMCSubtargetInfo(X, TT, CPU, FS); return X; diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h index a9ba7d9..3d2583a 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h @@ -25,8 +25,9 @@ class MCInstrInfo; 
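Aside on the BPF hunks above (not part of the patch): the constructors and MC hooks now take a parsed const Triple &TT instead of a raw StringRef, so endianness decisions such as computeDataLayout no longer re-parse a "bpfeb-..." string at each call site. A minimal standalone sketch of that decision, using a stand-in TripleLike type rather than the real llvm::Triple; the layout strings are the ones from the hunk above.

#include <string>

// Stand-in for the relevant bit of llvm::Triple: the target machine now
// receives a parsed triple, so the endianness check is a plain field lookup.
enum class Arch { bpfel, bpfeb };

struct TripleLike {
  Arch A;
  Arch getArch() const { return A; }
};

// Mirrors computeDataLayout in the BPF hunk above: big-endian triples get
// the "E-..." layout string, little-endian ones the "e-..." string.
static std::string computeDataLayout(const TripleLike &TT) {
  if (TT.getArch() == Arch::bpfeb)
    return "E-m:e-p:64:64-i64:64-n32:64-S128";
  return "e-m:e-p:64:64-i64:64-n32:64-S128";
}

int main() {
  return computeDataLayout({Arch::bpfeb})[0] == 'E' ? 0 : 1;
}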
class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; -class Target; class StringRef; +class Target; +class Triple; class raw_ostream; class raw_pwrite_stream; @@ -42,13 +43,13 @@ MCCodeEmitter *createBPFbeMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); MCObjectWriter *createBPFELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, bool IsLittleEndian); -} +} // namespace llvm // Defines symbolic names for BPF registers. This defines a mapping from // register name to register number. diff --git a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp index b837798..9c9c097 100644 --- a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp +++ b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp @@ -513,6 +513,7 @@ void CppWriter::printAttributes(const AttributeSet &PAL, HANDLE_ATTR(StackProtect); HANDLE_ATTR(StackProtectReq); HANDLE_ATTR(StackProtectStrong); + HANDLE_ATTR(SafeStack); HANDLE_ATTR(NoCapture); HANDLE_ATTR(NoRedZone); HANDLE_ATTR(NoImplicitFloat); @@ -2148,7 +2149,8 @@ char CppWriter::ID = 0; bool CPPTargetMachine::addPassesToEmitFile( PassManagerBase &PM, raw_pwrite_stream &o, CodeGenFileType FileType, - bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter) { + bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter, + MachineFunctionInitializer *MFInitializer) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; auto FOut = llvm::make_unique<formatted_raw_ostream>(o); diff --git a/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h b/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h index 02d705e..0cd20da 100644 --- a/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h +++ b/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h @@ -23,21 +23,21 @@ namespace llvm { class formatted_raw_ostream; struct CPPTargetMachine : public TargetMachine { - CPPTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, + CPPTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : TargetMachine(T, "", TT, CPU, FS, Options) {} public: bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, bool DisableVerify, - AnalysisID StartAfter, - AnalysisID StopAfter) override; + AnalysisID StartAfter, AnalysisID StopAfter, + MachineFunctionInitializer *MFInitializer) override; }; extern Target TheCppBackendTarget; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 14f9d77..837838a 100644 --- a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -53,7 +53,7 @@ public: raw_ostream &VStream, raw_ostream &CStream) const override; }; -} +} // namespace static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -69,6 +69,33 @@ static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op, raw_ostream 
&os); static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst); +static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s10ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s8ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); + static const uint16_t IntRegDecoderTable[] = { Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, @@ -356,6 +383,97 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( return Result; } +static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<16>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<12>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<11>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<12>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<13>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<14>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s10ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<10>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s8ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, + const void *Decoder) { + uint64_t imm = SignExtend64<8>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; 
+} + +static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<6>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<4>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<5>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<6>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + +static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + uint64_t imm = SignExtend64<7>(tmp); + MI.addOperand(MCOperand::createImm(imm)); + return MCDisassembler::Success; +} + // These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td enum subInstBinaryValues { V4_SA1_addi_BITS = 0x0000, diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h index 6e2ecaf..b24d24a 100644 --- a/contrib/llvm/lib/Target/Hexagon/Hexagon.h +++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h @@ -15,50 +15,6 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGON_H #define LLVM_LIB_TARGET_HEXAGON_HEXAGON_H -#include "MCTargetDesc/HexagonMCTargetDesc.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { - class FunctionPass; - class HexagonAsmPrinter; - class HexagonTargetMachine; - class MachineInstr; - class MCInst; - class ModulePass; - class raw_ostream; - class TargetMachine; - - FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, - CodeGenOpt::Level OptLevel); - FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM); - FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM); - FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); - FunctionPass *createHexagonCFGOptimizer(); - - FunctionPass *createHexagonSplitConst32AndConst64(); - FunctionPass *createHexagonExpandPredSpillCode(); - FunctionPass *createHexagonHardwareLoops(); - FunctionPass *createHexagonPeephole(); - FunctionPass *createHexagonFixupHwLoops(); - FunctionPass *createHexagonNewValueJump(); - FunctionPass *createHexagonCopyToCombine(); - FunctionPass *createHexagonPacketizer(); - FunctionPass *createHexagonNewValueJump(); - -/* TODO: object output. - MCCodeEmitter *createHexagonMCCodeEmitter(const Target &, - const TargetMachine &TM, - MCContext &Ctx); -*/ -/* TODO: assembler input. - TargetAsmBackend *createHexagonAsmBackend(const Target &, - const std::string &); -*/ - void HexagonLowerToMC(MachineInstr const *MI, MCInst &MCI, - HexagonAsmPrinter &AP); -} // end namespace llvm; - #define Hexagon_POINTER_SIZE 4 #define Hexagon_PointerSize (Hexagon_POINTER_SIZE) @@ -75,7 +31,7 @@ namespace llvm { // Maximum number of words and instructions in a packet. #define HEXAGON_PACKET_SIZE 4 - +#define HEXAGON_MAX_PACKET_SIZE (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE) // Minimum number of instructions in an end-loop packet. 
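Aside on the Hexagon disassembler decoders added above (not part of the patch): each one sign-extends a fixed-width field and attaches it as an immediate operand, and the scaled variants widen the field by the scale amount, which is why s4_1ImmDecoder uses SignExtend64<5>, s4_2ImmDecoder uses <6>, s4_3ImmDecoder uses <7>, and s11_1/s11_2/s11_3 use <12>/<13>/<14>. A minimal standalone sketch of that sign-extension step; SignExtend64 here is a local re-implementation of the LLVM helper, not the real one.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Local stand-in for llvm::SignExtend64<B>: sign-extend the low B bits of x.
template <unsigned B> int64_t SignExtend64(uint64_t x) {
  static_assert(B > 0 && B <= 64, "bit width out of range");
  return int64_t(x << (64 - B)) >> (64 - B);
}

int main() {
  // s8: 0xFF in an 8-bit field decodes to -1.
  assert(SignExtend64<8>(0xFFu) == -1);
  // s4_1: a 4-bit immediate scaled by 2 lives in a 5-bit field,
  // so 0b11110 decodes to -2.
  assert(SignExtend64<5>(0x1Eu) == -2);
  // s11_3: an 11-bit immediate scaled by 8 lives in a 14-bit field.
  assert(SignExtend64<14>(0x2000u) == -8192);
  std::printf("all sign-extension checks passed\n");
  return 0;
}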
#define HEXAGON_PACKET_INNER_SIZE 2 #define HEXAGON_PACKET_OUTER_SIZE 3 @@ -83,4 +39,25 @@ namespace llvm { // including a compound one or a duplex or an extender. #define HEXAGON_PRESHUFFLE_PACKET_SIZE (HEXAGON_PACKET_SIZE + 3) +// Name of the global offset table as defined by the Hexagon ABI +#define HEXAGON_GOT_SYM_NAME "_GLOBAL_OFFSET_TABLE_" + +#include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + class MachineInstr; + class MCInst; + class MCInstrInfo; + class HexagonAsmPrinter; + class HexagonTargetMachine; + + void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, + HexagonAsmPrinter &AP); + + /// \brief Creates a Hexagon-specific Target Transformation Info pass. + ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM); +} // namespace llvm + #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h index 792fc8b..f09a5b9 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -53,6 +53,6 @@ namespace llvm { static const char *getRegisterName(unsigned RegNo); }; -} // end of llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 703e691..ff1a4fe 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -28,6 +28,7 @@ using namespace llvm; #define DEBUG_TYPE "hexagon_cfg" namespace llvm { + FunctionPass *createHexagonCFGOptimizer(); void initializeHexagonCFGOptimizerPass(PassRegistry&); } @@ -227,7 +228,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { } return true; } -} +} // namespace //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 1d6455c..9fd863f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -49,6 +49,7 @@ MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store", "consider the store still to be newifiable")); namespace llvm { + FunctionPass *createHexagonCopyToCombine(); void initializeHexagonCopyToCombinePass(PassRegistry&); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 37ed173..33766df 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -173,7 +173,7 @@ namespace { bool coalesceRegisters(RegisterRef R1, RegisterRef R2); bool coalesceSegments(MachineFunction &MF); }; -} +} // namespace char HexagonExpandCondsets::ID = 0; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index 40059fb..1657d88 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -41,6 +41,7 @@ using namespace llvm; namespace llvm { + FunctionPass *createHexagonExpandPredSpillCode(); void initializeHexagonExpandPredSpillCodePass(PassRegistry&); } @@ -332,7 +333,7 @@ bool 
HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { return true; } -} +} // namespace //===----------------------------------------------------------------------===// // Public Constructor Functions diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp index 3d786a9..3ea77cd 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp @@ -30,6 +30,7 @@ static cl::opt<unsigned> MaxLoopRange( cl::desc("Restrict range of loopN instructions (testing only)")); namespace llvm { + FunctionPass *createHexagonFixupHwLoops(); void initializeHexagonFixupHwLoopsPass(PassRegistry&); } @@ -66,7 +67,7 @@ namespace { }; char HexagonFixupHwLoops::ID = 0; -} +} // namespace INITIALIZE_PASS(HexagonFixupHwLoops, "hwloopsfixup", "Hexagon Hardware Loops Fixup", false, false) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 868f87e..9797134 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -238,7 +238,7 @@ namespace { return true; return false; } -} +} // namespace /// Implements shrink-wrapping of the stack frame. By default, stack frame diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h index 89500cb..767e13c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -99,6 +99,6 @@ private: bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index db72899..53b6bf6 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -63,6 +63,7 @@ static cl::opt<bool> HWCreatePreheader("hexagon-hwloop-preheader", STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); namespace llvm { + FunctionPass *createHexagonHardwareLoops(); void initializeHexagonHardwareLoopsPass(PassRegistry&); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 7a213aa..9123057 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -104,6 +104,7 @@ public: SDNode *SelectConstantFP(SDNode *N); SDNode *SelectAdd(SDNode *N); SDNode *SelectBitOp(SDNode *N); + bool isConstExtProfitable(SDNode *N) const; // XformMskToBitPosU5Imm - Returns the bit position which // the single bit 32 bit mask represents. @@ -1327,6 +1328,20 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } +bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const { + unsigned UseCount = 0; + unsigned CallCount = 0; + for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { + // Ignore call instructions. 
+ if (I->getOpcode() == ISD::CopyToReg) + ++CallCount; + UseCount++; + } + + return (UseCount <= 1) || (CallCount > 1); + +} + void HexagonDAGToDAGISel::PreprocessISelDAG() { SelectionDAG &DAG = *CurDAG; std::vector<SDNode*> Nodes; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 74d92ae..1a14c88 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -95,7 +95,7 @@ public: unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; } }; -} +} // namespace // Implement calling convention for Hexagon. static bool @@ -397,7 +397,9 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { // If either no tail call or told not to tail call at all, don't. - if (!CI->isTailCall() || HTM.Options.DisableTailCalls) + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; return true; @@ -486,7 +488,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon); - if (DAG.getTarget().Options.DisableTailCalls) + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + if (Attr.getValueAsString() == "true") isTailCall = false; if (isTailCall) { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h index b80e847..b9d18df 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -86,7 +86,7 @@ bool isPositiveHalfWord(SDNode *N); OP_END }; - } + } // namespace HexagonISD class HexagonSubtarget; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index e566a97..3cb0823 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -159,7 +159,7 @@ findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, unsigned HexagonInstrInfo::InsertBranch( MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const { + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { Opcode_t BOpc = Hexagon::J2_jump; Opcode_t BccOpc = Hexagon::J2_jumpt; @@ -1013,7 +1013,7 @@ int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const { bool HexagonInstrInfo:: PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Cond) const { + ArrayRef<MachineOperand> Cond) const { if (Cond.empty() || isEndLoopN(Cond[0].getImm())) { DEBUG(dbgs() << "\nCannot predicate:"; MI->dump();); return false; @@ -1162,8 +1162,8 @@ HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, bool HexagonInstrInfo:: -SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const { +SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { // TODO: Fix this return false; } @@ -1982,8 +1982,7 @@ bool HexagonInstrInfo::PredOpcodeHasJMP_c(Opcode_t Opcode) const { (Opcode == Hexagon::J2_jumpf); } -bool HexagonInstrInfo::predOpcodeHasNot( - const SmallVectorImpl<MachineOperand> &Cond) const { +bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const { if (Cond.empty() || 
!isPredicated(Cond[0].getImm())) return false; return !isPredicatedTrue(Cond[0].getImm()); @@ -1994,7 +1993,7 @@ bool HexagonInstrInfo::isEndLoopN(Opcode_t Opcode) const { Opcode == Hexagon::ENDLOOP1); } -bool HexagonInstrInfo::getPredReg(const SmallVectorImpl<MachineOperand> &Cond, +bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const { if (Cond.empty()) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index a7ae65e..91f508e 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -69,8 +69,7 @@ public: unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; bool analyzeCompare(const MachineInstr *MI, @@ -129,7 +128,7 @@ public: bool isBranch(const MachineInstr *MI) const; bool isPredicable(MachineInstr *MI) const override; bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Cond) const override; + ArrayRef<MachineOperand> Cond) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, @@ -149,8 +148,8 @@ public: bool isPredicatedNew(unsigned Opcode) const; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; - bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const override; + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; @@ -222,15 +221,14 @@ public: bool NonExtEquivalentExists (const MachineInstr *MI) const; short getNonExtOpcode(const MachineInstr *MI) const; bool PredOpcodeHasJMP_c(Opcode_t Opcode) const; - bool predOpcodeHasNot(const SmallVectorImpl<MachineOperand> &Cond) const; + bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; bool isEndLoopN(Opcode_t Opcode) const; - bool getPredReg(const SmallVectorImpl<MachineOperand> &Cond, - unsigned &PredReg, unsigned &PredRegPos, - unsigned &PredRegFlags) const; + bool getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg, + unsigned &PredRegPos, unsigned &PredRegFlags) const; int getCondOpcode(int Opc, bool sense) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td index 4275230..1d0d015 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td @@ -676,6 +676,7 @@ def : Pat <(int_hexagon_A2_tfrih (I32:$Rs), u16_0ImmPred:$Is), // Transfer Register/immediate. 
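Aside on the HexagonInstrInfo hunks above (not part of the patch): InsertBranch, PredicateInstruction, SubsumesPredicate, predOpcodeHasNot and getPredReg switch from const SmallVectorImpl<MachineOperand> & to ArrayRef<MachineOperand>, a read-only view that accepts any contiguous container without tying callers to one vector type. A simplified stand-in sketch of that idea; the real llvm::ArrayRef has a much richer API.

#include <cstddef>
#include <iostream>
#include <vector>

// Minimal read-only view over contiguous elements, in the spirit of
// llvm::ArrayRef: it does not own the data and is cheap to pass by value.
template <typename T> class ArrayRefLike {
  const T *Data = nullptr;
  std::size_t Length = 0;
public:
  ArrayRefLike() = default;
  ArrayRefLike(const std::vector<T> &V) : Data(V.data()), Length(V.size()) {}
  ArrayRefLike(const T *D, std::size_t N) : Data(D), Length(N) {}
  bool empty() const { return Length == 0; }
  std::size_t size() const { return Length; }
  const T &operator[](std::size_t I) const { return Data[I]; }
  const T *begin() const { return Data; }
  const T *end() const { return Data + Length; }
};

// A callee written against the view works with vectors, plain arrays, etc.
static int sum(ArrayRefLike<int> Vals) {
  int S = 0;
  for (int V : Vals) S += V;
  return S;
}

int main() {
  std::vector<int> V{1, 2, 3};
  int A[] = {4, 5};
  std::cout << sum(V) + sum(ArrayRefLike<int>(A, 2)) << "\n"; // prints 15
  return 0;
}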
def : T_R_pat <A2_tfr, int_hexagon_A2_tfr>; def : T_I_pat <A2_tfrsi, int_hexagon_A2_tfrsi>; +def : T_I_pat <A2_tfrpi, int_hexagon_A2_tfrpi>; // Assembler mapped from Rdd32=Rss32 to Rdd32=combine(Rss.H32,Rss.L32) def : Pat<(int_hexagon_A2_tfrp DoubleRegs:$src), @@ -690,15 +691,15 @@ def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>; def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>; def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>; -def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32ImmPred, s8ImmPred>; +def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s8ExtPred, s8ImmPred>; def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), (I32:$Rt))), (i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>; // Mux -def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32ImmPred>; -def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32ImmPred>; -def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32ImmPred, s8ImmPred>; +def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s8ExtPred>; +def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s8ExtPred>; +def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s8ExtPred, s8ImmPred>; // Shift halfword def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>; @@ -719,17 +720,17 @@ def : T_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>; def : T_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>; def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>; -def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>; -def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>; -def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>; +def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s10ExtPred>; +def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s10ExtPred>; +def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u9ExtPred>; -def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s32ImmPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s8ExtPred:$src2)), (i32 (C2_cmpgti (I32:$src1), - (DEC_CONST_SIGNED s32ImmPred:$src2)))>; + (DEC_CONST_SIGNED s8ExtPred:$src2)))>; -def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u32ImmPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u8ExtPred:$src2)), (i32 (C2_cmpgtui (I32:$src1), - (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>; + (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>; // The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0. 
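Aside on the compare patterns just above (not part of the patch): int_hexagon_C2_cmpgei and int_hexagon_C2_cmpgeui are mapped onto the greater-than instructions with a decremented constant (DEC_CONST_SIGNED / DEC_CONST_UNSIGNED), using the identity x >= c <=> x > c - 1; for the unsigned u8 == 0 case no decrement exists, which is what the separate pattern immediately below this note handles. A small self-contained check of the identity in plain C++, independent of the TableGen patterns.

#include <cassert>
#include <cstdint>

// Signed: x >= c  <=>  x > c - 1, valid whenever c - 1 is representable.
static bool sgeViaGt(int32_t x, int32_t c) { return x > c - 1; }

// Unsigned: x >= c  <=>  x > c - 1 only for c != 0; c == 0 is always true,
// which is why the .td file keeps a separate pattern for that case.
static bool ugeViaGt(uint32_t x, uint32_t c) { return c == 0 || x > c - 1; }

int main() {
  for (int32_t c : {-100, -1, 0, 1, 100})
    for (int32_t x : {-101, -100, -1, 0, 1, 99, 100, 101})
      assert(sgeViaGt(x, c) == (x >= c));
  for (uint32_t c : {0u, 1u, 255u})
    for (uint32_t x : {0u, 1u, 254u, 255u, 256u})
      assert(ugeViaGt(x, c) == (x >= c));
  return 0;
}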
def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)), @@ -923,6 +924,10 @@ def: qi_CRInst_qiqi_pat<C2_or, int_hexagon_C2_or>; def: qi_CRInst_qiqi_pat<C2_orn, int_hexagon_C2_orn>; def: qi_CRInst_qiqi_pat<C2_xor, int_hexagon_C2_xor>; +// Assembler mapped from Pd4=Ps4 to Pd4=or(Ps4,Ps4) +def : Pat<(int_hexagon_C2_pxfer_map PredRegs:$src), + (C2_pxfer_map PredRegs:$src)>; + // Multiply 32x32 and use lower result def : T_RRI_pat <M2_macsip, int_hexagon_M2_macsip>; def : T_RRI_pat <M2_macsin, int_hexagon_M2_macsin>; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h index 7672358..5681ae2 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h @@ -80,6 +80,6 @@ public: void setStackAlignBaseVReg(unsigned R) { StackAlignBaseReg = R; } unsigned getStackAlignBaseVReg() const { return StackAlignBaseReg; } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h index 6034344..fae16e2 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h @@ -238,7 +238,7 @@ protected: #endif }; -} // namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp index 81af4db..707bfdb 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -60,6 +60,7 @@ static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden, cl::desc("Disable New Value Jumps")); namespace llvm { + FunctionPass *createHexagonNewValueJump(); void initializeHexagonNewValueJumpPass(PassRegistry&); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td index be8204b..2bece8f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td @@ -7,32 +7,24 @@ // //===----------------------------------------------------------------------===// +def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; } +def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; } +def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; } +def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; } + // Immediate operands. let PrintMethod = "printImmOperand" in { - // f32Ext type is used to identify constant extended floating point immediates. 
- def f32Ext : Operand<f32>; def s32Imm : Operand<i32>; - def s26_6Imm : Operand<i32>; - def s16Imm : Operand<i32>; - def s12Imm : Operand<i32>; - def s11Imm : Operand<i32>; - def s11_0Imm : Operand<i32>; - def s11_1Imm : Operand<i32>; - def s11_2Imm : Operand<i32>; - def s11_3Imm : Operand<i32>; - def s10Imm : Operand<i32>; - def s9Imm : Operand<i32>; - def m9Imm : Operand<i32>; def s8Imm : Operand<i32>; def s8Imm64 : Operand<i64>; def s6Imm : Operand<i32>; def s6_3Imm : Operand<i32>; def s4Imm : Operand<i32>; - def s4_0Imm : Operand<i32>; - def s4_1Imm : Operand<i32>; - def s4_2Imm : Operand<i32>; - def s4_3Imm : Operand<i32>; + def s4_0Imm : Operand<i32> { let DecoderMethod = "s4_0ImmDecoder"; } + def s4_1Imm : Operand<i32> { let DecoderMethod = "s4_1ImmDecoder"; } + def s4_2Imm : Operand<i32> { let DecoderMethod = "s4_2ImmDecoder"; } + def s4_3Imm : Operand<i32> { let DecoderMethod = "s4_3ImmDecoder"; } def u64Imm : Operand<i64>; def u32Imm : Operand<i32>; def u26_6Imm : Operand<i32>; @@ -446,17 +438,18 @@ def SetClr3ImmPred : PatLeaf<(i32 imm), [{ // Extendable immediate operands. let PrintMethod = "printExtOperand" in { - def s16Ext : Operand<i32>; - def s12Ext : Operand<i32>; - def s10Ext : Operand<i32>; - def s9Ext : Operand<i32>; - def s8Ext : Operand<i32>; + def f32Ext : Operand<f32>; + def s16Ext : Operand<i32> { let DecoderMethod = "s16ImmDecoder"; } + def s12Ext : Operand<i32> { let DecoderMethod = "s12ImmDecoder"; } + def s11_0Ext : Operand<i32> { let DecoderMethod = "s11_0ImmDecoder"; } + def s11_1Ext : Operand<i32> { let DecoderMethod = "s11_1ImmDecoder"; } + def s11_2Ext : Operand<i32> { let DecoderMethod = "s11_2ImmDecoder"; } + def s11_3Ext : Operand<i32> { let DecoderMethod = "s11_3ImmDecoder"; } + def s10Ext : Operand<i32> { let DecoderMethod = "s10ImmDecoder"; } + def s9Ext : Operand<i32> { let DecoderMethod = "s90ImmDecoder"; } + def s8Ext : Operand<i32> { let DecoderMethod = "s8ImmDecoder"; } def s7Ext : Operand<i32>; - def s6Ext : Operand<i32>; - def s11_0Ext : Operand<i32>; - def s11_1Ext : Operand<i32>; - def s11_2Ext : Operand<i32>; - def s11_3Ext : Operand<i32>; + def s6Ext : Operand<i32> { let DecoderMethod = "s6_0ImmDecoder"; } def u6Ext : Operand<i32>; def u7Ext : Operand<i32>; def u8Ext : Operand<i32>; @@ -468,6 +461,46 @@ let PrintMethod = "printExtOperand" in { def u6_3Ext : Operand<i32>; } +def s10ExtPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + if (isInt<10>(v)) + return true; + + // Return true if extending this immediate is profitable and the value + // can fit in a 32-bit signed field. + return isConstExtProfitable(Node) && isInt<32>(v); +}]>; + +def s8ExtPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + if (isInt<8>(v)) + return true; + + // Return true if extending this immediate is profitable and the value + // can fit in a 32-bit signed field. + return isConstExtProfitable(Node) && isInt<32>(v); +}]>; + +def u8ExtPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + if (isUInt<8>(v)) + return true; + + // Return true if extending this immediate is profitable and the value + // can fit in a 32-bit unsigned field. + return isConstExtProfitable(Node) && isUInt<32>(v); +}]>; + +def u9ExtPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + if (isUInt<9>(v)) + return true; + + // Return true if extending this immediate is profitable and the value + // can fit in a 32-bit unsigned field. 
+ return isConstExtProfitable(Node) && isUInt<32>(v); +}]>; + // This complex pattern exists only to create a machine instruction operand // of type "frame index". There doesn't seem to be a way to do that directly diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp index 503bfdb..94ec2e7 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp @@ -75,6 +75,7 @@ static cl::opt<bool> DisableOptExtTo64("disable-hexagon-opt-ext-to-64", cl::desc("Disable Optimization of extensions to i64.")); namespace llvm { + FunctionPass *createHexagonPeephole(); void initializeHexagonPeepholePass(PassRegistry&); } @@ -103,7 +104,7 @@ namespace { private: void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src); }; -} +} // namespace char HexagonPeephole::ID = 0; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp index 0c24075..d586c39 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp @@ -24,6 +24,7 @@ using namespace llvm; namespace llvm { + FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); void initializeHexagonRemoveExtendArgsPass(PassRegistry&); } @@ -47,7 +48,7 @@ namespace { FunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace char HexagonRemoveExtendArgs::ID = 0; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h index 8ac2e43..c72051c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h @@ -32,6 +32,6 @@ public: MachinePointerInfo SrcPtrInfo) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index 4efb5f7..61bb7c5 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -45,6 +45,11 @@ using namespace llvm; #define DEBUG_TYPE "xfer" +namespace llvm { + FunctionPass *createHexagonSplitConst32AndConst64(); + void initializeHexagonSplitConst32AndConst64Pass(PassRegistry&); +} + namespace { class HexagonSplitConst32AndConst64 : public MachineFunctionPass { @@ -151,7 +156,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { return true; } -} +} // namespace //===----------------------------------------------------------------------===// // Public Constructor Functions diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index d61cc54..fe6c4f4 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -70,8 +70,8 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { return *this; } -HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS, - const TargetMachine &TM) +HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, + StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(*TM.getDataLayout()), FrameLowering() { diff --git 
a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 780567b..34cdad7 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -52,7 +52,7 @@ private: InstrItineraryData InstrItins; public: - HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS, + HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM); /// getInstrItins - Return the instruction itineraries based on subtarget diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 0679866..90f1ced 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -61,14 +61,30 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", namespace llvm { FunctionPass *createHexagonExpandCondsets(); -} + FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, + CodeGenOpt::Level OptLevel); + FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM); + FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM); + FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); + FunctionPass *createHexagonCFGOptimizer(); + + FunctionPass *createHexagonSplitConst32AndConst64(); + FunctionPass *createHexagonExpandPredSpillCode(); + FunctionPass *createHexagonHardwareLoops(); + FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonFixupHwLoops(); + FunctionPass *createHexagonNewValueJump(); + FunctionPass *createHexagonCopyToCombine(); + FunctionPass *createHexagonPacketizer(); + FunctionPass *createHexagonNewValueJump(); +} // namespace llvm /// HexagonTargetMachine ctor - Create an ILP32 architecture model. /// /// Hexagon_TODO: Do I need an aggregate alignment? /// -HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, +HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index 5774f7e..115eadb 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -27,7 +27,7 @@ class HexagonTargetMachine : public LLVMTargetMachine { HexagonSubtarget Subtarget; public: - HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU, + HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h new file mode 100644 index 0000000..2b4a3ad --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h @@ -0,0 +1,31 @@ +//===-- HexagonTargetStreamer.h - Hexagon Target Streamer ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONTARGETSTREAMER_H +#define HEXAGONTARGETSTREAMER_H + +#include "llvm/MC/MCStreamer.h" + +namespace llvm { +class HexagonTargetStreamer : public MCTargetStreamer { +public: + HexagonTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + virtual void EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit = 0){}; + virtual void emitFAlign(unsigned Size, unsigned MaxBytesToEmit){}; + virtual void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment, + unsigned AccessGranularity){}; + virtual void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlign, + unsigned AccessGranularity){}; +}; +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 0cc59bc..66fdd65 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -57,6 +57,7 @@ static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles", cl::desc("Allow non-solo packetization of volatile memory references")); namespace llvm { + FunctionPass *createHexagonPacketizer(); void initializeHexagonPacketizerPass(PassRegistry&); } @@ -169,7 +170,7 @@ namespace { void reserveResourcesForConstExt(MachineInstr* MI); bool isNewValueInst(MachineInstr* MI); }; -} +} // namespace INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", false, false) diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 7689484..99ea2fa 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -17,11 +17,14 @@ #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; using namespace Hexagon; +#define DEBUG_TYPE "hexagon-asm-backend" + namespace { class HexagonAsmBackend : public MCAsmBackend { @@ -278,8 +281,26 @@ public: llvm_unreachable("relaxInstruction() unimplemented"); } - bool writeNopData(uint64_t /*Count*/, - MCObjectWriter * /*OW*/) const override { + bool writeNopData(uint64_t Count, + MCObjectWriter * OW) const override { + static const uint32_t Nopcode = 0x7f000000, // Hard-coded NOP. + ParseIn = 0x00004000, // In packet parse-bits. + ParseEnd = 0x0000c000; // End of packet parse-bits. + + while(Count % HEXAGON_INSTR_SIZE) { + DEBUG(dbgs() << "Alignment not a multiple of the instruction size:" << + Count % HEXAGON_INSTR_SIZE << "/" << HEXAGON_INSTR_SIZE << "\n"); + --Count; + OW->write8(0); + } + + while(Count) { + Count -= HEXAGON_INSTR_SIZE; + // Close the packet whenever a multiple of the maximum packet size remains + uint32_t ParseBits = (Count % (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE))? 
+ ParseIn: ParseEnd; + OW->write32(Nopcode | ParseBits); + } return true; } }; @@ -288,8 +309,8 @@ public: namespace llvm { MCAsmBackend *createHexagonAsmBackend(Target const &T, MCRegisterInfo const & /*MRI*/, - StringRef TT, StringRef CPU) { - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); + const Triple &TT, StringRef CPU) { + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); return new HexagonAsmBackend(T, OSABI, CPU); } } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index 8430723..0f7cf0e 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -31,318 +31,216 @@ public: unsigned GetRelocType(MCValue const &Target, MCFixup const &Fixup, bool IsPCRel) const override; }; -} +} // namespace HexagonELFObjectWriter::HexagonELFObjectWriter(uint8_t OSABI, StringRef C) : MCELFObjectTargetWriter(/*Is64bit*/ false, OSABI, ELF::EM_HEXAGON, /*HasRelocationAddend*/ true), CPU(C) {} -unsigned HexagonELFObjectWriter::GetRelocType(MCValue const &/*Target*/, +unsigned HexagonELFObjectWriter::GetRelocType(MCValue const & /*Target*/, MCFixup const &Fixup, bool IsPCRel) const { - // determine the type of the relocation - unsigned Type = (unsigned)ELF::R_HEX_NONE; - unsigned Kind = (unsigned)Fixup.getKind(); - - switch (Kind) { - default: - DEBUG(dbgs() << "unrecognized relocation " << Fixup.getKind() << "\n"); - llvm_unreachable("Unimplemented Fixup kind!"); - break; - case FK_Data_4: - Type = (IsPCRel) ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32; - break; - case FK_PCRel_4: - Type = ELF::R_HEX_32_PCREL; - break; - case FK_Data_2: - Type = ELF::R_HEX_16; - break; - case FK_Data_1: - Type = ELF::R_HEX_8; - break; - case fixup_Hexagon_B22_PCREL: - Type = ELF::R_HEX_B22_PCREL; - break; - case fixup_Hexagon_B15_PCREL: - Type = ELF::R_HEX_B15_PCREL; - break; - case fixup_Hexagon_B7_PCREL: - Type = ELF::R_HEX_B7_PCREL; - break; - case fixup_Hexagon_LO16: - Type = ELF::R_HEX_LO16; - break; - case fixup_Hexagon_HI16: - Type = ELF::R_HEX_HI16; - break; - case fixup_Hexagon_32: - Type = ELF::R_HEX_32; - break; - case fixup_Hexagon_16: - Type = ELF::R_HEX_16; - break; - case fixup_Hexagon_8: - Type = ELF::R_HEX_8; - break; - case fixup_Hexagon_GPREL16_0: - Type = ELF::R_HEX_GPREL16_0; - break; - case fixup_Hexagon_GPREL16_1: - Type = ELF::R_HEX_GPREL16_1; - break; - case fixup_Hexagon_GPREL16_2: - Type = ELF::R_HEX_GPREL16_2; - break; - case fixup_Hexagon_GPREL16_3: - Type = ELF::R_HEX_GPREL16_3; - break; - case fixup_Hexagon_HL16: - Type = ELF::R_HEX_HL16; - break; - case fixup_Hexagon_B13_PCREL: - Type = ELF::R_HEX_B13_PCREL; - break; - case fixup_Hexagon_B9_PCREL: - Type = ELF::R_HEX_B9_PCREL; - break; - case fixup_Hexagon_B32_PCREL_X: - Type = ELF::R_HEX_B32_PCREL_X; - break; - case fixup_Hexagon_32_6_X: - Type = ELF::R_HEX_32_6_X; - break; - case fixup_Hexagon_B22_PCREL_X: - Type = ELF::R_HEX_B22_PCREL_X; - break; - case fixup_Hexagon_B15_PCREL_X: - Type = ELF::R_HEX_B15_PCREL_X; - break; - case fixup_Hexagon_B13_PCREL_X: - Type = ELF::R_HEX_B13_PCREL_X; - break; - case fixup_Hexagon_B9_PCREL_X: - Type = ELF::R_HEX_B9_PCREL_X; - break; - case fixup_Hexagon_B7_PCREL_X: - Type = ELF::R_HEX_B7_PCREL_X; - break; - case fixup_Hexagon_16_X: - Type = ELF::R_HEX_16_X; - break; - case fixup_Hexagon_12_X: - Type = ELF::R_HEX_12_X; - break; - case fixup_Hexagon_11_X: - 
Type = ELF::R_HEX_11_X; - break; - case fixup_Hexagon_10_X: - Type = ELF::R_HEX_10_X; - break; - case fixup_Hexagon_9_X: - Type = ELF::R_HEX_9_X; - break; - case fixup_Hexagon_8_X: - Type = ELF::R_HEX_8_X; - break; - case fixup_Hexagon_7_X: - Type = ELF::R_HEX_7_X; - break; - case fixup_Hexagon_6_X: - Type = ELF::R_HEX_6_X; - break; - case fixup_Hexagon_32_PCREL: - Type = ELF::R_HEX_32_PCREL; - break; - case fixup_Hexagon_COPY: - Type = ELF::R_HEX_COPY; - break; - case fixup_Hexagon_GLOB_DAT: - Type = ELF::R_HEX_GLOB_DAT; - break; - case fixup_Hexagon_JMP_SLOT: - Type = ELF::R_HEX_JMP_SLOT; - break; - case fixup_Hexagon_RELATIVE: - Type = ELF::R_HEX_RELATIVE; - break; - case fixup_Hexagon_PLT_B22_PCREL: - Type = ELF::R_HEX_PLT_B22_PCREL; - break; - case fixup_Hexagon_GOTREL_LO16: - Type = ELF::R_HEX_GOTREL_LO16; - break; - case fixup_Hexagon_GOTREL_HI16: - Type = ELF::R_HEX_GOTREL_HI16; - break; - case fixup_Hexagon_GOTREL_32: - Type = ELF::R_HEX_GOTREL_32; - break; - case fixup_Hexagon_GOT_LO16: - Type = ELF::R_HEX_GOT_LO16; - break; - case fixup_Hexagon_GOT_HI16: - Type = ELF::R_HEX_GOT_HI16; - break; - case fixup_Hexagon_GOT_32: - Type = ELF::R_HEX_GOT_32; - break; - case fixup_Hexagon_GOT_16: - Type = ELF::R_HEX_GOT_16; - break; - case fixup_Hexagon_DTPMOD_32: - Type = ELF::R_HEX_DTPMOD_32; - break; - case fixup_Hexagon_DTPREL_LO16: - Type = ELF::R_HEX_DTPREL_LO16; - break; - case fixup_Hexagon_DTPREL_HI16: - Type = ELF::R_HEX_DTPREL_HI16; - break; - case fixup_Hexagon_DTPREL_32: - Type = ELF::R_HEX_DTPREL_32; - break; - case fixup_Hexagon_DTPREL_16: - Type = ELF::R_HEX_DTPREL_16; - break; - case fixup_Hexagon_GD_PLT_B22_PCREL: - Type = ELF::R_HEX_GD_PLT_B22_PCREL; - break; - case fixup_Hexagon_LD_PLT_B22_PCREL: - Type = ELF::R_HEX_LD_PLT_B22_PCREL; - break; - case fixup_Hexagon_GD_GOT_LO16: - Type = ELF::R_HEX_GD_GOT_LO16; - break; - case fixup_Hexagon_GD_GOT_HI16: - Type = ELF::R_HEX_GD_GOT_HI16; - break; - case fixup_Hexagon_GD_GOT_32: - Type = ELF::R_HEX_GD_GOT_32; - break; - case fixup_Hexagon_GD_GOT_16: - Type = ELF::R_HEX_GD_GOT_16; - break; - case fixup_Hexagon_LD_GOT_LO16: - Type = ELF::R_HEX_LD_GOT_LO16; - break; - case fixup_Hexagon_LD_GOT_HI16: - Type = ELF::R_HEX_LD_GOT_HI16; - break; - case fixup_Hexagon_LD_GOT_32: - Type = ELF::R_HEX_LD_GOT_32; - break; - case fixup_Hexagon_LD_GOT_16: - Type = ELF::R_HEX_LD_GOT_16; - break; - case fixup_Hexagon_IE_LO16: - Type = ELF::R_HEX_IE_LO16; - break; - case fixup_Hexagon_IE_HI16: - Type = ELF::R_HEX_IE_HI16; - break; - case fixup_Hexagon_IE_32: - Type = ELF::R_HEX_IE_32; - break; - case fixup_Hexagon_IE_GOT_LO16: - Type = ELF::R_HEX_IE_GOT_LO16; - break; - case fixup_Hexagon_IE_GOT_HI16: - Type = ELF::R_HEX_IE_GOT_HI16; - break; - case fixup_Hexagon_IE_GOT_32: - Type = ELF::R_HEX_IE_GOT_32; - break; - case fixup_Hexagon_IE_GOT_16: - Type = ELF::R_HEX_IE_GOT_16; - break; - case fixup_Hexagon_TPREL_LO16: - Type = ELF::R_HEX_TPREL_LO16; - break; - case fixup_Hexagon_TPREL_HI16: - Type = ELF::R_HEX_TPREL_HI16; - break; - case fixup_Hexagon_TPREL_32: - Type = ELF::R_HEX_TPREL_32; - break; - case fixup_Hexagon_TPREL_16: - Type = ELF::R_HEX_TPREL_16; - break; - case fixup_Hexagon_6_PCREL_X: - Type = ELF::R_HEX_6_PCREL_X; - break; - case fixup_Hexagon_GOTREL_32_6_X: - Type = ELF::R_HEX_GOTREL_32_6_X; - break; - case fixup_Hexagon_GOTREL_16_X: - Type = ELF::R_HEX_GOTREL_16_X; - break; - case fixup_Hexagon_GOTREL_11_X: - Type = ELF::R_HEX_GOTREL_11_X; - break; - case fixup_Hexagon_GOT_32_6_X: - Type = ELF::R_HEX_GOT_32_6_X; - break; - case 
fixup_Hexagon_GOT_16_X: - Type = ELF::R_HEX_GOT_16_X; - break; - case fixup_Hexagon_GOT_11_X: - Type = ELF::R_HEX_GOT_11_X; - break; - case fixup_Hexagon_DTPREL_32_6_X: - Type = ELF::R_HEX_DTPREL_32_6_X; - break; - case fixup_Hexagon_DTPREL_16_X: - Type = ELF::R_HEX_DTPREL_16_X; - break; - case fixup_Hexagon_DTPREL_11_X: - Type = ELF::R_HEX_DTPREL_11_X; - break; - case fixup_Hexagon_GD_GOT_32_6_X: - Type = ELF::R_HEX_GD_GOT_32_6_X; - break; - case fixup_Hexagon_GD_GOT_16_X: - Type = ELF::R_HEX_GD_GOT_16_X; - break; - case fixup_Hexagon_GD_GOT_11_X: - Type = ELF::R_HEX_GD_GOT_11_X; - break; - case fixup_Hexagon_LD_GOT_32_6_X: - Type = ELF::R_HEX_LD_GOT_32_6_X; - break; - case fixup_Hexagon_LD_GOT_16_X: - Type = ELF::R_HEX_LD_GOT_16_X; - break; - case fixup_Hexagon_LD_GOT_11_X: - Type = ELF::R_HEX_LD_GOT_11_X; - break; - case fixup_Hexagon_IE_32_6_X: - Type = ELF::R_HEX_IE_32_6_X; - break; - case fixup_Hexagon_IE_16_X: - Type = ELF::R_HEX_IE_16_X; - break; - case fixup_Hexagon_IE_GOT_32_6_X: - Type = ELF::R_HEX_IE_GOT_32_6_X; - break; - case fixup_Hexagon_IE_GOT_16_X: - Type = ELF::R_HEX_IE_GOT_16_X; - break; - case fixup_Hexagon_IE_GOT_11_X: - Type = ELF::R_HEX_IE_GOT_11_X; - break; - case fixup_Hexagon_TPREL_32_6_X: - Type = ELF::R_HEX_TPREL_32_6_X; - break; - case fixup_Hexagon_TPREL_16_X: - Type = ELF::R_HEX_TPREL_16_X; - break; - case fixup_Hexagon_TPREL_11_X: - Type = ELF::R_HEX_TPREL_11_X; - break; + switch ((unsigned)Fixup.getKind()) { + default: + DEBUG(dbgs() << "unrecognized relocation " << Fixup.getKind() << "\n"); + llvm_unreachable("Unimplemented Fixup kind!"); + return ELF::R_HEX_NONE; + case FK_Data_4: + return (IsPCRel) ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32; + case FK_PCRel_4: + return ELF::R_HEX_32_PCREL; + case FK_Data_2: + return ELF::R_HEX_16; + case FK_Data_1: + return ELF::R_HEX_8; + case fixup_Hexagon_B22_PCREL: + return ELF::R_HEX_B22_PCREL; + case fixup_Hexagon_B15_PCREL: + return ELF::R_HEX_B15_PCREL; + case fixup_Hexagon_B7_PCREL: + return ELF::R_HEX_B7_PCREL; + case fixup_Hexagon_LO16: + return ELF::R_HEX_LO16; + case fixup_Hexagon_HI16: + return ELF::R_HEX_HI16; + case fixup_Hexagon_32: + return ELF::R_HEX_32; + case fixup_Hexagon_16: + return ELF::R_HEX_16; + case fixup_Hexagon_8: + return ELF::R_HEX_8; + case fixup_Hexagon_GPREL16_0: + return ELF::R_HEX_GPREL16_0; + case fixup_Hexagon_GPREL16_1: + return ELF::R_HEX_GPREL16_1; + case fixup_Hexagon_GPREL16_2: + return ELF::R_HEX_GPREL16_2; + case fixup_Hexagon_GPREL16_3: + return ELF::R_HEX_GPREL16_3; + case fixup_Hexagon_HL16: + return ELF::R_HEX_HL16; + case fixup_Hexagon_B13_PCREL: + return ELF::R_HEX_B13_PCREL; + case fixup_Hexagon_B9_PCREL: + return ELF::R_HEX_B9_PCREL; + case fixup_Hexagon_B32_PCREL_X: + return ELF::R_HEX_B32_PCREL_X; + case fixup_Hexagon_32_6_X: + return ELF::R_HEX_32_6_X; + case fixup_Hexagon_B22_PCREL_X: + return ELF::R_HEX_B22_PCREL_X; + case fixup_Hexagon_B15_PCREL_X: + return ELF::R_HEX_B15_PCREL_X; + case fixup_Hexagon_B13_PCREL_X: + return ELF::R_HEX_B13_PCREL_X; + case fixup_Hexagon_B9_PCREL_X: + return ELF::R_HEX_B9_PCREL_X; + case fixup_Hexagon_B7_PCREL_X: + return ELF::R_HEX_B7_PCREL_X; + case fixup_Hexagon_16_X: + return ELF::R_HEX_16_X; + case fixup_Hexagon_12_X: + return ELF::R_HEX_12_X; + case fixup_Hexagon_11_X: + return ELF::R_HEX_11_X; + case fixup_Hexagon_10_X: + return ELF::R_HEX_10_X; + case fixup_Hexagon_9_X: + return ELF::R_HEX_9_X; + case fixup_Hexagon_8_X: + return ELF::R_HEX_8_X; + case fixup_Hexagon_7_X: + return ELF::R_HEX_7_X; + case fixup_Hexagon_6_X: + 
return ELF::R_HEX_6_X; + case fixup_Hexagon_32_PCREL: + return ELF::R_HEX_32_PCREL; + case fixup_Hexagon_COPY: + return ELF::R_HEX_COPY; + case fixup_Hexagon_GLOB_DAT: + return ELF::R_HEX_GLOB_DAT; + case fixup_Hexagon_JMP_SLOT: + return ELF::R_HEX_JMP_SLOT; + case fixup_Hexagon_RELATIVE: + return ELF::R_HEX_RELATIVE; + case fixup_Hexagon_PLT_B22_PCREL: + return ELF::R_HEX_PLT_B22_PCREL; + case fixup_Hexagon_GOTREL_LO16: + return ELF::R_HEX_GOTREL_LO16; + case fixup_Hexagon_GOTREL_HI16: + return ELF::R_HEX_GOTREL_HI16; + case fixup_Hexagon_GOTREL_32: + return ELF::R_HEX_GOTREL_32; + case fixup_Hexagon_GOT_LO16: + return ELF::R_HEX_GOT_LO16; + case fixup_Hexagon_GOT_HI16: + return ELF::R_HEX_GOT_HI16; + case fixup_Hexagon_GOT_32: + return ELF::R_HEX_GOT_32; + case fixup_Hexagon_GOT_16: + return ELF::R_HEX_GOT_16; + case fixup_Hexagon_DTPMOD_32: + return ELF::R_HEX_DTPMOD_32; + case fixup_Hexagon_DTPREL_LO16: + return ELF::R_HEX_DTPREL_LO16; + case fixup_Hexagon_DTPREL_HI16: + return ELF::R_HEX_DTPREL_HI16; + case fixup_Hexagon_DTPREL_32: + return ELF::R_HEX_DTPREL_32; + case fixup_Hexagon_DTPREL_16: + return ELF::R_HEX_DTPREL_16; + case fixup_Hexagon_GD_PLT_B22_PCREL: + return ELF::R_HEX_GD_PLT_B22_PCREL; + case fixup_Hexagon_LD_PLT_B22_PCREL: + return ELF::R_HEX_LD_PLT_B22_PCREL; + case fixup_Hexagon_GD_GOT_LO16: + return ELF::R_HEX_GD_GOT_LO16; + case fixup_Hexagon_GD_GOT_HI16: + return ELF::R_HEX_GD_GOT_HI16; + case fixup_Hexagon_GD_GOT_32: + return ELF::R_HEX_GD_GOT_32; + case fixup_Hexagon_GD_GOT_16: + return ELF::R_HEX_GD_GOT_16; + case fixup_Hexagon_LD_GOT_LO16: + return ELF::R_HEX_LD_GOT_LO16; + case fixup_Hexagon_LD_GOT_HI16: + return ELF::R_HEX_LD_GOT_HI16; + case fixup_Hexagon_LD_GOT_32: + return ELF::R_HEX_LD_GOT_32; + case fixup_Hexagon_LD_GOT_16: + return ELF::R_HEX_LD_GOT_16; + case fixup_Hexagon_IE_LO16: + return ELF::R_HEX_IE_LO16; + case fixup_Hexagon_IE_HI16: + return ELF::R_HEX_IE_HI16; + case fixup_Hexagon_IE_32: + return ELF::R_HEX_IE_32; + case fixup_Hexagon_IE_GOT_LO16: + return ELF::R_HEX_IE_GOT_LO16; + case fixup_Hexagon_IE_GOT_HI16: + return ELF::R_HEX_IE_GOT_HI16; + case fixup_Hexagon_IE_GOT_32: + return ELF::R_HEX_IE_GOT_32; + case fixup_Hexagon_IE_GOT_16: + return ELF::R_HEX_IE_GOT_16; + case fixup_Hexagon_TPREL_LO16: + return ELF::R_HEX_TPREL_LO16; + case fixup_Hexagon_TPREL_HI16: + return ELF::R_HEX_TPREL_HI16; + case fixup_Hexagon_TPREL_32: + return ELF::R_HEX_TPREL_32; + case fixup_Hexagon_TPREL_16: + return ELF::R_HEX_TPREL_16; + case fixup_Hexagon_6_PCREL_X: + return ELF::R_HEX_6_PCREL_X; + case fixup_Hexagon_GOTREL_32_6_X: + return ELF::R_HEX_GOTREL_32_6_X; + case fixup_Hexagon_GOTREL_16_X: + return ELF::R_HEX_GOTREL_16_X; + case fixup_Hexagon_GOTREL_11_X: + return ELF::R_HEX_GOTREL_11_X; + case fixup_Hexagon_GOT_32_6_X: + return ELF::R_HEX_GOT_32_6_X; + case fixup_Hexagon_GOT_16_X: + return ELF::R_HEX_GOT_16_X; + case fixup_Hexagon_GOT_11_X: + return ELF::R_HEX_GOT_11_X; + case fixup_Hexagon_DTPREL_32_6_X: + return ELF::R_HEX_DTPREL_32_6_X; + case fixup_Hexagon_DTPREL_16_X: + return ELF::R_HEX_DTPREL_16_X; + case fixup_Hexagon_DTPREL_11_X: + return ELF::R_HEX_DTPREL_11_X; + case fixup_Hexagon_GD_GOT_32_6_X: + return ELF::R_HEX_GD_GOT_32_6_X; + case fixup_Hexagon_GD_GOT_16_X: + return ELF::R_HEX_GD_GOT_16_X; + case fixup_Hexagon_GD_GOT_11_X: + return ELF::R_HEX_GD_GOT_11_X; + case fixup_Hexagon_LD_GOT_32_6_X: + return ELF::R_HEX_LD_GOT_32_6_X; + case fixup_Hexagon_LD_GOT_16_X: + return ELF::R_HEX_LD_GOT_16_X; + case fixup_Hexagon_LD_GOT_11_X: + return 
ELF::R_HEX_LD_GOT_11_X; + case fixup_Hexagon_IE_32_6_X: + return ELF::R_HEX_IE_32_6_X; + case fixup_Hexagon_IE_16_X: + return ELF::R_HEX_IE_16_X; + case fixup_Hexagon_IE_GOT_32_6_X: + return ELF::R_HEX_IE_GOT_32_6_X; + case fixup_Hexagon_IE_GOT_16_X: + return ELF::R_HEX_IE_GOT_16_X; + case fixup_Hexagon_IE_GOT_11_X: + return ELF::R_HEX_IE_GOT_11_X; + case fixup_Hexagon_TPREL_32_6_X: + return ELF::R_HEX_TPREL_32_6_X; + case fixup_Hexagon_TPREL_16_X: + return ELF::R_HEX_TPREL_16_X; + case fixup_Hexagon_TPREL_11_X: + return ELF::R_HEX_TPREL_11_X; } - return Type; } MCObjectWriter *llvm::createHexagonELFObjectWriter(raw_pwrite_stream &OS, diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 1eee852..6f8cb90 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -342,6 +342,36 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, return LastTargetFixupKind; } +namespace llvm { +extern const MCInstrDesc HexagonInsts[]; +} + +namespace { + bool isPCRel (unsigned Kind) { + switch(Kind){ + case fixup_Hexagon_B22_PCREL: + case fixup_Hexagon_B15_PCREL: + case fixup_Hexagon_B7_PCREL: + case fixup_Hexagon_B13_PCREL: + case fixup_Hexagon_B9_PCREL: + case fixup_Hexagon_B32_PCREL_X: + case fixup_Hexagon_B22_PCREL_X: + case fixup_Hexagon_B15_PCREL_X: + case fixup_Hexagon_B13_PCREL_X: + case fixup_Hexagon_B9_PCREL_X: + case fixup_Hexagon_B7_PCREL_X: + case fixup_Hexagon_32_PCREL: + case fixup_Hexagon_PLT_B22_PCREL: + case fixup_Hexagon_GD_PLT_B22_PCREL: + case fixup_Hexagon_LD_PLT_B22_PCREL: + case fixup_Hexagon_6_PCREL_X: + return true; + default: + return false; + } + } +} // namespace + unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, const MCExpr *ME, @@ -363,7 +393,7 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI, Res = getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI); Res += getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI); - return Res; + return 0; } assert(MK == MCExpr::SymbolRef); @@ -662,8 +692,13 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI, break; } - MCFixup fixup = - MCFixup::create(*Addend, MO.getExpr(), MCFixupKind(FixupKind)); + MCExpr const *FixupExpression = (*Addend > 0 && isPCRel(FixupKind)) ? + MCBinaryExpr::createAdd(MO.getExpr(), + MCConstantExpr::create(*Addend, MCT), MCT) : + MO.getExpr(); + + MCFixup fixup = MCFixup::create(*Addend, FixupExpression, + MCFixupKind(FixupKind), MI.getLoc()); Fixups.push_back(fixup); // All of the information is in the fixup. 
return (0); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index 9aa258c..2a154da 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -44,8 +44,6 @@ public: uint32_t parseBits(size_t Instruction, size_t Last, MCInst const &MCB, MCInst const &MCI) const; - MCSubtargetInfo const &getSubtargetInfo() const; - void encodeInstruction(MCInst const &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, MCSubtargetInfo const &STI) const override; @@ -65,10 +63,6 @@ public: unsigned getMachineOpValue(MCInst const &MI, MCOperand const &MO, SmallVectorImpl<MCFixup> &Fixups, MCSubtargetInfo const &STI) const; - -private: - HexagonMCCodeEmitter(HexagonMCCodeEmitter const &) = delete; - void operator=(HexagonMCCodeEmitter const &) = delete; }; // class HexagonMCCodeEmitter } // namespace llvm diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index 1080935..0d1f1e6 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -40,39 +40,39 @@ enum OpcodeIndex { tp1_jump_t }; -unsigned tstBitOpcode[8] = {J4_tstbit0_fp0_jump_nt, J4_tstbit0_fp0_jump_t, - J4_tstbit0_fp1_jump_nt, J4_tstbit0_fp1_jump_t, - J4_tstbit0_tp0_jump_nt, J4_tstbit0_tp0_jump_t, - J4_tstbit0_tp1_jump_nt, J4_tstbit0_tp1_jump_t}; -unsigned cmpeqBitOpcode[8] = {J4_cmpeq_fp0_jump_nt, J4_cmpeq_fp0_jump_t, - J4_cmpeq_fp1_jump_nt, J4_cmpeq_fp1_jump_t, - J4_cmpeq_tp0_jump_nt, J4_cmpeq_tp0_jump_t, - J4_cmpeq_tp1_jump_nt, J4_cmpeq_tp1_jump_t}; -unsigned cmpgtBitOpcode[8] = {J4_cmpgt_fp0_jump_nt, J4_cmpgt_fp0_jump_t, - J4_cmpgt_fp1_jump_nt, J4_cmpgt_fp1_jump_t, - J4_cmpgt_tp0_jump_nt, J4_cmpgt_tp0_jump_t, - J4_cmpgt_tp1_jump_nt, J4_cmpgt_tp1_jump_t}; -unsigned cmpgtuBitOpcode[8] = {J4_cmpgtu_fp0_jump_nt, J4_cmpgtu_fp0_jump_t, - J4_cmpgtu_fp1_jump_nt, J4_cmpgtu_fp1_jump_t, - J4_cmpgtu_tp0_jump_nt, J4_cmpgtu_tp0_jump_t, - J4_cmpgtu_tp1_jump_nt, J4_cmpgtu_tp1_jump_t}; -unsigned cmpeqiBitOpcode[8] = {J4_cmpeqi_fp0_jump_nt, J4_cmpeqi_fp0_jump_t, - J4_cmpeqi_fp1_jump_nt, J4_cmpeqi_fp1_jump_t, - J4_cmpeqi_tp0_jump_nt, J4_cmpeqi_tp0_jump_t, - J4_cmpeqi_tp1_jump_nt, J4_cmpeqi_tp1_jump_t}; -unsigned cmpgtiBitOpcode[8] = {J4_cmpgti_fp0_jump_nt, J4_cmpgti_fp0_jump_t, - J4_cmpgti_fp1_jump_nt, J4_cmpgti_fp1_jump_t, - J4_cmpgti_tp0_jump_nt, J4_cmpgti_tp0_jump_t, - J4_cmpgti_tp1_jump_nt, J4_cmpgti_tp1_jump_t}; -unsigned cmpgtuiBitOpcode[8] = {J4_cmpgtui_fp0_jump_nt, J4_cmpgtui_fp0_jump_t, - J4_cmpgtui_fp1_jump_nt, J4_cmpgtui_fp1_jump_t, - J4_cmpgtui_tp0_jump_nt, J4_cmpgtui_tp0_jump_t, - J4_cmpgtui_tp1_jump_nt, J4_cmpgtui_tp1_jump_t}; -unsigned cmpeqn1BitOpcode[8] = {J4_cmpeqn1_fp0_jump_nt, J4_cmpeqn1_fp0_jump_t, - J4_cmpeqn1_fp1_jump_nt, J4_cmpeqn1_fp1_jump_t, - J4_cmpeqn1_tp0_jump_nt, J4_cmpeqn1_tp0_jump_t, - J4_cmpeqn1_tp1_jump_nt, J4_cmpeqn1_tp1_jump_t}; -unsigned cmpgtn1BitOpcode[8] = { +static const unsigned tstBitOpcode[8] = { + J4_tstbit0_fp0_jump_nt, J4_tstbit0_fp0_jump_t, J4_tstbit0_fp1_jump_nt, + J4_tstbit0_fp1_jump_t, J4_tstbit0_tp0_jump_nt, J4_tstbit0_tp0_jump_t, + J4_tstbit0_tp1_jump_nt, J4_tstbit0_tp1_jump_t}; +static const unsigned cmpeqBitOpcode[8] = { + J4_cmpeq_fp0_jump_nt, J4_cmpeq_fp0_jump_t, J4_cmpeq_fp1_jump_nt, + J4_cmpeq_fp1_jump_t, 
J4_cmpeq_tp0_jump_nt, J4_cmpeq_tp0_jump_t, + J4_cmpeq_tp1_jump_nt, J4_cmpeq_tp1_jump_t}; +static const unsigned cmpgtBitOpcode[8] = { + J4_cmpgt_fp0_jump_nt, J4_cmpgt_fp0_jump_t, J4_cmpgt_fp1_jump_nt, + J4_cmpgt_fp1_jump_t, J4_cmpgt_tp0_jump_nt, J4_cmpgt_tp0_jump_t, + J4_cmpgt_tp1_jump_nt, J4_cmpgt_tp1_jump_t}; +static const unsigned cmpgtuBitOpcode[8] = { + J4_cmpgtu_fp0_jump_nt, J4_cmpgtu_fp0_jump_t, J4_cmpgtu_fp1_jump_nt, + J4_cmpgtu_fp1_jump_t, J4_cmpgtu_tp0_jump_nt, J4_cmpgtu_tp0_jump_t, + J4_cmpgtu_tp1_jump_nt, J4_cmpgtu_tp1_jump_t}; +static const unsigned cmpeqiBitOpcode[8] = { + J4_cmpeqi_fp0_jump_nt, J4_cmpeqi_fp0_jump_t, J4_cmpeqi_fp1_jump_nt, + J4_cmpeqi_fp1_jump_t, J4_cmpeqi_tp0_jump_nt, J4_cmpeqi_tp0_jump_t, + J4_cmpeqi_tp1_jump_nt, J4_cmpeqi_tp1_jump_t}; +static const unsigned cmpgtiBitOpcode[8] = { + J4_cmpgti_fp0_jump_nt, J4_cmpgti_fp0_jump_t, J4_cmpgti_fp1_jump_nt, + J4_cmpgti_fp1_jump_t, J4_cmpgti_tp0_jump_nt, J4_cmpgti_tp0_jump_t, + J4_cmpgti_tp1_jump_nt, J4_cmpgti_tp1_jump_t}; +static const unsigned cmpgtuiBitOpcode[8] = { + J4_cmpgtui_fp0_jump_nt, J4_cmpgtui_fp0_jump_t, J4_cmpgtui_fp1_jump_nt, + J4_cmpgtui_fp1_jump_t, J4_cmpgtui_tp0_jump_nt, J4_cmpgtui_tp0_jump_t, + J4_cmpgtui_tp1_jump_nt, J4_cmpgtui_tp1_jump_t}; +static const unsigned cmpeqn1BitOpcode[8] = { + J4_cmpeqn1_fp0_jump_nt, J4_cmpeqn1_fp0_jump_t, J4_cmpeqn1_fp1_jump_nt, + J4_cmpeqn1_fp1_jump_t, J4_cmpeqn1_tp0_jump_nt, J4_cmpeqn1_tp0_jump_t, + J4_cmpeqn1_tp1_jump_nt, J4_cmpeqn1_tp1_jump_t}; +static const unsigned cmpgtn1BitOpcode[8] = { J4_cmpgtn1_fp0_jump_nt, J4_cmpgtn1_fp0_jump_t, J4_cmpgtn1_fp1_jump_nt, J4_cmpgtn1_fp1_jump_t, J4_cmpgtn1_tp0_jump_nt, J4_cmpgtn1_tp0_jump_t, J4_cmpgtn1_tp1_jump_nt, J4_cmpgtn1_tp1_jump_t, @@ -174,7 +174,7 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { return HexagonII::HCG_None; } -} +} // namespace /// getCompoundOp - Return the index from 0-7 into the above opcode lists. namespace { @@ -199,7 +199,7 @@ unsigned getCompoundOp(MCInst const &HMCI) { return (PredReg == Hexagon::P0) ? tp0_jump_t : tp1_jump_t; } } -} +} // namespace namespace { MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { @@ -331,7 +331,7 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { return CompoundInsn; } -} +} // namespace /// Non-Symmetrical. See if these two instructions are fit for compound pair. namespace { @@ -348,7 +348,7 @@ bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA, return ((MIaG == HexagonII::HCG_A && MIbG == HexagonII::HCG_B) && (MIa.getOperand(0).getReg() == MIb.getOperand(0).getReg())); } -} +} // namespace namespace { bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { @@ -396,7 +396,7 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { } return false; } -} +} // namespace /// tryCompound - Given a bundle check for compound insns when one /// is found update the contents fo the bundle with the compound insn. 
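The eight-entry opcode tables above are indexed by the value that getCompoundOp derives from the branch being compounded: whether the jump tests the false (fp) or true (tp) sense of the predicate, whether that predicate lives in P0 or P1, and whether the jump carries a not-taken (_nt) or taken (_t) hint. A minimal sketch of that indexing, using hypothetical helper flags since the real decision logic lives in getCompoundOp and getCompoundInsn:

// Sketch only: mirrors the OpcodeIndex layout fp0_jump_nt .. tp1_jump_t.
// IsTrueSense, UsesP1 and IsTakenHint are hypothetical stand-ins for the
// checks getCompoundOp performs on the branch MCInst.
static unsigned pickCompoundOpcode(const unsigned (&Table)[8], bool IsTrueSense,
                                   bool UsesP1, bool IsTakenHint) {
  unsigned Index = (IsTrueSense ? 4u : 0u) | // fp* entries first, then tp*
                   (UsesP1 ? 2u : 0u) |      // predicate register P0 vs. P1
                   (IsTakenHint ? 1u : 0u);  // _nt vs. _t variant
  return Table[Index];
}

// For example, a taken-hinted jump on the true sense of P0 selects
// Table[tp0_jump_t], i.e. pickCompoundOpcode(cmpeqBitOpcode, true, false, true).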
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index eb62977..7e9247c 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -394,8 +394,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src1Reg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm()) && - MCI.getOperand(2).isImm() && MCI.getOperand(2).isImm() && - isUInt<1>(MCI.getOperand(2).getImm())) { + MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) { return HexagonII::HSIG_S2; } break; diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp new file mode 100644 index 0000000..bf51c35 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -0,0 +1,152 @@ +//=== HexagonMCELFStreamer.cpp - Hexagon subclass of MCELFStreamer -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a stub that parses a MCInst bundle and passes the +// instructions on to the real streamer. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "hexagonmcelfstreamer" + +#include "Hexagon.h" +#include "HexagonMCELFStreamer.h" +#include "MCTargetDesc/HexagonBaseInfo.h" +#include "MCTargetDesc/HexagonMCShuffler.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<unsigned> + GPSize("gpsize", cl::NotHidden, + cl::desc("Global Pointer Addressing Size. The default size is 8."), + cl::Prefix, cl::init(8)); + +void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, + const MCSubtargetInfo &STI) { + MCInst HMI; + HMI.setOpcode(Hexagon::BUNDLE); + HMI.addOperand(MCOperand::createImm(0)); + MCInst *MCB; + + if (MCK.getOpcode() != Hexagon::BUNDLE) { + HMI.addOperand(MCOperand::createInst(&MCK)); + MCB = &HMI; + } else + MCB = const_cast<MCInst *>(&MCK); + + // Examines packet and pad the packet, if needed, when an + // end-loop is in the bundle. 
+ HexagonMCInstrInfo::padEndloop(*MCB); + HexagonMCShuffle(*MCII, STI, *MCB); + + assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE); + bool Extended = false; + for (auto &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) { + MCInst *MCI = const_cast<MCInst *>(I.getInst()); + if (Extended) { + if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) { + MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst()); + HexagonMCInstrInfo::clampExtended(*MCII, *SubInst); + } else { + HexagonMCInstrInfo::clampExtended(*MCII, *MCI); + } + Extended = false; + } else { + Extended = HexagonMCInstrInfo::isImmext(*MCI); + } + } + + // At this point, MCB is a bundle + // Iterate through the bundle and assign addends for the instructions + for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) { + MCInst *MCI = const_cast<MCInst *>(I.getInst()); + EmitSymbol(*MCI); + } + MCObjectStreamer::EmitInstruction(*MCB, STI); +} + +void HexagonMCELFStreamer::EmitSymbol(const MCInst &Inst) { + // Scan for values. + for (unsigned i = Inst.getNumOperands(); i--;) + if (Inst.getOperand(i).isExpr()) + visitUsedExpr(*Inst.getOperand(i).getExpr()); +} + +// EmitCommonSymbol and EmitLocalCommonSymbol are extended versions of the +// functions found in MCELFStreamer.cpp taking AccessSize as an additional +// parameter. +void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, + uint64_t Size, + unsigned ByteAlignment, + unsigned AccessSize) { + getAssembler().registerSymbol(*Symbol); + StringRef sbss[4] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"}; + + auto ELFSymbol = cast<MCSymbolELF>(Symbol); + if (!ELFSymbol->isBindingSet()) { + ELFSymbol->setBinding(ELF::STB_GLOBAL); + ELFSymbol->setExternal(true); + } + + ELFSymbol->setType(ELF::STT_OBJECT); + + if (ELFSymbol->getBinding() == ELF::STB_LOCAL) { + StringRef SectionName = + ((AccessSize == 0) || (Size == 0) || (Size > GPSize)) + ? ".bss" + : sbss[(Log2_64(AccessSize))]; + + MCSection *CrntSection = getCurrentSection().first; + MCSection *Section = getAssembler().getContext().getELFSection( + SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + SwitchSection(Section); + AssignSection(Symbol, Section); + + MCELFStreamer::EmitCommonSymbol(Symbol, Size, ByteAlignment); + SwitchSection(CrntSection); + } else { + if (ELFSymbol->declareCommon(Size, ByteAlignment)) + report_fatal_error("Symbol: " + Symbol->getName() + + " redeclared as different type"); + if ((AccessSize) && (Size <= GPSize)) { + uint64_t SectionIndex = + (AccessSize <= GPSize) + ? 
ELF::SHN_HEXAGON_SCOMMON + (Log2_64(AccessSize) + 1) + : (unsigned)ELF::SHN_HEXAGON_SCOMMON; + ELFSymbol->setIndex(SectionIndex); + } + } + + ELFSymbol->setSize(MCConstantExpr::create(Size, getContext())); +} + +void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol( + MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment, + unsigned AccessSize) { + getAssembler().registerSymbol(*Symbol); + auto ELFSymbol = cast<MCSymbolELF>(Symbol); + ELFSymbol->setBinding(ELF::STB_LOCAL); + ELFSymbol->setExternal(false); + HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize); +} + +namespace llvm { +MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, MCCodeEmitter *CE) { + return new HexagonMCELFStreamer(Context, MAB, OS, CE); +} +} diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h new file mode 100644 index 0000000..d77c0cd --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h @@ -0,0 +1,45 @@ +//===- HexagonMCELFStreamer.h - Hexagon subclass of MCElfStreamer ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCELFSTREAMER_H +#define HEXAGONMCELFSTREAMER_H + +#include "MCTargetDesc/HexagonMCCodeEmitter.h" +#include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/MC/MCELFStreamer.h" +#include "HexagonTargetStreamer.h" + +namespace llvm { + +class HexagonMCELFStreamer : public MCELFStreamer { + std::unique_ptr<MCInstrInfo> MCII; + +public: + HexagonMCELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, MCCodeEmitter *Emitter) + : MCELFStreamer(Context, TAB, OS, Emitter), + MCII(createHexagonMCInstrInfo()) {} + + virtual void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override; + void EmitSymbol(const MCInst &Inst); + void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment, + unsigned AccessSize); + void HexagonMCEmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment, unsigned AccessSize); +}; + +MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, MCCodeEmitter *CE); + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 2731278..e69a52d 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -35,6 +35,21 @@ size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { return (1); } +void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) { + assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) || + HexagonMCInstrInfo::isExtended(MCII, MCI)); + MCOperand &exOp = + MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI)); + // If the extended value is a constant, then use it for the extended and + // for the extender instructions, masking off the lower 6 bits and + // including the assumed bits. 
+ if (exOp.isImm()) { + unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI); + int64_t Bits = exOp.getImm(); + exOp.setImm((Bits & 0x3f) << Shift); + } +} + MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, MCInst const &inst1) { @@ -446,4 +461,4 @@ void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) { MCOperand &Operand = MCI.getOperand(0); Operand.setImm(Operand.getImm() | outerLoopMask); } -} +} // namespace llvm diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 09f305f..9f7562a 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -41,14 +41,14 @@ int64_t const outerLoopMask = 1 << outerLoopOffset; size_t const bundleInstructionsOffset = 1; -// Returns the number of instructions in the bundle -size_t bundleSize(MCInst const &MCI); - // Returns a iterator range of instructions in this bundle iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI); -// Return the extender for instruction at Index or nullptr if none -MCInst const *extenderForIndex(MCInst const &MCB, size_t Index); +// Returns the number of instructions in the bundle +size_t bundleSize(MCInst const &MCI); + +// Clamp off upper 26 bits of extendable operand for emission +void clampExtended(MCInstrInfo const &MCII, MCInst &MCI); // Create a duplex instruction given the two subinsts MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, @@ -57,6 +57,9 @@ MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, // Convert this instruction in to a duplex subinst MCInst deriveSubInst(MCInst const &Inst); +// Return the extender for instruction at Index or nullptr if none +MCInst const *extenderForIndex(MCInst const &MCB, size_t Index); + // Return memory access size HexagonII::MemAccessSize getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI); @@ -224,9 +227,9 @@ void setOuterLoop(MCInst &MCI); // Would duplexing this instruction create a requirement to extend bool subInstWouldBeExtended(MCInst const &potentialDuplex); -// Attempt to find and replace compound pairs +// Attempt to find and replace compound pairs void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI); -} -} +} // namespace HexagonMCInstrInfo +} // namespace llvm #endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h index a21cce1..9c0e3f2 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h @@ -60,6 +60,6 @@ bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, unsigned HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCContext &Context, MCInst &, SmallVector<DuplexCandidate, 8>); -} +} // namespace llvm #endif // HEXAGONMCSHUFFLER_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 43734ed..4a4f0c2 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -12,15 +12,20 @@ 
//===----------------------------------------------------------------------===// #include "HexagonMCTargetDesc.h" +#include "Hexagon.h" #include "HexagonMCAsmInfo.h" +#include "HexagonMCELFStreamer.h" #include "MCTargetDesc/HexagonInstPrinter.h" #include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -48,12 +53,92 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) { } static MCSubtargetInfo * -createHexagonMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { +createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); InitHexagonMCSubtargetInfo(X, TT, CPU, FS); return X; } +namespace { +class HexagonTargetAsmStreamer : public HexagonTargetStreamer { +public: + HexagonTargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &, bool, + MCInstPrinter &) + : HexagonTargetStreamer(S) {} + void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS, + const MCInst &Inst, const MCSubtargetInfo &STI) override { + assert(HexagonMCInstrInfo::isBundle(Inst)); + assert(HexagonMCInstrInfo::bundleSize(Inst) <= HEXAGON_PACKET_SIZE); + std::string Buffer; + { + raw_string_ostream TempStream(Buffer); + InstPrinter.printInst(&Inst, TempStream, "", STI); + } + StringRef Contents(Buffer); + auto PacketBundle = Contents.rsplit('\n'); + auto HeadTail = PacketBundle.first.split('\n'); + auto Preamble = "\t{\n\t\t"; + auto Separator = ""; + while(!HeadTail.first.empty()) { + OS << Separator; + StringRef Inst; + auto Duplex = HeadTail.first.split('\v'); + if(!Duplex.second.empty()){ + OS << Duplex.first << "\n"; + Inst = Duplex.second; + } + else { + if(!HeadTail.first.startswith("immext")) + Inst = Duplex.first; + } + OS << Preamble; + OS << Inst; + HeadTail = HeadTail.second.split('\n'); + Preamble = ""; + Separator = "\n\t\t"; + } + if(HexagonMCInstrInfo::bundleSize(Inst) != 0) + OS << "\n\t}" << PacketBundle.second; + } +}; +} // namespace + +namespace { +class HexagonTargetELFStreamer : public HexagonTargetStreamer { +public: + MCELFStreamer &getStreamer() { + return static_cast<MCELFStreamer &>(Streamer); + } + HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI) + : HexagonTargetStreamer(S) { + auto Bits = STI.getFeatureBits(); + unsigned Flags; + if (Bits.to_ullong() & llvm::Hexagon::ArchV5) + Flags = ELF::EF_HEXAGON_MACH_V5; + else + Flags = ELF::EF_HEXAGON_MACH_V4; + getStreamer().getAssembler().setELFHeaderEFlags(Flags); + } + void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment, + unsigned AccessSize) override { + HexagonMCELFStreamer &HexagonELFStreamer = + static_cast<HexagonMCELFStreamer &>(getStreamer()); + HexagonELFStreamer.HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, + AccessSize); + } + void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment, + unsigned AccessSize) override { + HexagonMCELFStreamer &HexagonELFStreamer = + static_cast<HexagonMCELFStreamer &>(getStreamer()); + HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol( + Symbol, Size, ByteAlignment, AccessSize); + } +}; +} // namespace + static MCAsmInfo *createHexagonMCAsmInfo(const 
MCRegisterInfo &MRI, const Triple &TT) { MCAsmInfo *MAI = new HexagonMCAsmInfo(TT); @@ -82,9 +167,26 @@ static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { if (SyntaxVariant == 0) - return(new HexagonInstPrinter(MAI, MII, MRI)); + return (new HexagonInstPrinter(MAI, MII, MRI)); else - return nullptr; + return nullptr; +} + +MCTargetStreamer *createMCAsmTargetStreamer( + MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, + bool IsVerboseAsm) { + return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, *InstPrint); +} + +static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + return createHexagonELFStreamer(Context, MAB, OS, Emitter); +} + +static MCTargetStreamer * +createHexagonObjectTargetStreamer(MCStreamer &S, MCSubtargetInfo const &STI) { + return new HexagonTargetELFStreamer(S, STI); } // Force static initialization. @@ -116,7 +218,17 @@ extern "C" void LLVMInitializeHexagonTargetMC() { TargetRegistry::RegisterMCAsmBackend(TheHexagonTarget, createHexagonAsmBackend); + // Register the obj streamer + TargetRegistry::RegisterELFStreamer(TheHexagonTarget, createMCStreamer); + + // Register the asm streamer + TargetRegistry::RegisterAsmTargetStreamer(TheHexagonTarget, + createMCAsmTargetStreamer); + // Register the MC Inst Printer TargetRegistry::RegisterMCInstPrinter(TheHexagonTarget, createHexagonMCInstPrinter); + + TargetRegistry::RegisterObjectTargetStreamer( + TheHexagonTarget, createHexagonObjectTargetStreamer); } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index 81211cc..89c3eb3 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -27,6 +27,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class Target; +class Triple; class StringRef; class raw_ostream; class raw_pwrite_stream; @@ -42,13 +43,13 @@ MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII, MCContext &MCT); MCAsmBackend *createHexagonAsmBackend(Target const &T, - MCRegisterInfo const &MRI, StringRef TT, - StringRef CPU); + MCRegisterInfo const &MRI, + const Triple &TT, StringRef CPU); MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, StringRef CPU); -} // End llvm namespace +} // namespace llvm // Define symbolic names for Hexagon registers. This defines a mapping from // register name to register number. 
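The registration block above is what exposes the new Hexagon streamers to the rest of the MC layer: the ELF streamer factory, the asm and object target-streamer factories, and the instruction printer are all recorded against TheHexagonTarget and retrieved later through TargetRegistry. A minimal sketch of the consumer side, assuming the triple string is only an example:

// Sketch (not part of the patch): after LLVMInitializeHexagonTargetMC has
// run, generic clients reach the factories registered above via the registry.
#include "llvm/Support/TargetRegistry.h"
#include <string>

static const llvm::Target *lookupHexagonTarget() {
  std::string Error;
  const llvm::Target *T =
      llvm::TargetRegistry::lookupTarget("hexagon-unknown-elf", Error);
  // T->createMCObjectStreamer(...) and T->createAsmTargetStreamer(...) would
  // then dispatch to createMCStreamer and createMCAsmTargetStreamer above.
  return T;
}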
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 9218fd3..53325f6 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -134,6 +134,6 @@ public: void setError(unsigned Err) { Error = Err; }; unsigned getError() const { return (Error); }; }; -} +} // namespace llvm #endif // HEXAGONSHUFFLER_H diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index 70141a9..80565aa 100644 --- a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -40,6 +40,6 @@ namespace llvm { void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index 6bcfb32..be445c5 100644 --- a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -43,8 +43,8 @@ static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) { return X; } -static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { +static MCSubtargetInfo * +createMSP430MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); InitMSP430MCSubtargetInfo(X, TT, CPU, FS); return X; diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.h b/contrib/llvm/lib/Target/MSP430/MSP430.h index 796f252..302012e 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430.h @@ -30,7 +30,7 @@ namespace MSP430CC { COND_INVALID = -1 }; -} +} // namespace MSP430CC namespace llvm { class MSP430TargetMachine; @@ -42,6 +42,6 @@ namespace llvm { FunctionPass *createMSP430BranchSelectionPass(); -} // end namespace llvm; +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp index ffcf222..2bc11c0 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -44,7 +44,7 @@ namespace { } }; char MSP430BSel::ID = 0; -} +} // namespace /// createMSP430BranchSelectionPass - returns an instance of the Branch /// Selection Pass diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h index 48c4dc86..2f20bbd 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h @@ -49,6 +49,6 @@ public: RegScavenger *RS = nullptr) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 5ce5013..a60108d 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -85,7 +85,7 @@ namespace { errs() << " JT" << JT << " Align" << Align << '\n'; } }; -} +} // namespace /// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine /// instructions for SelectionDAG operations. 
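The createMSP430MCSubtargetInfo change just above follows the same pattern applied throughout this import: MC factories that used to take the target triple as a StringRef now take a parsed const Triple &, so the triple is parsed once by the caller instead of being re-parsed from a string inside every factory. A small hedged sketch of the calling-side idiom, with an illustrative helper name:

// Sketch: parse the triple once and pass the Triple object down, rather than
// threading the raw string through every factory signature.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"

static bool isMSP430Triple(llvm::StringRef TripleStr) {
  llvm::Triple TT(TripleStr); // parsed once, then handed to the factories
  return TT.getArch() == llvm::Triple::msp430;
}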
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h index 80d3ae1..b090609 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h @@ -64,7 +64,7 @@ namespace llvm { /// SHL, SRA, SRL - Non-constant shifts. SHL, SRA, SRL }; - } + } // namespace MSP430ISD class MSP430Subtarget; class MSP430TargetLowering : public TargetLowering { diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 27681aa..72b1780 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -262,7 +262,7 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, unsigned MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h index f9b25b6..c6bad1e 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h @@ -38,7 +38,7 @@ namespace MSP430II { Size4Bytes = 3 << SizeShift, Size6Bytes = 4 << SizeShift }; -} +} // namespace MSP430II class MSP430InstrInfo : public MSP430GenInstrInfo { const MSP430RegisterInfo RI; @@ -82,12 +82,11 @@ public: unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h index ebd6397..ebbc6e5 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h @@ -42,6 +42,6 @@ public: MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h index fcc5f5b..3d1a245 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h @@ -49,6 +49,6 @@ public: void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h index 61a6b19..95c9293 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h @@ -26,6 +26,6 @@ public: ~MSP430SelectionDAGInfo(); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp index 3dda3bf..6374f41 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp @@ -31,7 +31,7 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { return *this; } -MSP430Subtarget::MSP430Subtarget(const 
std::string &TT, const std::string &CPU, +MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) : MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h index 30d46d3..958a5d3 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h @@ -41,7 +41,7 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. /// - MSP430Subtarget(const std::string &TT, const std::string &CPU, + MSP430Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM); MSP430Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); @@ -64,6 +64,6 @@ public: return &TSInfo; } }; -} // End llvm namespace +} // namespace llvm #endif // LLVM_TARGET_MSP430_SUBTARGET_H diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp index d6cc4ae..97a4047 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -25,7 +25,7 @@ extern "C" void LLVMInitializeMSP430Target() { RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target); } -MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT, +MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h index 6ccd30d..4f955a8 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h @@ -28,8 +28,8 @@ class MSP430TargetMachine : public LLVMTargetMachine { MSP430Subtarget Subtarget; public: - MSP430TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~MSP430TargetMachine() override; diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 9c054e5..5b8d633 100644 --- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -186,6 +186,10 @@ class MipsAsmParser : public MCTargetAsmParser { bool Is32BitImm, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg, + unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); @@ -197,10 +201,6 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - void expandLoadAddressSym(const MCOperand &DstRegOp, const MCOperand &SymOp, - bool Is32BitSym, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); - void expandMemInst(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd); @@ -208,6 
+208,12 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + + bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + void createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); @@ -879,6 +885,9 @@ public: bool isConstantImm() const { return isImm() && dyn_cast<MCConstantExpr>(getImm()); } + template <unsigned Bits> bool isUImm() const { + return isImm() && isConstantImm() && isUInt<Bits>(getConstantImm()); + } bool isToken() const override { // Note: It's not possible to pretend that other operand kinds are tokens. // The matcher emitter checks tokens first. @@ -1616,6 +1625,16 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) { case Mips::SWM_MM: case Mips::JalOneReg: case Mips::JalTwoReg: + case Mips::BneImm: + case Mips::BeqImm: + case Mips::BLT: + case Mips::BLE: + case Mips::BGE: + case Mips::BGT: + case Mips::BLTU: + case Mips::BLEU: + case Mips::BGEU: + case Mips::BGTU: return true; default: return false; @@ -1642,6 +1661,18 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, case Mips::JalOneReg: case Mips::JalTwoReg: return expandJalWithRegs(Inst, IDLoc, Instructions); + case Mips::BneImm: + case Mips::BeqImm: + return expandBranchImm(Inst, IDLoc, Instructions); + case Mips::BLT: + case Mips::BLE: + case Mips::BGE: + case Mips::BGT: + case Mips::BLTU: + case Mips::BLEU: + case Mips::BGEU: + case Mips::BGTU: + return expandCondBranches(Inst, IDLoc, Instructions); } } @@ -1898,15 +1929,20 @@ MipsAsmParser::expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, const MCOperand &DstRegOp = Inst.getOperand(0); assert(DstRegOp.isReg() && "expected register operand kind"); + const MCOperand &SrcRegOp = Inst.getOperand(1); + assert(SrcRegOp.isReg() && "expected register operand kind"); + const MCOperand &ImmOp = Inst.getOperand(2); assert((ImmOp.isImm() || ImmOp.isExpr()) && "expected immediate operand kind"); if (!ImmOp.isImm()) { - expandLoadAddressSym(DstRegOp, ImmOp, Is32BitImm, IDLoc, Instructions); + if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), + SrcRegOp.getReg(), Is32BitImm, IDLoc, + Instructions)) + return true; + return false; } - const MCOperand &SrcRegOp = Inst.getOperand(1); - assert(SrcRegOp.isReg() && "expected register operand kind"); if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), SrcRegOp.getReg(), Is32BitImm, IDLoc, Instructions)) @@ -1925,7 +1961,11 @@ MipsAsmParser::expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, assert((ImmOp.isImm() || ImmOp.isExpr()) && "expected immediate operand kind"); if (!ImmOp.isImm()) { - expandLoadAddressSym(DstRegOp, ImmOp, Is32BitImm, IDLoc, Instructions); + if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), + Mips::NoRegister, Is32BitImm, IDLoc, + Instructions)) + return true; + return false; } @@ -1936,8 +1976,8 @@ MipsAsmParser::expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, return false; } -void MipsAsmParser::expandLoadAddressSym( - const MCOperand &DstRegOp, const MCOperand &SymOp, bool Is32BitSym, +bool MipsAsmParser::loadAndAddSymbolAddress( + const MCExpr *SymExpr, unsigned DstReg, unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { warnIfNoMacro(IDLoc); @@ -1945,14 +1985,12 @@ void MipsAsmParser::expandLoadAddressSym( Warning(IDLoc, 
"instruction loads the 32-bit address of a 64-bit symbol"); MCInst tmpInst; - unsigned RegNo = DstRegOp.getReg(); - const MCSymbolRefExpr *Symbol = cast<MCSymbolRefExpr>(SymOp.getExpr()); - const MCSymbolRefExpr *HiExpr = - MCSymbolRefExpr::create(Symbol->getSymbol().getName(), - MCSymbolRefExpr::VK_Mips_ABS_HI, getContext()); - const MCSymbolRefExpr *LoExpr = - MCSymbolRefExpr::create(Symbol->getSymbol().getName(), - MCSymbolRefExpr::VK_Mips_ABS_LO, getContext()); + const MCSymbolRefExpr *Symbol = cast<MCSymbolRefExpr>(SymExpr); + const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create( + &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_HI, getContext()); + const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create( + &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_LO, getContext()); + if (!Is32BitSym) { // If it's a 64-bit architecture, expand to: // la d,sym => lui d,highest(sym) @@ -1961,36 +1999,39 @@ void MipsAsmParser::expandLoadAddressSym( // ori d,d,hi16(sym) // dsll d,d,16 // ori d,d,lo16(sym) - const MCSymbolRefExpr *HighestExpr = - MCSymbolRefExpr::create(Symbol->getSymbol().getName(), - MCSymbolRefExpr::VK_Mips_HIGHEST, getContext()); - const MCSymbolRefExpr *HigherExpr = - MCSymbolRefExpr::create(Symbol->getSymbol().getName(), - MCSymbolRefExpr::VK_Mips_HIGHER, getContext()); + const MCSymbolRefExpr *HighestExpr = MCSymbolRefExpr::create( + &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHEST, getContext()); + const MCSymbolRefExpr *HigherExpr = MCSymbolRefExpr::create( + &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHER, getContext()); tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(RegNo)); + tmpInst.addOperand(MCOperand::createReg(DstReg)); tmpInst.addOperand(MCOperand::createExpr(HighestExpr)); Instructions.push_back(tmpInst); - createLShiftOri<0>(MCOperand::createExpr(HigherExpr), RegNo, SMLoc(), + createLShiftOri<0>(MCOperand::createExpr(HigherExpr), DstReg, SMLoc(), Instructions); - createLShiftOri<16>(MCOperand::createExpr(HiExpr), RegNo, SMLoc(), + createLShiftOri<16>(MCOperand::createExpr(HiExpr), DstReg, SMLoc(), Instructions); - createLShiftOri<16>(MCOperand::createExpr(LoExpr), RegNo, SMLoc(), + createLShiftOri<16>(MCOperand::createExpr(LoExpr), DstReg, SMLoc(), Instructions); } else { // Otherwise, expand to: // la d,sym => lui d,hi16(sym) // ori d,d,lo16(sym) tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(RegNo)); + tmpInst.addOperand(MCOperand::createReg(DstReg)); tmpInst.addOperand(MCOperand::createExpr(HiExpr)); Instructions.push_back(tmpInst); - createLShiftOri<0>(MCOperand::createExpr(LoExpr), RegNo, SMLoc(), + createLShiftOri<0>(MCOperand::createExpr(LoExpr), DstReg, SMLoc(), Instructions); } + + if (SrcReg != Mips::NoRegister) + createAddu(DstReg, DstReg, SrcReg, Instructions); + + return false; } bool MipsAsmParser::expandUncondBranchMMPseudo( @@ -2032,10 +2073,62 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( return false; } +bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + const MCOperand &DstRegOp = Inst.getOperand(0); + assert(DstRegOp.isReg() && "expected register operand kind"); + + const MCOperand &ImmOp = Inst.getOperand(1); + assert(ImmOp.isImm() && "expected immediate operand kind"); + + const MCOperand &MemOffsetOp = Inst.getOperand(2); + assert(MemOffsetOp.isImm() && "expected immediate operand kind"); + + unsigned OpCode = 0; + switch(Inst.getOpcode()) { + case Mips::BneImm: + OpCode = Mips::BNE; + break; + case 
Mips::BeqImm: + OpCode = Mips::BEQ; + break; + default: + llvm_unreachable("Unknown immediate branch pseudo-instruction."); + break; + } + + int64_t ImmValue = ImmOp.getImm(); + if (ImmValue == 0) { + MCInst BranchInst; + BranchInst.setOpcode(OpCode); + BranchInst.addOperand(DstRegOp); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MemOffsetOp); + Instructions.push_back(BranchInst); + } else { + warnIfNoMacro(IDLoc); + + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), IDLoc, + Instructions)) + return true; + + MCInst BranchInst; + BranchInst.setOpcode(OpCode); + BranchInst.addOperand(DstRegOp); + BranchInst.addOperand(MCOperand::createReg(ATReg)); + BranchInst.addOperand(MemOffsetOp); + Instructions.push_back(BranchInst); + } + return false; +} + void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd) { - const MCSymbolRefExpr *SR; MCInst TempInst; unsigned ImmOffset, HiOffset, LoOffset; const MCExpr *ExprOffset; @@ -2102,16 +2195,8 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, if (isImmOpnd) TempInst.addOperand(MCOperand::createImm(HiOffset)); else { - if (ExprOffset->getKind() == MCExpr::SymbolRef) { - SR = static_cast<const MCSymbolRefExpr *>(ExprOffset); - const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create( - SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_HI, - getContext()); - TempInst.addOperand(MCOperand::createExpr(HiExpr)); - } else { - const MCExpr *HiExpr = evaluateRelocExpr(ExprOffset, "hi"); - TempInst.addOperand(MCOperand::createExpr(HiExpr)); - } + const MCExpr *HiExpr = evaluateRelocExpr(ExprOffset, "hi"); + TempInst.addOperand(MCOperand::createExpr(HiExpr)); } // Add the instruction to the list. 
Instructions.push_back(TempInst); @@ -2134,15 +2219,8 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, if (isImmOpnd) TempInst.addOperand(MCOperand::createImm(LoOffset)); else { - if (ExprOffset->getKind() == MCExpr::SymbolRef) { - const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create( - SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_LO, - getContext()); - TempInst.addOperand(MCOperand::createExpr(LoExpr)); - } else { - const MCExpr *LoExpr = evaluateRelocExpr(ExprOffset, "lo"); - TempInst.addOperand(MCOperand::createExpr(LoExpr)); - } + const MCExpr *LoExpr = evaluateRelocExpr(ExprOffset, "lo"); + TempInst.addOperand(MCOperand::createExpr(LoExpr)); } Instructions.push_back(TempInst); TempInst.clear(); @@ -2171,6 +2249,206 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, return false; } +bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + unsigned PseudoOpcode = Inst.getOpcode(); + unsigned SrcReg = Inst.getOperand(0).getReg(); + unsigned TrgReg = Inst.getOperand(1).getReg(); + const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr(); + + unsigned ZeroSrcOpcode, ZeroTrgOpcode; + bool ReverseOrderSLT, IsUnsigned, AcceptsEquality; + + switch (PseudoOpcode) { + case Mips::BLT: + case Mips::BLTU: + AcceptsEquality = false; + ReverseOrderSLT = false; + IsUnsigned = (PseudoOpcode == Mips::BLTU); + ZeroSrcOpcode = Mips::BGTZ; + ZeroTrgOpcode = Mips::BLTZ; + break; + case Mips::BLE: + case Mips::BLEU: + AcceptsEquality = true; + ReverseOrderSLT = true; + IsUnsigned = (PseudoOpcode == Mips::BLEU); + ZeroSrcOpcode = Mips::BGEZ; + ZeroTrgOpcode = Mips::BLEZ; + break; + case Mips::BGE: + case Mips::BGEU: + AcceptsEquality = true; + ReverseOrderSLT = false; + IsUnsigned = (PseudoOpcode == Mips::BGEU); + ZeroSrcOpcode = Mips::BLEZ; + ZeroTrgOpcode = Mips::BGEZ; + break; + case Mips::BGT: + case Mips::BGTU: + AcceptsEquality = false; + ReverseOrderSLT = true; + IsUnsigned = (PseudoOpcode == Mips::BGTU); + ZeroSrcOpcode = Mips::BLTZ; + ZeroTrgOpcode = Mips::BGTZ; + break; + default: + llvm_unreachable("unknown opcode for branch pseudo-instruction"); + } + + MCInst BranchInst; + bool IsTrgRegZero = (TrgReg == Mips::ZERO); + bool IsSrcRegZero = (SrcReg == Mips::ZERO); + if (IsSrcRegZero && IsTrgRegZero) { + // FIXME: All of these Opcode-specific if's are needed for compatibility + // with GAS' behaviour. However, they may not generate the most efficient + // code in some circumstances. 
+ if (PseudoOpcode == Mips::BLT) { + BranchInst.setOpcode(Mips::BLTZ); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + return false; + } + if (PseudoOpcode == Mips::BLE) { + BranchInst.setOpcode(Mips::BLEZ); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + Warning(IDLoc, "branch is always taken"); + return false; + } + if (PseudoOpcode == Mips::BGE) { + BranchInst.setOpcode(Mips::BGEZ); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + Warning(IDLoc, "branch is always taken"); + return false; + } + if (PseudoOpcode == Mips::BGT) { + BranchInst.setOpcode(Mips::BGTZ); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + return false; + } + if (PseudoOpcode == Mips::BGTU) { + BranchInst.setOpcode(Mips::BNE); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + return false; + } + if (AcceptsEquality) { + // If both registers are $0 and the pseudo-branch accepts equality, it + // will always be taken, so we emit an unconditional branch. + BranchInst.setOpcode(Mips::BEQ); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + Warning(IDLoc, "branch is always taken"); + return false; + } + // If both registers are $0 and the pseudo-branch does not accept + // equality, it will never be taken, so we don't have to emit anything. + return false; + } + if (IsSrcRegZero || IsTrgRegZero) { + if ((IsSrcRegZero && PseudoOpcode == Mips::BGTU) || + (IsTrgRegZero && PseudoOpcode == Mips::BLTU)) { + // If the $rs is $0 and the pseudo-branch is BGTU (0 > x) or + // if the $rt is $0 and the pseudo-branch is BLTU (x < 0), + // the pseudo-branch will never be taken, so we don't emit anything. + // This only applies to unsigned pseudo-branches. + return false; + } + if ((IsSrcRegZero && PseudoOpcode == Mips::BLEU) || + (IsTrgRegZero && PseudoOpcode == Mips::BGEU)) { + // If the $rs is $0 and the pseudo-branch is BLEU (0 <= x) or + // if the $rt is $0 and the pseudo-branch is BGEU (x >= 0), + // the pseudo-branch will always be taken, so we emit an unconditional + // branch. + // This only applies to unsigned pseudo-branches. + BranchInst.setOpcode(Mips::BEQ); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + Warning(IDLoc, "branch is always taken"); + return false; + } + if (IsUnsigned) { + // If the $rs is $0 and the pseudo-branch is BLTU (0 < x) or + // if the $rt is $0 and the pseudo-branch is BGTU (x > 0), + // the pseudo-branch will be taken only when the non-zero register is + // different from 0, so we emit a BNEZ. 
+ // + // If the $rs is $0 and the pseudo-branch is BGEU (0 >= x) or + // if the $rt is $0 and the pseudo-branch is BLEU (x <= 0), + // the pseudo-branch will be taken only when the non-zero register is + // equal to 0, so we emit a BEQZ. + // + // Because only BLEU and BGEU branch on equality, we can use the + // AcceptsEquality variable to decide when to emit the BEQZ. + BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); + BranchInst.addOperand( + MCOperand::createReg(IsSrcRegZero ? TrgReg : SrcReg)); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + return false; + } + // If we have a signed pseudo-branch and one of the registers is $0, + // we can use an appropriate compare-to-zero branch. We select which one + // to use in the switch statement above. + BranchInst.setOpcode(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode); + BranchInst.addOperand(MCOperand::createReg(IsSrcRegZero ? TrgReg : SrcReg)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + return false; + } + + // If neither the SrcReg nor the TrgReg are $0, we need AT to perform the + // expansions. If it is not available, we return. + unsigned ATRegNum = getATReg(IDLoc); + if (!ATRegNum) + return true; + + warnIfNoMacro(IDLoc); + + // SLT fits well with 2 of our 4 pseudo-branches: + // BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and + // BGT, where $rs > $rt, translates into "slt $at, $rt, $rs". + // If the result of the SLT is 1, we branch, and if it's 0, we don't. + // This is accomplished by using a BNEZ with the result of the SLT. + // + // The other 2 pseudo-branches are opposites of the above 2 (BGE with BLT + // and BLE with BGT), so we change the BNEZ into a a BEQZ. + // Because only BGE and BLE branch on equality, we can use the + // AcceptsEquality variable to decide when to emit the BEQZ. + // Note that the order of the SLT arguments doesn't change between + // opposites. + // + // The same applies to the unsigned variants, except that SLTu is used + // instead of SLT. + MCInst SetInst; + SetInst.setOpcode(IsUnsigned ? Mips::SLTu : Mips::SLT); + SetInst.addOperand(MCOperand::createReg(ATRegNum)); + SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? TrgReg : SrcReg)); + SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? SrcReg : TrgReg)); + Instructions.push_back(SetInst); + + BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); + BranchInst.addOperand(MCOperand::createReg(ATRegNum)); + BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); + BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); + Instructions.push_back(BranchInst); + return false; +} + void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { MCInst NopInst; @@ -2572,7 +2850,7 @@ const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr, if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(Expr)) { // It's a symbol, create a symbolic expression from the symbol. 
- StringRef Symbol = MSRE->getSymbol().getName(); + const MCSymbol *Symbol = &MSRE->getSymbol(); MCSymbolRefExpr::VariantKind VK = getVariantKind(RelocStr); Res = MCSymbolRefExpr::create(Symbol, VK, getContext()); return Res; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp index 70b9cca..725ea7f 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp @@ -66,4 +66,4 @@ MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) { OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2 return OS; } -} +} // namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h index b078cd3..bf306ee 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h @@ -186,6 +186,6 @@ public: }; MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection); -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index bf8f7d1..8e6c9e6 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -47,7 +47,7 @@ unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const { llvm_unreachable("Unhandled ABI"); } -MipsABIInfo MipsABIInfo::computeTargetABI(Triple TT, StringRef CPU, +MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { if (Options.getABIName().startswith("o32")) return MipsABIInfo::O32(); diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h index d20dc90..aa965e82a 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h @@ -36,7 +36,7 @@ public: static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); } static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); } static MipsABIInfo EABI() { return MipsABIInfo(ABI::EABI); } - static MipsABIInfo computeTargetABI(Triple TT, StringRef CPU, + static MipsABIInfo computeTargetABI(const Triple &TT, StringRef CPU, const MCTargetOptions &Options); bool IsKnown() const { return ThisABI != ABI::Unknown; } @@ -73,6 +73,6 @@ public: unsigned GetEhDataReg(unsigned I) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index d823ffc..5c746b2 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -417,32 +417,27 @@ void MipsAsmBackend::processFixupValue(const MCAssembler &Asm, // MCAsmBackend MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, - StringRef CPU) { - return new MipsAsmBackend(T, Triple(TT).getOS(), - /*IsLittle*/true, /*Is64Bit*/false); + const Triple &TT, StringRef CPU) { + return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ true, + /*Is64Bit*/ false); } MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, - StringRef CPU) { - 
return new MipsAsmBackend(T, Triple(TT).getOS(), - /*IsLittle*/false, /*Is64Bit*/false); + const Triple &TT, StringRef CPU) { + return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ false, + /*Is64Bit*/ false); } MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, - StringRef CPU) { - return new MipsAsmBackend(T, Triple(TT).getOS(), - /*IsLittle*/true, /*Is64Bit*/true); + const Triple &TT, StringRef CPU) { + return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ true, /*Is64Bit*/ true); } MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, - StringRef CPU) { - return new MipsAsmBackend(T, Triple(TT).getOS(), - /*IsLittle*/false, /*Is64Bit*/true); + const Triple &TT, StringRef CPU) { + return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ false, + /*Is64Bit*/ true); } diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index b3d5a49..fe84e40 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -87,6 +87,6 @@ public: }; // class MipsAsmBackend -} // namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index ff7779e..a7d5a1e 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -119,7 +119,7 @@ namespace MipsII { FormMask = 15 }; -} -} +} // namespace MipsII +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 982a7f5..a45e2ad 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -51,7 +51,7 @@ struct MipsRelocationEntry { virtual void sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) override; }; -} +} // namespace MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64, bool IsLittleEndian) @@ -64,13 +64,47 @@ MipsELFObjectWriter::~MipsELFObjectWriter() {} unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - // determine the type of the relocation + // Determine the type of the relocation. unsigned Kind = (unsigned)Fixup.getKind(); switch (Kind) { + case Mips::fixup_Mips_16: + case FK_Data_2: + return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16; case Mips::fixup_Mips_32: case FK_Data_4: return IsPCRel ? 
ELF::R_MIPS_PC32 : ELF::R_MIPS_32; + } + + if (IsPCRel) { + switch (Kind) { + case Mips::fixup_Mips_Branch_PCRel: + case Mips::fixup_Mips_PC16: + return ELF::R_MIPS_PC16; + case Mips::fixup_MICROMIPS_PC7_S1: + return ELF::R_MICROMIPS_PC7_S1; + case Mips::fixup_MICROMIPS_PC10_S1: + return ELF::R_MICROMIPS_PC10_S1; + case Mips::fixup_MICROMIPS_PC16_S1: + return ELF::R_MICROMIPS_PC16_S1; + case Mips::fixup_MIPS_PC19_S2: + return ELF::R_MIPS_PC19_S2; + case Mips::fixup_MIPS_PC18_S3: + return ELF::R_MIPS_PC18_S3; + case Mips::fixup_MIPS_PC21_S2: + return ELF::R_MIPS_PC21_S2; + case Mips::fixup_MIPS_PC26_S2: + return ELF::R_MIPS_PC26_S2; + case Mips::fixup_MIPS_PCHI16: + return ELF::R_MIPS_PCHI16; + case Mips::fixup_MIPS_PCLO16: + return ELF::R_MIPS_PCLO16; + } + + llvm_unreachable("invalid PC-relative fixup kind!"); + } + + switch (Kind) { case Mips::fixup_Mips_64: case FK_Data_8: return ELF::R_MIPS_64; @@ -110,9 +144,6 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_MIPS_TLS_DTPREL_HI16; case Mips::fixup_Mips_DTPREL_LO: return ELF::R_MIPS_TLS_DTPREL_LO16; - case Mips::fixup_Mips_Branch_PCRel: - case Mips::fixup_Mips_PC16: - return ELF::R_MIPS_PC16; case Mips::fixup_Mips_GOT_PAGE: return ELF::R_MIPS_GOT_PAGE; case Mips::fixup_Mips_GOT_OFST: @@ -153,12 +184,6 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_MICROMIPS_LO16; case Mips::fixup_MICROMIPS_GOT16: return ELF::R_MICROMIPS_GOT16; - case Mips::fixup_MICROMIPS_PC7_S1: - return ELF::R_MICROMIPS_PC7_S1; - case Mips::fixup_MICROMIPS_PC10_S1: - return ELF::R_MICROMIPS_PC10_S1; - case Mips::fixup_MICROMIPS_PC16_S1: - return ELF::R_MICROMIPS_PC16_S1; case Mips::fixup_MICROMIPS_CALL16: return ELF::R_MICROMIPS_CALL16; case Mips::fixup_MICROMIPS_GOT_DISP: @@ -179,19 +204,8 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_MICROMIPS_TLS_TPREL_HI16; case Mips::fixup_MICROMIPS_TLS_TPREL_LO16: return ELF::R_MICROMIPS_TLS_TPREL_LO16; - case Mips::fixup_MIPS_PC19_S2: - return ELF::R_MIPS_PC19_S2; - case Mips::fixup_MIPS_PC18_S3: - return ELF::R_MIPS_PC18_S3; - case Mips::fixup_MIPS_PC21_S2: - return ELF::R_MIPS_PC21_S2; - case Mips::fixup_MIPS_PC26_S2: - return ELF::R_MIPS_PC26_S2; - case Mips::fixup_MIPS_PCHI16: - return ELF::R_MIPS_PCHI16; - case Mips::fixup_MIPS_PCLO16: - return ELF::R_MIPS_PCLO16; } + llvm_unreachable("invalid fixup kind!"); } diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h index 687b800..81a0a98 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h @@ -25,6 +25,6 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg); MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS, MCCodeEmitter *Emitter, bool RelaxAll); -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 54d8863..9bdf8235 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -43,11 +43,9 @@ using namespace llvm; /// Select the Mips CPU for the given triple and cpu name. 
/// FIXME: Merge with the copy in MipsSubtarget.cpp -StringRef MIPS_MC::selectMipsCPU(StringRef TT, StringRef CPU) { +StringRef MIPS_MC::selectMipsCPU(const Triple &TT, StringRef CPU) { if (CPU.empty() || CPU == "generic") { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::mips || - TheTriple.getArch() == Triple::mipsel) + if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel) CPU = "mips32"; else CPU = "mips64"; @@ -67,8 +65,8 @@ static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) { return X; } -static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { +static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { CPU = MIPS_MC::selectMipsCPU(TT, CPU); MCSubtargetInfo *X = new MCSubtargetInfo(); InitMipsMCSubtargetInfo(X, TT, CPU, FS); diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index 577a8b3..20358a0 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -26,6 +26,7 @@ class MCRegisterInfo; class MCSubtargetInfo; class StringRef; class Target; +class Triple; class raw_ostream; class raw_pwrite_stream; @@ -42,26 +43,26 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createMipsAsmBackendEB32(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU); + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); MCAsmBackend *createMipsAsmBackendEL32(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU); + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); MCAsmBackend *createMipsAsmBackendEB64(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU); + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); MCAsmBackend *createMipsAsmBackendEL64(const Target &T, - const MCRegisterInfo &MRI, StringRef TT, - StringRef CPU); + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); MCObjectWriter *createMipsELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, bool IsLittleEndian, bool Is64Bit); namespace MIPS_MC { -StringRef selectMipsCPU(StringRef TT, StringRef CPU); +StringRef selectMipsCPU(const Triple &TT, StringRef CPU); } -} // End llvm namespace +} // namespace llvm // Defines symbolic names for Mips registers. This defines a mapping from // register name to register number. 
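To make the new selectMipsCPU contract concrete, a short usage sketch (the triples are illustrative; only the defaulting rule shown in the hunk above is exercised):

  // An empty or "generic" CPU name falls back to an architecture default.
  MIPS_MC::selectMipsCPU(Triple("mips-unknown-linux-gnu"), "generic");  // -> "mips32"
  MIPS_MC::selectMipsCPU(Triple("mips64el-unknown-linux-gnu"), "");     // -> "mips64"
  // An explicitly requested CPU is passed through unchanged.
  MIPS_MC::selectMipsCPU(Triple("mips-unknown-linux-gnu"), "mips32r2"); // -> "mips32r2"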
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index aef9bd3..5378675 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -265,4 +265,4 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB, return S; } -} +} // namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td index 7350b97..78ba76d 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -221,3 +221,22 @@ class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst { let Inst{20-16} = rt; let Inst{15-0} = offset; } + +class ERET_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> { + bits<32> Inst; + + let Inst{31-26} = 0x00; + let Inst{25-16} = 0x00; + let Inst{15-6} = 0x3cd; + let Inst{5-0} = 0x3c; +} + +class ERETNC_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> { + bits<32> Inst; + + let Inst{31-26} = 0x00; + let Inst{25-17} = 0x00; + let Inst{16-16} = 0x01; + let Inst{15-6} = 0x3cd; + let Inst{5-0} = 0x3c; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td index 2259d5d..ed71c3d 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -40,6 +40,8 @@ class CLO_MMR6_ENC : POOL32A_2R_FM_MMR6<0b0100101100>; class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>; class DIV_MMR6_ENC : ARITH_FM_MMR6<"div", 0x118>; class DIVU_MMR6_ENC : ARITH_FM_MMR6<"divu", 0x198>; +class ERET_MMR6_ENC : ERET_FM_MMR6<"eret">; +class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">; class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>; class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>; class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>; @@ -164,6 +166,9 @@ class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> class CLO_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clo", GPR32Opnd>; class CLZ_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clz", GPR32Opnd>; +class ERET_MMR6_DESC : ER_FT<"eret">; +class ERETNC_MMR6_DESC : ER_FT<"eretnc">; + class JMP_MMR6_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd, RegisterOperand GPROpnd> : MMR6Arch<opstr> { @@ -302,6 +307,9 @@ def CLO_MMR6 : R6MMR6Rel, CLO_MMR6_ENC, CLO_MMR6_DESC, ISA_MICROMIPS32R6; def CLZ_MMR6 : R6MMR6Rel, CLZ_MMR6_ENC, CLZ_MMR6_DESC, ISA_MICROMIPS32R6; def DIV_MMR6 : R6MMR6Rel, DIV_MMR6_DESC, DIV_MMR6_ENC, ISA_MICROMIPS32R6; def DIVU_MMR6 : R6MMR6Rel, DIVU_MMR6_DESC, DIVU_MMR6_ENC, ISA_MICROMIPS32R6; +def ERET_MMR6 : R6MMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6; +def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC, + ISA_MICROMIPS32R6; def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, JIALC_MMR6_DESC, ISA_MICROMIPS32R6; def JIC_MMR6 : R6MMR6Rel, JIC_MMR6_ENC, JIC_MMR6_DESC, ISA_MICROMIPS32R6; def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6; diff --git a/contrib/llvm/lib/Target/Mips/Mips.h b/contrib/llvm/lib/Target/Mips/Mips.h index 671d7a8..604b670 100644 --- a/contrib/llvm/lib/Target/Mips/Mips.h +++ b/contrib/llvm/lib/Target/Mips/Mips.h @@ -31,6 +31,6 @@ namespace llvm { FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM); FunctionPass 
*createMipsConstantIslandPass(MipsTargetMachine &tm); -} // end namespace llvm; +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h index f281c92..2c33cfb 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h +++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h @@ -42,6 +42,6 @@ public: RegScavenger *RS) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp index 893fc7c..f2831fd 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp @@ -62,7 +62,7 @@ namespace { }; char Mips16HardFloat::ID = 0; -} +} // namespace // // Return types that matter for hard float are: diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp index 2eb6e5d..bf82108 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp @@ -46,5 +46,5 @@ extern FuncSignature const *findFuncSignature(const char *name) { } return nullptr; } -} -} +} // namespace Mips16HardFloatInfo +} // namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h index 7295c28..8354c33 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h +++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h @@ -44,7 +44,7 @@ struct FuncNameSignature { extern const FuncNameSignature PredefinedFuncs[]; extern FuncSignature const *findFuncSignature(const char *name); -} -} +} // namespace Mips16HardFloatInfo +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h index ae0e61e..ce6b3f8 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h @@ -48,6 +48,6 @@ private: FunctionPass *createMips16ISelDag(MipsTargetMachine &TM); -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp index 846e3c9..c52ef2a 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp @@ -54,7 +54,7 @@ struct Mips16IntrinsicHelperType{ return std::strcmp(Name, RHS.Name) == 0; } }; -} +} // namespace // Libcalls for which no helper is generated. Sorted by name for binary search. 
static const Mips16Libcall HardFloatLibCalls[] = { diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h index d3b9f75..99d3cac 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h @@ -77,6 +77,6 @@ namespace llvm { unsigned SltiOpc, unsigned SltiXOpc, MachineInstr *MI, MachineBasicBlock *BB )const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h index 6540b40..1132d8a 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h @@ -123,6 +123,6 @@ private: }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td index 8a27874..83781ff 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -27,8 +27,6 @@ def uimm16_64 : Operand<i64> { // Signed Operand def simm10_64 : Operand<i64>; -def imm64: Operand<i64>; - // Transformation Function - get Imm - 32. def Subtract32 : SDNodeXForm<imm, [{ return getImm(N, (unsigned)N->getZExtValue() - 32); diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h index ae3c38c..6b5d02b 100644 --- a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h +++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h @@ -58,6 +58,6 @@ namespace llvm { unsigned ADDiu, ORi, SLL, LUi; InstSeq Insts; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index f84666b..1c80021 100644 --- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -694,9 +694,8 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // clean anyhow. // FIXME: For ifunc related functions we could iterate over and look // for a feature string that doesn't match the default one. - StringRef TT = TM.getTargetTriple(); - StringRef CPU = - MIPS_MC::selectMipsCPU(TM.getTargetTriple(), TM.getTargetCPU()); + const Triple &TT = TM.getTargetTriple(); + StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); StringRef FS = TM.getTargetFeatureString(); const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM); const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM); @@ -900,7 +899,8 @@ void MipsAsmPrinter::EmitFPCallStub( // freed) and since we're at the global level we can use the default // constructed subtarget. 
std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( - TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString())); + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); // // .global xxxx diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h index a7f3304..3c2b843 100644 --- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h +++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h @@ -145,7 +145,7 @@ public: void EmitEndOfAsmFile(Module &M) override; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.h b/contrib/llvm/lib/Target/Mips/MipsCCState.h index 081c393..04a9ef5 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCCState.h +++ b/contrib/llvm/lib/Target/Mips/MipsCCState.h @@ -131,6 +131,6 @@ public: bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; } SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h index 5eabd58..dab9c05 100644 --- a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h @@ -49,6 +49,6 @@ protected: const MipsFrameLowering *createMips16FrameLowering(const MipsSubtarget &ST); const MipsFrameLowering *createMipsSEFrameLowering(const MipsSubtarget &ST); -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h index 1426d0f..83be74f 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h @@ -129,6 +129,6 @@ private: unsigned ConstraintID, std::vector<SDValue> &OutOps) override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h index bc9a1ce..e4f3cde 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h @@ -204,7 +204,7 @@ namespace llvm { SDL, SDR }; - } + } // namespace MipsISD //===--------------------------------------------------------------------===// // TargetLowering Implementation @@ -566,6 +566,6 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp index 0839147..bb23cc0 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -96,8 +96,7 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - DebugLoc DL, - const SmallVectorImpl<MachineOperand> &Cond) const { + DebugLoc DL, ArrayRef<MachineOperand> Cond) const { unsigned Opc = Cond[0].getImm(); const MCInstrDesc &MCID = get(Opc); MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID); @@ -115,7 +114,7 @@ MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, unsigned MipsInstrInfo::InsertBranch( MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const { + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // 
Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h index 4589535..3daff5f 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -59,8 +59,7 @@ public: unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; bool @@ -140,13 +139,13 @@ private: SmallVectorImpl<MachineOperand> &Cond) const; void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, - const SmallVectorImpl<MachineOperand>& Cond) const; + ArrayRef<MachineOperand> Cond) const; }; /// Create MipsInstrInfo objects. const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI); const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI); -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td index 58791cf..2a7949e 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -358,6 +358,8 @@ def calltarget : Operand<iPTR> { let ParserMatchClass = MipsJumpTargetAsmOperand; } +def imm64: Operand<i64>; + def simm9 : Operand<i32>; def simm10 : Operand<i32>; def simm11 : Operand<i32>; @@ -384,7 +386,15 @@ def simm20 : Operand<i32> { def uimm20 : Operand<i32> { } +def MipsUImm10AsmOperand : AsmOperandClass { + let Name = "UImm10"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseImm"; + let PredicateMethod = "isUImm<10>"; +} + def uimm10 : Operand<i32> { + let ParserMatchClass = MipsUImm10AsmOperand; } def simm16_64 : Operand<i64> { @@ -1273,7 +1283,9 @@ def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>; def TRAP : TrapBase<BREAK>; def SDBBP : MMRel, SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6; +let AdditionalPredicates = [NotInMicroMips] in { def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32; +} def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32; def EI : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; @@ -1672,6 +1684,29 @@ def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs), def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs), "jal\t$rs"> ; +let hasDelaySlot = 1 in { +def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), + (ins imm64:$imm64, brtarget:$offset), + "bne\t$rt, $imm64, $offset">; +def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), + (ins imm64:$imm64, brtarget:$offset), + "beq\t$rt, $imm64, $offset">; + +class CondBranchPseudo<string instr_asm> : + MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, + brtarget:$offset), + !strconcat(instr_asm, "\t$rs, $rt, $offset")>; +} + +def BLT : CondBranchPseudo<"blt">; +def BLE : CondBranchPseudo<"ble">; +def BGE : CondBranchPseudo<"bge">; +def BGT : CondBranchPseudo<"bgt">; +def BLTU : CondBranchPseudo<"bltu">; +def BLEU : CondBranchPseudo<"bleu">; +def BGEU : CondBranchPseudo<"bgeu">; +def BGTU : CondBranchPseudo<"bgtu">; + //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions //===----------------------------------------------------------------------===// diff --git 
a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h index 1ce27e4..a8bd1cd 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h +++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h @@ -45,6 +45,6 @@ private: MCSymbolRefExpr::VariantKind Kind) const; bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp index b18a673..8568137 100644 --- a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp @@ -37,7 +37,7 @@ namespace { }; char MipsModuleDAGToDAGISel::ID = 0; -} +} // namespace bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n"); diff --git a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp index b6cd791..5c71272 100644 --- a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp @@ -43,7 +43,7 @@ namespace { }; char MipsOs16::ID = 0; -} +} // namespace // Figure out if we need float point based on the function signature. // We need to move variables in and/or out of floating point diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index ec7bf31..a858f30 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -75,7 +75,7 @@ private: const MipsSEInstrInfo &TII; const MipsRegisterInfo &RegInfo; }; -} +} // namespace ExpandPseudo::ExpandPseudo(MachineFunction &MF_) : MF(MF_), MRI(MF.getRegInfo()), diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h index 2fcd6bb..ee56b8b 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h @@ -39,6 +39,6 @@ public: unsigned ehDataReg(unsigned I) const; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h index a894034..fb2f041 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -126,6 +126,6 @@ private: FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM); -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h index d44f8d8..623630a 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h @@ -112,6 +112,6 @@ namespace llvm { MachineBasicBlock *emitFEXP2_D_1(MachineInstr *MI, MachineBasicBlock *BB) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h index bebbabf..cdafe9f 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -113,6 +113,6 @@ private: MachineBasicBlock::iterator I) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h index 061423f..feddf98 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h +++ 
b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h @@ -26,6 +26,6 @@ public: ~MipsSelectionDAGInfo(); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp index 7ea10eb..c41bb16 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp @@ -59,7 +59,7 @@ static cl::opt<bool> void MipsSubtarget::anchor() { } -MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, +MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, bool little, const MipsTargetMachine &TM) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault), @@ -126,7 +126,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, } /// This overrides the PostRAScheduler bit in the SchedModel for any CPU. -bool MipsSubtarget::enablePostMachineScheduler() const { return true; } +bool MipsSubtarget::enablePostRAScheduler() const { return true; } void MipsSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { CriticalPathRCs.clear(); diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h index 0bfafc8..c8a2e4b 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h @@ -147,7 +147,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo { public: /// This overrides the PostRAScheduler bit in the SchedModel for each CPU. - bool enablePostMachineScheduler() const override; + bool enablePostRAScheduler() const override; void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override; @@ -161,9 +161,8 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. - MipsSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool little, - const MipsTargetMachine &TM); + MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + bool little, const MipsTargetMachine &TM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. @@ -293,6 +292,6 @@ public: return &InstrItins; } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp index b279184..c820668 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -44,12 +44,11 @@ extern "C" void LLVMInitializeMipsTarget() { RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget); } -static std::string computeDataLayout(StringRef TT, StringRef CPU, +static std::string computeDataLayout(const Triple &TT, StringRef CPU, const TargetOptions &Options, bool isLittle) { std::string Ret = ""; - MipsABIInfo ABI = - MipsABIInfo::computeTargetABI(Triple(TT), CPU, Options.MCOptions); + MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions); // There are both little and big endian mips. if (isLittle) @@ -83,7 +82,7 @@ static std::string computeDataLayout(StringRef TT, StringRef CPU, // offset from the stack/frame pointer, using StackGrowsUp enables // an easier handling. // Using CodeModel::Large enables different CALL behavior. 
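For reference, on a little-endian O32 configuration the string assembled by computeDataLayout above should come out as follows (an illustrative reconstruction from the rules in the function, not text from the patch):

  e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64

that is: little-endian ("e"), Mips mangling ("m:m"), 32-bit pointers, i8/i16 with 32-bit preferred alignment, 64-bit-aligned i64, 32-bit native integers ("n32"), and a 64-bit-aligned stack ("S64").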
-MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT, +MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -91,7 +90,7 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT, : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, RM, CM, OL), isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()), - ABI(MipsABIInfo::computeTargetABI(Triple(TT), CPU, Options.MCOptions)), + ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)), Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this), NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16", isLittle, *this), @@ -105,21 +104,21 @@ MipsTargetMachine::~MipsTargetMachine() {} void MipsebTargetMachine::anchor() { } -MipsebTargetMachine:: -MipsebTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} +MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} void MipselTargetMachine::anchor() { } -MipselTargetMachine:: -MipselTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} +MipselTargetMachine::MipselTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} const MipsSubtarget * MipsTargetMachine::getSubtargetImpl(const Function &F) const { @@ -157,7 +156,8 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle, *this); + I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle, + *this); } return I.get(); } diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h index 5427d6a..976970c 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h +++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h @@ -39,8 +39,8 @@ class MipsTargetMachine : public LLVMTargetMachine { mutable StringMap<std::unique_ptr<MipsSubtarget>> SubtargetMap; public: - MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, + MipsTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); ~MipsTargetMachine() override; @@ -73,8 +73,8 @@ public: class MipsebTargetMachine : public MipsTargetMachine { virtual void anchor(); public: - MipsebTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + MipsebTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -84,12 +84,12 @@ public: class MipselTargetMachine : public MipsTargetMachine { virtual void anchor(); public: - MipselTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + MipselTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h index fed0600..39cadc1 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h +++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h @@ -248,5 +248,5 @@ public: void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override; void emitMipsAbiFlags(); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h index 02c5a21..8144f3f 100644 --- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h @@ -49,6 +49,6 @@ public: raw_ostream &O, const char *Modifier = nullptr); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h index a72ae2e..b55664e 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -94,7 +94,7 @@ enum { IsSurfTexQueryFlag = 0x800, IsTexModeUnifiedFlag = 0x1000 }; -} -} +} // namespace NVPTXII +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index d500105..8a28b089 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -45,7 +45,7 @@ static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) { } static MCSubtargetInfo * -createNVPTXMCSubtargetInfo(StringRef 
TT, StringRef CPU, StringRef FS) { +createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); InitNVPTXMCSubtargetInfo(X, TT, CPU, FS); return X; diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h index a2d670f..1480b61 100644 --- a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h +++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h @@ -43,6 +43,6 @@ public: } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h index 477b0ba..d06d61f 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTX.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h @@ -70,6 +70,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM); +BasicBlockPass *createNVPTXLowerAllocaPass(); bool isImageOrSamplerVal(const Value *, const Module *); @@ -132,7 +133,7 @@ enum VecType { V2 = 2, V4 = 4 }; -} +} // namespace PTXLdStInstCode /// PTXCvtMode - Conversion code enumeration namespace PTXCvtMode { @@ -151,7 +152,7 @@ enum CvtMode { FTZ_FLAG = 0x10, SAT_FLAG = 0x20 }; -} +} // namespace PTXCvtMode /// PTXCmpMode - Comparison mode enumeration namespace PTXCmpMode { @@ -179,9 +180,9 @@ enum CmpMode { BASE_MASK = 0xFF, FTZ_FLAG = 0x100 }; -} -} -} // end namespace llvm; +} // namespace PTXCmpMode +} // namespace NVPTX +} // namespace llvm // Defines symbolic names for NVPTX registers. This defines a mapping from // register name to register number. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 298b992..1a1a8ca 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -109,7 +109,7 @@ void VisitGlobalVariableForEmission( Visited.insert(GV); Visiting.erase(GV); } -} +} // namespace void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) { if (!EmitLineNumbers) @@ -808,7 +808,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { // Construct a default subtarget off of the TargetMachine defaults. The // rest of NVPTX isn't friendly to change subtargets per function and // so the default TargetMachine will have all of the options. - StringRef TT = TM.getTargetTriple(); + const Triple &TT = TM.getTargetTriple(); StringRef CPU = TM.getTargetCPU(); StringRef FS = TM.getTargetFeatureString(); const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM); @@ -818,7 +818,6 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { raw_svector_ostream OS1(Str1); MMI = getAnalysisIfAvailable<MachineModuleInfo>(); - MMI->AnalyzeModule(M); // We need to call the parent's one explicitly. //bool Result = AsmPrinter::doInitialization(M); @@ -847,7 +846,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { } // If we're not NVCL we're CUDA, go ahead and emit filenames. 
- if (Triple(TM.getTargetTriple()).getOS() != Triple::NVCL) + if (TM.getTargetTriple().getOS() != Triple::NVCL) recordAndEmitFilenames(M); GlobalsEmitted = false; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index f6f7685..12d80a3 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -349,6 +349,6 @@ public: DebugLoc prevDebugLoc; void emitLineNumberAsDotLoc(const MachineInstr &); }; -} // end of namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp index 7d4be8e..2d5e74c 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp @@ -38,7 +38,7 @@ public: /// \brief Clean up the name to remove symbols invalid in PTX. std::string cleanUpName(StringRef Name); }; -} +} // namespace char NVPTXAssignValidGlobalNames::ID = 0; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp index cfff001..3eb7024 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -98,17 +98,16 @@ private: /// This reordering exposes to optimizeMemoryInstruction more /// optimization opportunities on loads and stores. /// - /// Returns true if this function succesfully hoists an eliminable - /// addrspacecast or V is already such an addrspacecast. - /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X, - /// indices)". - bool hoistAddrSpaceCastFrom(Value *V, int Depth = 0); + /// If this function successfully hoists an eliminable addrspacecast or V is + /// already such an addrspacecast, it returns the transformed value (which is + /// guaranteed to be an addrspacecast); otherwise, it returns nullptr. + Value *hoistAddrSpaceCastFrom(Value *V, int Depth = 0); /// Helper function for GEPs. - bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP, int Depth); + Value *hoistAddrSpaceCastFromGEP(GEPOperator *GEP, int Depth); /// Helper function for bitcasts. - bool hoistAddrSpaceCastFromBitCast(BitCastOperator *BC, int Depth); + Value *hoistAddrSpaceCastFromBitCast(BitCastOperator *BC, int Depth); }; -} +} // namespace char NVPTXFavorNonGenericAddrSpaces::ID = 0; @@ -143,17 +142,19 @@ static bool isEliminableAddrSpaceCast(Value *V) { DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC); } -bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(GEPOperator *GEP, - int Depth) { - if (!hoistAddrSpaceCastFrom(GEP->getPointerOperand(), Depth + 1)) - return false; +Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP( + GEPOperator *GEP, int Depth) { + Value *NewOperand = + hoistAddrSpaceCastFrom(GEP->getPointerOperand(), Depth + 1); + if (NewOperand == nullptr) + return nullptr; - // That hoistAddrSpaceCastFrom succeeds implies GEP's pointer operand is now - // an eliminable addrspacecast. - assert(isEliminableAddrSpaceCast(GEP->getPointerOperand())); - Operator *Cast = cast<Operator>(GEP->getPointerOperand()); + // hoistAddrSpaceCastFrom returns an eliminable addrspacecast or nullptr.
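A concrete IR-level example of the rewrite this helper performs (names and types are illustrative; addrspace(3) is NVPTX shared memory), mirroring the IR examples already used in this file's comments:

  %cast = addrspacecast float addrspace(3)* %X to float*
  %gep  = getelementptr inbounds float, float* %cast, i64 %i

is rebuilt as a GEP on %X with the addrspacecast hoisted past it:

  %gep2 = getelementptr inbounds float, float addrspace(3)* %X, i64 %i
  %asc  = addrspacecast float addrspace(3)* %gep2 to float*

so that a later load or store through the result can shortcut the cast entirely (see optimizeMemoryInstruction).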
+ assert(isEliminableAddrSpaceCast(NewOperand)); + Operator *Cast = cast<Operator>(NewOperand); SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end()); + Value *NewASC; if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) { // GEP = gep (addrspacecast X), indices // => @@ -163,30 +164,31 @@ bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(GEPOperator *GEP, GEP->getSourceElementType(), Cast->getOperand(0), Indices, "", GEPI); NewGEP->setIsInBounds(GEP->isInBounds()); - Value *NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI); + NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI); NewASC->takeName(GEP); + // Without RAUWing GEP, the compiler would visit GEP again and emit + // redundant instructions. This is exercised in test @rauw in + // access-non-generic.ll. GEP->replaceAllUsesWith(NewASC); } else { // GEP is a constant expression. Constant *NewGEP = ConstantExpr::getGetElementPtr( GEP->getSourceElementType(), cast<Constant>(Cast->getOperand(0)), Indices, GEP->isInBounds()); - GEP->replaceAllUsesWith( - ConstantExpr::getAddrSpaceCast(NewGEP, GEP->getType())); + NewASC = ConstantExpr::getAddrSpaceCast(NewGEP, GEP->getType()); } - - return true; + return NewASC; } -bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromBitCast( +Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromBitCast( BitCastOperator *BC, int Depth) { - if (!hoistAddrSpaceCastFrom(BC->getOperand(0), Depth + 1)) - return false; + Value *NewOperand = hoistAddrSpaceCastFrom(BC->getOperand(0), Depth + 1); + if (NewOperand == nullptr) + return nullptr; - // That hoistAddrSpaceCastFrom succeeds implies BC's source operand is now - // an eliminable addrspacecast. - assert(isEliminableAddrSpaceCast(BC->getOperand(0))); - Operator *Cast = cast<Operator>(BC->getOperand(0)); + // hoistAddrSpaceCastFrom returns an eliminable addrspacecast or nullptr. + assert(isEliminableAddrSpaceCast(NewOperand)); + Operator *Cast = cast<Operator>(NewOperand); // Cast = addrspacecast Src // BC = bitcast Cast @@ -197,31 +199,34 @@ bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromBitCast( Type *TypeOfNewCast = PointerType::get(BC->getType()->getPointerElementType(), Src->getType()->getPointerAddressSpace()); + Value *NewBC; if (BitCastInst *BCI = dyn_cast<BitCastInst>(BC)) { Value *NewCast = new BitCastInst(Src, TypeOfNewCast, "", BCI); - Value *NewBC = new AddrSpaceCastInst(NewCast, BC->getType(), "", BCI); + NewBC = new AddrSpaceCastInst(NewCast, BC->getType(), "", BCI); NewBC->takeName(BC); + // Without RAUWing BC, the compiler would visit BC again and emit + // redundant instructions. This is exercised in test @rauw in + // access-non-generic.ll. BC->replaceAllUsesWith(NewBC); } else { // BC is a constant expression. Constant *NewCast = ConstantExpr::getBitCast(cast<Constant>(Src), TypeOfNewCast); - Constant *NewBC = ConstantExpr::getAddrSpaceCast(NewCast, BC->getType()); - BC->replaceAllUsesWith(NewBC); + NewBC = ConstantExpr::getAddrSpaceCast(NewCast, BC->getType()); } - return true; + return NewBC; } -bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFrom(Value *V, - int Depth) { - // Returns true if V is already an eliminable addrspacecast. +Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFrom(Value *V, + int Depth) { + // Returns V if V is already an eliminable addrspacecast. if (isEliminableAddrSpaceCast(V)) - return true; + return V; // Limit the depth to prevent this recursive function from running too long. 
const int MaxDepth = 20; if (Depth >= MaxDepth) - return false; + return nullptr; // If V is a GEP or bitcast, hoist the addrspacecast if any from its pointer // operand. This enables optimizeMemoryInstruction to shortcut addrspacecasts @@ -232,28 +237,29 @@ bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFrom(Value *V, if (BitCastOperator *BC = dyn_cast<BitCastOperator>(V)) return hoistAddrSpaceCastFromBitCast(BC, Depth); - return false; + return nullptr; } bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI, unsigned Idx) { - if (hoistAddrSpaceCastFrom(MI->getOperand(Idx))) { - // load/store (addrspacecast X) => load/store X if shortcutting the - // addrspacecast is valid and can improve performance. - // - // e.g., - // %1 = addrspacecast float addrspace(3)* %0 to float* - // %2 = load float* %1 - // -> - // %2 = load float addrspace(3)* %0 - // - // Note: the addrspacecast can also be a constant expression. - assert(isEliminableAddrSpaceCast(MI->getOperand(Idx))); - Operator *ASC = dyn_cast<Operator>(MI->getOperand(Idx)); - MI->setOperand(Idx, ASC->getOperand(0)); - return true; - } - return false; + Value *NewOperand = hoistAddrSpaceCastFrom(MI->getOperand(Idx)); + if (NewOperand == nullptr) + return false; + + // load/store (addrspacecast X) => load/store X if shortcutting the + // addrspacecast is valid and can improve performance. + // + // e.g., + // %1 = addrspacecast float addrspace(3)* %0 to float* + // %2 = load float* %1 + // -> + // %2 = load float addrspace(3)* %0 + // + // Note: the addrspacecast can also be a constant expression. + assert(isEliminableAddrSpaceCast(NewOperand)); + Operator *ASC = dyn_cast<Operator>(NewOperand); + MI->setOperand(Idx, ASC->getOperand(0)); + return true; } bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h index 14f8bb7..488edec 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -31,6 +31,6 @@ public: MachineBasicBlock::iterator I) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index fe20580..5879df3 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -95,6 +95,6 @@ private: bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; }; -} +} // namespace #endif diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index ed94775..276851f 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -427,7 +427,7 @@ enum NodeType : unsigned { Suld3DV4I16Zero, Suld3DV4I32Zero }; -} +} // namespace NVPTXISD class NVPTXSubtarget; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index aa36b6b..c86f861 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -42,7 +42,7 @@ private: Value *cleanupValue(Value *V); void replaceWith(Instruction *From, ConstantInt *To); }; -} +} // namespace char NVPTXImageOptimizer::ID = 0; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 
dabc3be..76d6597 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -248,7 +248,7 @@ unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned NVPTXInstrInfo::InsertBranch( MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const { + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 9b5d491..179c068 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -66,7 +66,7 @@ public: unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch( MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override; + ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const { return MI.getOperand(2).getImm(); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp new file mode 100644 index 0000000..93d0025 --- /dev/null +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -0,0 +1,115 @@ +//===-- NVPTXLowerAlloca.cpp - Make allocas use local memory =====--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// For each alloca instruction, add a pair of casts to local address space. +// For example, +// +// %A = alloca i32 +// store i32 0, i32* %A ; emits st.u32 +// +// will be transformed to +// +// %A = alloca i32 +// %Local = addrspacecast i32* %A to i32 addrspace(5)* +// %Generic = addrspacecast i32 addrspace(5)* %Local to i32* +// store i32 0, i32* %Generic ; eventually emits st.local.u32 +// +// And we will rely on NVPTXFavorNonGenericAddrSpace to combine the last +// two instructions. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "NVPTXUtilities.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace llvm { +void initializeNVPTXLowerAllocaPass(PassRegistry &); +} + +namespace { +class NVPTXLowerAlloca : public BasicBlockPass { + bool runOnBasicBlock(BasicBlock &BB) override; + +public: + static char ID; // Pass identification, replacement for typeid + NVPTXLowerAlloca() : BasicBlockPass(ID) {} + const char *getPassName() const override { + return "convert address space of alloca'ed memory to local"; + } +}; +} // namespace + +char NVPTXLowerAlloca::ID = 1; + +INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca", + "Lower Alloca", false, false) + +// ============================================================================= +// Main function for this pass. 
+// ============================================================================= +bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (auto &I : BB) { + if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { + Changed = true; + auto PTy = dyn_cast<PointerType>(allocaInst->getType()); + auto ETy = PTy->getElementType(); + auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); + auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); + auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); + auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal, + GenericAddrTy, ""); + NewASCToLocal->insertAfter(allocaInst); + NewASCToGeneric->insertAfter(NewASCToLocal); + for (Value::use_iterator UI = allocaInst->use_begin(), + UE = allocaInst->use_end(); + UI != UE; ) { + // Check Load, Store, GEP, and BitCast uses of the alloca and make them + // use the converted generic address, in order to expose non-generic + // addrspacecast to NVPTXFavorNonGenericAddrSpace. For other types + // of instructions this is unnecessary and may introduce redundant + // address casts. + const auto &AllocaUse = *UI++; + auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); + if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) { + LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto SI = dyn_cast<StoreInst>(AllocaUse.getUser()); + if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) { + SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser()); + if (GI && GI->getPointerOperand() == allocaInst) { + GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser()); + if (BI && BI->getOperand(0) == allocaInst) { + BI->setOperand(0, NewASCToGeneric); + continue; + } + } + } + } + return Changed; +} + +BasicBlockPass *llvm::createNVPTXLowerAllocaPass() { + return new NVPTXLowerAlloca(); +} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h index 10f1135..4b9322c 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h @@ -46,6 +46,6 @@ public: return ImageHandleList[Idx].c_str(); } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 5fd69a6..ea58f77 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -39,7 +39,7 @@ public: private: void calculateFrameObjectOffsets(MachineFunction &Fn); }; -} +} // namespace MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() { return new NVPTXPrologEpilogPass(); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 6e97f9e..3ef997b 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -69,7 +69,7 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { } return ""; } -} +} // namespace llvm NVPTXRegisterInfo::NVPTXRegisterInfo() : NVPTXGenRegisterInfo(0) {} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index e83f735..bb0adc5 100--- 
a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -45,7 +45,7 @@ private: bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx); }; -} +} // namespace char NVPTXReplaceImageHandles::ID = 0; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 069d6e1..71645dc 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -43,7 +43,7 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } -NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, +NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM) : NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM), diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index e9833e5..d452045 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -52,7 +52,7 @@ public: /// This constructor initializes the data members to match that /// of the specified module. /// - NVPTXSubtarget(const std::string &TT, const std::string &CPU, + NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM); const TargetFrameLowering *getFrameLowering() const override { @@ -103,6 +103,6 @@ public: void ParseSubtargetFeatures(StringRef CPU, StringRef FS); }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index a646668..c071ee8 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -54,6 +54,7 @@ void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); void initializeNVPTXLowerKernelArgsPass(PassRegistry &); +void initializeNVPTXLowerAllocaPass(PassRegistry &); } extern "C" void LLVMInitializeNVPTXTarget() { @@ -70,6 +71,7 @@ extern "C" void LLVMInitializeNVPTXTarget() { initializeNVPTXFavorNonGenericAddrSpacesPass( *PassRegistry::getPassRegistry()); initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry()); + initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry()); } static std::string computeDataLayout(bool is64Bit) { @@ -83,7 +85,7 @@ static std::string computeDataLayout(bool is64Bit) { return Ret; } -NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT, +NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -92,7 +94,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT, CM, OL), is64bit(is64bit), TLOF(make_unique<NVPTXTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { - if (Triple(TT).getOS() == Triple::NVCL) + if (TT.getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; else drvInterface = NVPTX::CUDA; @@ -103,18 +105,20 @@ NVPTXTargetMachine::~NVPTXTargetMachine() {} void NVPTXTargetMachine32::anchor() {} -NVPTXTargetMachine32::NVPTXTargetMachine32( - const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions 
&Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) +NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} void NVPTXTargetMachine64::anchor() {} -NVPTXTargetMachine64::NVPTXTargetMachine64( - const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) +NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} namespace { @@ -164,12 +168,11 @@ void NVPTXPassConfig::addIRPasses() { addPass(createNVPTXAssignValidGlobalNamesPass()); addPass(createGenericToNVVMPass()); addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine())); - addPass(createNVPTXFavorNonGenericAddrSpacesPass()); // NVPTXLowerKernelArgs emits alloca for byval parameters which can often - // be eliminated by SROA. We do not run SROA right after NVPTXLowerKernelArgs - // because we plan to merge NVPTXLowerKernelArgs and - // NVPTXFavorNonGenericAddrSpaces into one pass. + // be eliminated by SROA. addPass(createSROAPass()); + addPass(createNVPTXLowerAllocaPass()); + addPass(createNVPTXFavorNonGenericAddrSpacesPass()); // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave // them unused. We could remove dead code in an ad-hoc manner, but that // requires manual work and might be error-prone. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index 2cd10e8..da7f62b 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -34,9 +34,10 @@ class NVPTXTargetMachine : public LLVMTargetMachine { ManagedStringPool ManagedStrPool; public: - NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit); + NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP, + bool is64bit); ~NVPTXTargetMachine() override; const NVPTXSubtarget *getSubtargetImpl(const Function &) const override { @@ -67,7 +68,7 @@ public: class NVPTXTargetMachine32 : public NVPTXTargetMachine { virtual void anchor(); public: - NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU, + NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); @@ -76,7 +77,7 @@ public: class NVPTXTargetMachine64 : public NVPTXTargetMachine { virtual void anchor(); public: - NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU, + NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 7e2ce73..4d937c6 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ 
b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -91,6 +91,6 @@ void dumpInstRec(Value *v, std::set<Instruction *> *visited); void dumpInstRec(Value *v); void dumpParent(Value *v); -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 5e375b7..1c20430 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -75,7 +75,7 @@ private: bool handleFunction(Function *ReflectFunction); void setVarMap(); }; -} +} // namespace ModulePass *llvm::createNVVMReflectPass() { return new NVVMReflect(); diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 1736d03..a699a55 100644 --- a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1184,6 +1184,13 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, Inst = TmpInst; break; } + case PPC::MFTB: { + if (STI.getFeatureBits()[PPC::FeatureMFTB]) { + assert(Inst.getNumOperands() == 2 && "Expecting two operands"); + Inst.setOpcode(PPC::MFSPR); + } + break; + } } } diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 72742dc..b6dd595 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -230,11 +230,11 @@ namespace { MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - if (Triple(TT).isOSDarwin()) + const Triple &TT, StringRef CPU) { + if (TT.isOSDarwin()) return new DarwinPPCAsmBackend(T); - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); - bool IsLittleEndian = Triple(TT).getArch() == Triple::ppc64le; + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); + bool IsLittleEndian = TT.getArch() == Triple::ppc64le; return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI); } diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 992be5b..36119d5 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -31,7 +31,7 @@ namespace { bool needsRelocateWithSymbol(const MCSymbol &Sym, unsigned Type) const override; }; -} +} // namespace PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) : MCELFObjectTargetWriter(Is64Bit, OSABI, diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index ae43e59d..ad614f2 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -50,7 +50,7 @@ enum Fixups { LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; -} -} +} // namespace PPC +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 9537924..b729156 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -309,7 +309,7 @@ unsigned 
PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, // Return the thread-pointer register's encoding. Fixups.push_back(MCFixup::create(0, MO.getExpr(), (MCFixupKind)PPC::fixup_ppc_nofixup)); - Triple TT(STI.getTargetTriple()); + const Triple &TT = STI.getTargetTriple(); bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le; return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2); } diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 1e8e804..489905b 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -63,8 +63,8 @@ static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) { return X; } -static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { +static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); InitPPCMCSubtargetInfo(X, TT, CPU, FS); return X; @@ -219,7 +219,7 @@ public: llvm_unreachable("Unknown pseudo-op: .localentry"); } }; -} +} // namespace static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, @@ -230,7 +230,7 @@ static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, static MCTargetStreamer * createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { - Triple TT(STI.getTargetTriple()); + const Triple &TT = STI.getTargetTriple(); if (TT.getObjectFormat() == Triple::ELF) return new PPCTargetELFStreamer(S); return new PPCTargetMachOStreamer(S); diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 5f2117c..18818a1 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -29,6 +29,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class Target; +class Triple; class StringRef; class raw_pwrite_stream; class raw_ostream; @@ -42,7 +43,7 @@ MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); /// Construct an PPC ELF object writer. MCObjectWriter *createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, @@ -80,7 +81,7 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { return false; } -} // End llvm namespace +} // namespace llvm // Generated files will use "namespace PPC". To avoid symbol clash, // undefine PPC here. PPC may be predefined on some hosts. diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 9d72896..9b5491f 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -51,7 +51,7 @@ public: FixedValue); } }; -} +} // namespace /// computes the log2 of the size of the relocation, /// used for relocation_info::r_length. 
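The Mach-O writer comment just above refers to computing the log2 of a relocation's size for relocation_info::r_length. As a hedged, standalone sketch of that mapping (the helper name is illustrative, not the PPCMachObjectWriter's actual function):

#include <cassert>
#include <cstdio>

// Map a power-of-two relocation size in bytes (1, 2, 4, 8) to the log2
// value a Mach-O relocation_info::r_length field encodes (0..3).
static unsigned getRelocationLog2Size(unsigned NumBytes) {
  assert(NumBytes != 0 && (NumBytes & (NumBytes - 1)) == 0 &&
         "relocation size must be a power of two");
  unsigned Log2 = 0;
  while ((1u << Log2) < NumBytes)
    ++Log2;
  return Log2;
}

int main() {
  std::printf("r_length for a 4-byte relocation: %u\n",
              getRelocationLog2Size(4));   // prints 2
  return 0;
}
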
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index 6075631..ff9b059 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -62,7 +62,7 @@ namespace PPC { /// Assume the condition register is set by MI(a,b), return the predicate if /// we modify the instructions such that condition register is set by MI(b,a). Predicate getSwappedPredicate(Predicate Opcode); -} -} +} // namespace PPC +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h index ae8d8b4..49f77b5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.h +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -98,6 +98,6 @@ namespace llvm { }; } // end namespace PPCII -} // end namespace llvm; +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td index 1a02bcc..641b237 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.td +++ b/contrib/llvm/lib/Target/PowerPC/PPC.td @@ -135,9 +135,9 @@ def FeatureInvariantFunctionDescriptors : "Assume function descriptors are invariant">; def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", "Enable Hardware Transactional Memory instructions">; +def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true", + "Implement mftb using the mfspr instruction">; -def DeprecatedMFTB : SubtargetFeature<"", "DeprecatedMFTB", "true", - "Treat mftb as deprecated">; def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true", "Treat vector data stream cache control instructions as deprecated">; @@ -165,7 +165,7 @@ def ProcessorFeatures { FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureBPERMD, FeatureExtDiv, - DeprecatedMFTB, DeprecatedDST]; + FeatureMFTB, DeprecatedDST]; list<SubtargetFeature> Power8SpecificFeatures = [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto, FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic]; @@ -247,61 +247,75 @@ include "PPCInstrInfo.td" // PowerPC processors supported. 
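The PPC.td hunk above replaces the old DeprecatedMFTB flag with a FeatureMFTB subtarget feature ("Implement mftb using the mfspr instruction"), and the earlier PPCAsmParser hunk rewrites the MFTB opcode to MFSPR when that feature bit is set. A minimal sketch of that feature-gated remapping, using made-up enum and struct names in place of the generated MC-layer types:

#include <bitset>
#include <cstdio>

// Illustrative stand-ins only; the real enums and MCInst live in the PPC
// MC layer and are generated from the .td files.
enum Opcode { MFTB, MFSPR };
enum Feature { FeatureMFTB, NumFeatures };

struct Inst {
  Opcode Opc;
};

// When the subtarget implements mftb via mfspr, rewrite the opcode in
// place, analogous in spirit to the ProcessInstruction hunk earlier.
static void processInstruction(Inst &I, const std::bitset<NumFeatures> &Bits) {
  if (I.Opc == MFTB && Bits[FeatureMFTB])
    I.Opc = MFSPR;
}

int main() {
  std::bitset<NumFeatures> Features;
  Features.set(FeatureMFTB);   // e.g. set because the selected CPU lists FeatureMFTB
  Inst I{MFTB};
  processInstruction(I, Features);
  std::printf("%s\n", I.Opc == MFSPR ? "remapped to mfspr" : "kept mftb");
  return 0;
}
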
// -def : Processor<"generic", G3Itineraries, [Directive32]>; +def : Processor<"generic", G3Itineraries, [Directive32, FeatureMFTB]>; def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL, FeatureFRES, FeatureFRSQRTE, FeatureICBT, FeatureBookE, - FeatureMSYNC, DeprecatedMFTB]>; + FeatureMSYNC, FeatureMFTB]>; def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL, FeatureFRES, FeatureFRSQRTE, FeatureICBT, FeatureBookE, - FeatureMSYNC, DeprecatedMFTB]>; + FeatureMSYNC, FeatureMFTB]>; def : Processor<"601", G3Itineraries, [Directive601]>; -def : Processor<"602", G3Itineraries, [Directive602]>; +def : Processor<"602", G3Itineraries, [Directive602, + FeatureMFTB]>; def : Processor<"603", G3Itineraries, [Directive603, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"603e", G3Itineraries, [Directive603, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"603ev", G3Itineraries, [Directive603, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"604", G3Itineraries, [Directive604, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"604e", G3Itineraries, [Directive604, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"620", G3Itineraries, [Directive620, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"750", G4Itineraries, [Directive750, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"g3", G3Itineraries, [Directive750, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec, - FeatureFRES, FeatureFRSQRTE]>; + FeatureFRES, FeatureFRSQRTE, + FeatureMFTB]>; def : ProcessorModel<"970", G5Model, [Directive970, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX, - Feature64Bit /*, Feature64BitRegs */]>; + Feature64Bit /*, Feature64BitRegs */, + FeatureMFTB]>; def : ProcessorModel<"g5", G5Model, [Directive970, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, FeatureFRES, FeatureFRSQRTE, Feature64Bit /*, Feature64BitRegs */, - DeprecatedMFTB, DeprecatedDST]>; + FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"e500mc", PPCE500mcModel, [DirectiveE500mc, FeatureMFOCRF, FeatureSTFIWX, FeatureICBT, FeatureBookE, - FeatureISEL, DeprecatedMFTB]>; + FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"e5500", PPCE5500Model, [DirectiveE5500, FeatureMFOCRF, Feature64Bit, FeatureSTFIWX, FeatureICBT, FeatureBookE, - FeatureISEL, DeprecatedMFTB]>; + FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"a2", PPCA2Model, [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, @@ -309,7 +323,7 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeaturePOPCNTD, 
FeatureCMPB, FeatureLDBRX, Feature64Bit - /*, Feature64BitRegs */, DeprecatedMFTB]>; + /*, Feature64BitRegs */, FeatureMFTB]>; def : ProcessorModel<"a2q", PPCA2Model, [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, @@ -317,7 +331,7 @@ def : ProcessorModel<"a2q", PPCA2Model, FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit - /*, Feature64BitRegs */, FeatureQPX, DeprecatedMFTB]>; + /*, Feature64BitRegs */, FeatureQPX, FeatureMFTB]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, @@ -325,41 +339,42 @@ def : ProcessorModel<"pwr3", G5Model, def : ProcessorModel<"pwr4", G5Model, [DirectivePwr4, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, - FeatureSTFIWX, Feature64Bit]>; + FeatureSTFIWX, Feature64Bit, FeatureMFTB]>; def : ProcessorModel<"pwr5", G5Model, [DirectivePwr5, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, FeatureSTFIWX, Feature64Bit, - DeprecatedMFTB, DeprecatedDST]>; + FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr5x", G5Model, [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, FeatureSTFIWX, FeatureFPRND, Feature64Bit, - DeprecatedMFTB, DeprecatedDST]>; + FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr6", G5Model, [DirectivePwr6, FeatureAltivec, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, FeatureFPRND, Feature64Bit /*, Feature64BitRegs */, - DeprecatedMFTB, DeprecatedDST]>; + FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr6x", G5Model, [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, FeatureFPRND, Feature64Bit, - DeprecatedMFTB, DeprecatedDST]>; + FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>; def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>; -def : Processor<"ppc", G3Itineraries, [Directive32]>; +def : Processor<"ppc", G3Itineraries, [Directive32, FeatureMFTB]>; def : ProcessorModel<"ppc64", G5Model, [Directive64, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX, - Feature64Bit /*, Feature64BitRegs */]>; + Feature64Bit /*, Feature64BitRegs */, + FeatureMFTB]>; def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.Power8FeatureList>; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index b42b0f9..87a5236 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -440,7 +440,7 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; bool isPPC64 = Subtarget->isPPC64(); - bool isDarwin = Triple(TM.getTargetTriple()).isOSDarwin(); + bool isDarwin = TM.getTargetTriple().isOSDarwin(); const Module *M = MF->getFunction()->getParent(); PICLevel::Level PL = M->getPICLevel(); @@ -1276,7 +1276,8 @@ EmitFunctionStubs(const 
MachineModuleInfoMachO::SymbolListTy &Stubs) { // freed) and since we're at the global level we can use the default // constructed subtarget. std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( - TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString())); + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); auto EmitToStreamer = [&STI] (MCStreamer &S, const MCInst &Inst) { S.EmitInstruction(Inst, *STI); }; @@ -1510,7 +1511,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { static AsmPrinter * createPPCAsmPrinterPass(TargetMachine &tm, std::unique_ptr<MCStreamer> &&Streamer) { - if (Triple(tm.getTargetTriple()).isMacOSX()) + if (tm.getTargetTriple().isMacOSX()) return new PPCDarwinAsmPrinter(tm, std::move(Streamer)); return new PPCLinuxAsmPrinter(tm, std::move(Streamer)); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index 940d55a..2b6030a 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -51,7 +51,7 @@ namespace { } }; char PPCBSel::ID = 0; -} +} // namespace INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector", false, false) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index 69afd68..4161317 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -417,8 +417,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { bool PPCCTRLoops::convertToCTRLoop(Loop *L) { bool MadeChange = false; - Triple TT = Triple(L->getHeader()->getParent()->getParent()-> - getTargetTriple()); + const Triple TT = + Triple(L->getHeader()->getParent()->getParent()->getTargetTriple()); if (!TT.isArch32Bit() && !TT.isArch64Bit()) return MadeChange; // Unknown arch. type. 
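Most hunks in this import share one theme: interfaces that used to take a target triple as a StringRef (forcing callers to re-parse it with Triple(TT)) now take const Triple&, and code that already holds a TargetMachine binds a reference via getTargetTriple(). The PPCCTRLoops hunk above still constructs a Triple because Module::getTargetTriple() returns a string. A self-contained sketch of the difference, with a toy Triple that counts how often its parsing constructor runs (all names here are illustrative, not LLVM's):

#include <cstdio>
#include <string>

// Toy stand-in for llvm::Triple; parsing the string form is the costly step.
struct Triple {
  static int ParseCount;
  std::string Arch;
  explicit Triple(const std::string &Str) {
    ++ParseCount;
    Arch = Str.substr(0, Str.find('-'));   // crude "parse"
  }
  bool is64Bit() const { return Arch == "ppc64" || Arch == "ppc64le"; }
};
int Triple::ParseCount = 0;

struct TargetMachine {
  Triple TT;
  explicit TargetMachine(const std::string &Str) : TT(Str) {}
  // New style: hand out the already-parsed triple by const reference.
  const Triple &getTargetTriple() const { return TT; }
  // Old style: hand out a string, forcing every caller to re-parse.
  std::string getTargetTripleString() const { return "ppc64le-unknown-linux"; }
};

int main() {
  TargetMachine TM("ppc64le-unknown-linux");

  // Old pattern: each query constructs (re-parses) a Triple.
  bool Old64 = Triple(TM.getTargetTripleString()).is64Bit();

  // New pattern: bind a reference to the triple the target machine owns.
  const Triple &TT = TM.getTargetTriple();
  bool New64 = TT.is64Bit();

  std::printf("64-bit? %d %d, parses so far: %d\n", Old64, New64,
              Triple::ParseCount);   // parses so far: 2
  return 0;
}
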
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h index eb904a8..550cac6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h @@ -29,7 +29,7 @@ inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index fc89753..9cd9c2f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -191,7 +191,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE, "PowerPC Early-Return Creation", false, false) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp index a561d5b..82ff530 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -2347,4 +2347,4 @@ namespace llvm { return new PPCFastISel(FuncInfo, LibInfo); return nullptr; } -} +} // namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h index 28d074e..b232863 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -93,6 +93,6 @@ public: const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index afc1f36..5f9f9f2 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -234,7 +234,7 @@ private: SDNode *transferMemOperands(SDNode *N, SDNode *Result); }; -} +} // namespace /// InsertVRSaveCode - Once the entire function has been instruction selected, /// all virtual registers are created and all machine instructions are built, @@ -1301,12 +1301,9 @@ class BitPermutationSelector { // Now, remove all groups with this underlying value and rotation // factor. - for (auto I = BitGroups.begin(); I != BitGroups.end();) { - if (I->V == VRI.V && I->RLAmt == VRI.RLAmt) - I = BitGroups.erase(I); - else - ++I; - } + eraseMatchingBitGroups([VRI](const BitGroup &BG) { + return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt; + }); } } @@ -1337,12 +1334,9 @@ class BitPermutationSelector { } // Now, remove all groups with this underlying value and rotation factor. - for (auto I = BitGroups.begin(); I != BitGroups.end();) { - if (I->V == VRI.V && I->RLAmt == VRI.RLAmt) - I = BitGroups.erase(I); - else - ++I; - } + eraseMatchingBitGroups([VRI](const BitGroup &BG) { + return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt; + }); } if (InstCnt) *InstCnt += BitGroups.size(); @@ -1544,7 +1538,7 @@ class BitPermutationSelector { // Repl32 true, but are trivially convertable to Repl32 false. Such a // group is trivially convertable if it overlaps only with the lower 32 // bits, and the group has not been coalesced. - auto MatchingBG = [VRI](BitGroup &BG) { + auto MatchingBG = [VRI](const BitGroup &BG) { if (VRI.V != BG.V) return false; @@ -1675,12 +1669,7 @@ class BitPermutationSelector { // Now, remove all groups with this underlying value and rotation // factor. 
- for (auto I = BitGroups.begin(); I != BitGroups.end();) { - if (MatchingBG(*I)) - I = BitGroups.erase(I); - else - ++I; - } + eraseMatchingBitGroups(MatchingBG); } } @@ -1740,12 +1729,10 @@ class BitPermutationSelector { // Now, remove all groups with this underlying value and rotation factor. if (Res) - for (auto I = BitGroups.begin(); I != BitGroups.end();) { - if (I->V == VRI.V && I->RLAmt == VRI.RLAmt && I->Repl32 == VRI.Repl32) - I = BitGroups.erase(I); - else - ++I; - } + eraseMatchingBitGroups([VRI](const BitGroup &BG) { + return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt && + BG.Repl32 == VRI.Repl32; + }); } // Because 64-bit rotates are more flexible than inserts, we might have a @@ -1846,6 +1833,11 @@ class BitPermutationSelector { return nullptr; } + void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) { + BitGroups.erase(std::remove_if(BitGroups.begin(), BitGroups.end(), F), + BitGroups.end()); + } + SmallVector<ValueBit, 64> Bits; bool HasZeros; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2600ee5..1cdfb41 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -3765,7 +3765,7 @@ struct TailCallArgumentInfo { TailCallArgumentInfo() : FrameIdx(0) {} }; -} +} // namespace /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. static void diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h index 7fd3f9c..c33d605 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -353,7 +353,7 @@ namespace llvm { /// the last operand. TOC_ENTRY }; - } + } // namespace PPCISD /// Define some predicates that are used for node matching. namespace PPC { @@ -405,7 +405,7 @@ namespace llvm { /// If this is a qvaligni shuffle mask, return the shift /// amount, otherwise return -1. 
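The BitPermutationSelector hunks above consolidate several hand-rolled "iterate and erase" loops into a single eraseMatchingBitGroups helper driven by a predicate, i.e. the erase–remove idiom. A standalone sketch of that idiom, with a simplified BitGroup standing in for the selector's real record; the real helper takes llvm::function_ref (a non-owning callable view), while std::function is used here only to keep the sketch standard-library-only:

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Simplified stand-in for the selector's BitGroup record.
struct BitGroup {
  int V;      // underlying value id
  int RLAmt;  // rotation amount
};

// Remove every group matching the predicate in a single pass, instead of a
// hand-written loop that erases elements one at a time.
static void eraseMatchingBitGroups(std::vector<BitGroup> &Groups,
                                   const std::function<bool(const BitGroup &)> &Pred) {
  Groups.erase(std::remove_if(Groups.begin(), Groups.end(), Pred), Groups.end());
}

int main() {
  std::vector<BitGroup> Groups = {{1, 3}, {2, 3}, {1, 5}, {1, 3}};
  const int V = 1, RLAmt = 3;
  eraseMatchingBitGroups(Groups, [=](const BitGroup &BG) {
    return BG.V == V && BG.RLAmt == RLAmt;
  });
  std::printf("groups left: %zu\n", Groups.size());   // 2
  return 0;
}
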
int isQVALIGNIShuffleMask(SDNode *N); - } + } // namespace PPC class PPCTargetLowering : public TargetLowering { const PPCSubtarget &Subtarget; @@ -871,6 +871,6 @@ namespace llvm { CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); -} +} // namespace llvm #endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index e27bf7f5..9ff604b 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1142,7 +1142,9 @@ def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef), def:Pat<(vpkudum_swapped_shuffle v16i8:$vA, v16i8:$vB), (VPKUDUM $vB, $vA)>; - +def VGBBD : VX2_Int_Ty2<1292, "vgbbd", int_ppc_altivec_vgbbd, v16i8, v16i8>; +def VBPERMQ : VX1_Int_Ty2<1356, "vbpermq", int_ppc_altivec_vbpermq, + v2i64, v16i8>; } // end HasP8Altivec // Crypto instructions (from builtins) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h index cf71b1c..ec94fa5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h @@ -38,6 +38,6 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0, return MIB.addFrameIndex(FI).addImm(Offset); } -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index b4bb50c..d3bb7a6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -548,7 +548,7 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -593,7 +593,7 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, // Select analysis. 
bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { if (!Subtarget.hasISEL()) @@ -634,8 +634,7 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc dl, - unsigned DestReg, - const SmallVectorImpl<MachineOperand> &Cond, + unsigned DestReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { assert(Cond.size() == 2 && "PPC branch conditions have two components!"); @@ -1213,9 +1212,8 @@ bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { return !isPredicated(MI); } -bool PPCInstrInfo::PredicateInstruction( - MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const { +bool PPCInstrInfo::PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Pred) const { unsigned OpC = MI->getOpcode(); if (OpC == PPC::BLR || OpC == PPC::BLR8) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) { @@ -1306,9 +1304,8 @@ bool PPCInstrInfo::PredicateInstruction( return false; } -bool PPCInstrInfo::SubsumesPredicate( - const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const { +bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { assert(Pred1.size() == 2 && "Invalid PPC first predicate"); assert(Pred2.size() == 2 && "Invalid PPC second predicate"); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 7fd076a..39bf454 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -106,7 +106,7 @@ public: UseNode, UseIdx); } - bool hasLowDefLatency(const InstrItineraryData *ItinData, + bool hasLowDefLatency(const TargetSchedModel &SchedModel, const MachineInstr *DefMI, unsigned DefIdx) const override { // Machine LICM should hoist all instructions in low-register-pressure @@ -141,18 +141,14 @@ public: bool AllowModify) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; // Select analysis. 
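The PPCInstrInfo hunks above switch the branch, select, and predication hooks from const SmallVectorImpl<MachineOperand>& to ArrayRef<MachineOperand>: a non-owning view, so the same hook accepts a SmallVector, a std::vector, or a plain array without copies. A sketch of the idea using a minimal hand-written view type rather than LLVM's ArrayRef (std::span plays the same role in C++20):

#include <cstddef>
#include <cstdio>
#include <vector>

// Minimal non-owning view over a contiguous sequence, in the spirit of
// llvm::ArrayRef.
template <typename T> class View {
  const T *Data = nullptr;
  std::size_t Len = 0;

public:
  View() = default;
  View(const std::vector<T> &V) : Data(V.data()), Len(V.size()) {}
  template <std::size_t N> View(const T (&A)[N]) : Data(A), Len(N) {}
  std::size_t size() const { return Len; }
  const T &operator[](std::size_t I) const { return Data[I]; }
};

// A hook that only reads its condition operands can take a View and no
// longer cares which container the caller built them in.
static unsigned insertBranch(View<int> Cond) {
  return Cond.size() == 0 ? 1u : 2u;   // say: 1 insn unconditional, 2 conditional
}

int main() {
  std::vector<int> CondVec = {42, 7};
  int CondArr[] = {42};
  std::printf("%u %u %u\n", insertBranch(CondVec), insertBranch(CondArr),
              insertBranch(View<int>()));   // 2 2 1
  return 0;
}
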
- bool canInsertSelect(const MachineBasicBlock&, - const SmallVectorImpl<MachineOperand> &Cond, - unsigned, unsigned, int&, int&, int&) const override; - void insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl<MachineOperand> &Cond, + bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, + unsigned, unsigned, int &, int &, int &) const override; + void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; void copyPhysReg(MachineBasicBlock &MBB, @@ -211,10 +207,10 @@ public: bool isUnpredicatedTerminator(const MachineInstr *MI) const override; bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const override; + ArrayRef<MachineOperand> Pred) const override; - bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const override; + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; @@ -241,6 +237,6 @@ public: void getNoopForMachoTarget(MCInst &NopInst) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td index c5a044c..b50124d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -2225,7 +2225,7 @@ def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT), "mtspr $SPR, $RT", IIC_SprMTSPR>; def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR), - "mftb $RT, $SPR", IIC_SprMFTB>, Deprecated<DeprecatedMFTB>; + "mftb $RT, $SPR", IIC_SprMFTB>; // A pseudo-instruction used to implement the read of the 64-bit cycle counter // on a 32-bit target. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp index b4e1c09..e783b5e 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp @@ -88,7 +88,7 @@ namespace { const TargetTransformInfo *TTI; const DataLayout *DL; }; -} +} // namespace char PPCLoopDataPrefetch::ID = 0; INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index b6e7799..1891b63 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -87,7 +87,7 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; }; -} +} // namespace char PPCLoopPreIncPrep::ID = 0; static const char *name = "Prepare loop for pre-inc. 
addressing modes"; @@ -113,7 +113,7 @@ namespace { protected: ScalarEvolution *SE; }; -} +} // namespace static bool IsPtrInBounds(Value *BasePtr) { Value *StrippedBasePtr = BasePtr; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 05cb6e1..c44d5d7 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -40,7 +40,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ Mangler *Mang = AP.Mang; const DataLayout *DL = TM.getDataLayout(); MCContext &Ctx = AP.OutContext; - bool isDarwin = Triple(TM.getTargetTriple()).isOSDarwin(); + bool isDarwin = TM.getTargetTriple().isOSDarwin(); SmallString<128> Name; StringRef Suffix; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h index 2c1378d..d2eaeb4 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -26,6 +26,6 @@ public: ~PPCSelectionDAGInfo(); }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index f313b0a..cf603fe 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -47,7 +47,7 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } -PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, +PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const PPCTargetMachine &TM) : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT), IsPPC64(TargetTriple.getArch() == Triple::ppc64 || @@ -91,7 +91,7 @@ void PPCSubtarget::initializeEnvironment() { IsPPC4xx = false; IsPPC6xx = false; IsE500 = false; - DeprecatedMFTB = false; + FeatureMFTB = false; DeprecatedDST = false; HasLazyResolverStubs = false; HasICBT = false; @@ -175,7 +175,7 @@ bool PPCSubtarget::enableMachineScheduler() const { } // This overrides the PostRAScheduler bit in the SchedModel for each CPU. -bool PPCSubtarget::enablePostMachineScheduler() const { return true; } +bool PPCSubtarget::enablePostRAScheduler() const { return true; } PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const { return TargetSubtargetInfo::ANTIDEP_ALL; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h index 8d95508..ea17e1c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -58,7 +58,7 @@ namespace PPC { DIR_PWR8, DIR_64 }; -} +} // namespace PPC class GlobalValue; class TargetMachine; @@ -110,7 +110,7 @@ protected: bool IsE500; bool IsPPC4xx; bool IsPPC6xx; - bool DeprecatedMFTB; + bool FeatureMFTB; bool DeprecatedDST; bool HasLazyResolverStubs; bool IsLittleEndian; @@ -135,8 +135,8 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. /// - PPCSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, const PPCTargetMachine &TM); + PPCSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const PPCTargetMachine &TM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
@@ -237,7 +237,7 @@ public: bool isPPC4xx() const { return IsPPC4xx; } bool isPPC6xx() const { return IsPPC6xx; } bool isE500() const { return IsE500; } - bool isDeprecatedMFTB() const { return DeprecatedMFTB; } + bool isFeatureMFTB() const { return FeatureMFTB; } bool isDeprecatedDST() const { return DeprecatedDST; } bool hasICBT() const { return HasICBT; } bool hasInvariantFunctionDescriptors() const { @@ -274,7 +274,7 @@ public: // Scheduling customization. bool enableMachineScheduler() const override; // This overrides the PostRAScheduler bit in the SchedModel for each CPU. - bool enablePostMachineScheduler() const override; + bool enablePostRAScheduler() const override; AntiDepBreakMode getAntiDepBreakMode() const override; void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; @@ -286,6 +286,6 @@ public: bool enableSubRegLiveness() const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 2dc0d82..7a9db0f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -156,7 +156,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace INITIALIZE_PASS_BEGIN(PPCTLSDynamicCall, DEBUG_TYPE, "PowerPC TLS Dynamic Call Fixup", false, false) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp index bf165c9..61b963f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp @@ -145,7 +145,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace INITIALIZE_PASS(PPCTOCRegDeps, DEBUG_TYPE, "PowerPC TOC Register Dependencies", false, false) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 50d4395..074bc87 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -98,13 +98,12 @@ static std::string getDataLayoutString(const Triple &T) { return Ret; } -static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, StringRef TT) { +static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, + const Triple &TT) { std::string FullFS = FS; - Triple TargetTriple(TT); // Make sure 64-bit features are available when CPUname is generic - if (TargetTriple.getArch() == Triple::ppc64 || - TargetTriple.getArch() == Triple::ppc64le) { + if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) { if (!FullFS.empty()) FullFS = "+64bit," + FullFS; else @@ -165,14 +164,15 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, // with what are (currently) non-function specific overrides as it goes into the // LLVMTargetMachine constructor and then using the stored value in the // Subtarget constructor below it. 
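The computeFSAdditions hunk above now takes the Triple directly and, for ppc64/ppc64le targets, prepends "+64bit" so 64-bit features are available even with a generic CPU. A minimal sketch of just that string composition (simplified; per its signature, the real helper also takes the optimization level):

#include <cstdio>
#include <string>

// Prepend "+64bit" to a subtarget feature string when targeting 64-bit
// PowerPC, mirroring the shape of the computeFSAdditions hunk above.
static std::string addFeatureAdditions(const std::string &FS, bool IsPPC64) {
  std::string FullFS = FS;
  if (IsPPC64) {
    if (!FullFS.empty())
      FullFS = "+64bit," + FullFS;
    else
      FullFS = "+64bit";
  }
  return FullFS;
}

int main() {
  std::printf("%s\n", addFeatureAdditions("", true).c_str());          // +64bit
  std::printf("%s\n", addFeatureAdditions("+altivec", true).c_str());  // +64bit,+altivec
  std::printf("%s\n", addFeatureAdditions("+altivec", false).c_str()); // +altivec
  return 0;
}
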
-PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, +PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, getDataLayoutString(Triple(TT)), TT, CPU, + : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU, computeFSAdditions(FS, OL, TT), Options, RM, CM, OL), - TLOF(createTLOF(Triple(getTargetTriple()))), - TargetABI(computeTargetABI(Triple(TT), Options)) { + TLOF(createTLOF(getTargetTriple())), + TargetABI(computeTargetABI(TT, Options)) { initAsmInfo(); } @@ -180,23 +180,21 @@ PPCTargetMachine::~PPCTargetMachine() {} void PPC32TargetMachine::anchor() { } -PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT, +PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { -} + : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} void PPC64TargetMachine::anchor() { } -PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, +PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { -} + : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} const PPCSubtarget * PPCTargetMachine::getSubtargetImpl(const Function &F) const { @@ -264,9 +262,8 @@ void PPCPassConfig::addIRPasses() { // For the BG/Q (or if explicitly requested), add explicit data prefetch // intrinsics. - bool UsePrefetching = - Triple(TM->getTargetTriple()).getVendor() == Triple::BGQ && - getOptLevel() != CodeGenOpt::None; + bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && + getOptLevel() != CodeGenOpt::None; if (EnablePrefetch.getNumOccurrences() > 0) UsePrefetching = EnablePrefetch; if (UsePrefetching) @@ -320,7 +317,7 @@ void PPCPassConfig::addMachineSSAOptimization() { TargetPassConfig::addMachineSSAOptimization(); // For little endian, remove where possible the vector swap instructions // introduced at code generation to normalize vector element order. 
- if (Triple(TM->getTargetTriple()).getArch() == Triple::ppc64le && + if (TM->getTargetTriple().getArch() == Triple::ppc64le && !DisableVSXSwapRemoval) addPass(createPPCVSXSwapRemovalPass()); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h index 7a49058..5c0f7e6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h @@ -32,8 +32,8 @@ private: mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap; public: - PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, + PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~PPCTargetMachine() override; @@ -50,7 +50,7 @@ public: } bool isELFv2ABI() const { return TargetABI == PPC_ABI_ELFv2; } bool isPPC64() const { - Triple TT(getTargetTriple()); + const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; }; @@ -60,8 +60,8 @@ public: class PPC32TargetMachine : public PPCTargetMachine { virtual void anchor(); public: - PPC32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -71,8 +71,8 @@ public: class PPC64TargetMachine : public PPCTargetMachine { virtual void anchor(); public: - PPC64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h index dbe7617..a5c4c23 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h @@ -22,6 +22,6 @@ public: virtual void emitAbiVersion(int AbiVersion) = 0; virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp index 5e3ae2a..537db65 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp @@ -165,7 +165,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE, "PowerPC VSX Copy Legalization", false, false) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index f352fa6..a029ddf 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -317,7 +317,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE, "PowerPC VSX FMA Mutation", false, false) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index e238669..939293a 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ 
b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -809,7 +809,7 @@ void PPCVSXSwapRemoval::dumpSwapVector() { DEBUG(dbgs() << "\n"); } -} // end default namespace +} // namespace INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE, "PowerPC VSX Swap Removal", false, false) diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 6b3b51a..4a33f7f 100644 --- a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -76,7 +76,9 @@ class SparcAsmParser : public MCTargetAsmParser { bool matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc); bool parseDirectiveWord(unsigned Size, SMLoc L); - bool is64Bit() const { return STI.getTargetTriple().startswith("sparcv9"); } + bool is64Bit() const { + return STI.getTargetTriple().getArchName().startswith("sparcv9"); + } void expandSET(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); @@ -945,6 +947,8 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, return false; } +// Determine if an expression contains a reference to the symbol +// "_GLOBAL_OFFSET_TABLE_". static bool hasGOTReference(const MCExpr *Expr) { switch (Expr->getKind()) { case MCExpr::Target: @@ -996,6 +1000,13 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, bool isPIC = getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_; + // Ugly: if a sparc assembly expression says "%hi(...)" but the + // expression within contains _GLOBAL_OFFSET_TABLE_, it REALLY means + // %pc22. Same with %lo -> %pc10. Worse, if it doesn't contain that, + // the meaning depends on whether the assembler was invoked with + // -KPIC or not: if so, it really means %got22/%got10; if not, it + // actually means what it said! Sigh, historical mistakes... 
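The hasGOTReference helper documented above walks an assembly expression looking for a reference to the symbol _GLOBAL_OFFSET_TABLE_, which, per the comment, changes what %hi/%lo modifiers actually mean (%pc22/%pc10, or %got22/%got10 under -KPIC). A self-contained sketch of that kind of recursive walk over a toy expression type; llvm::MCExpr is the real type, and the node layout here is purely illustrative:

#include <cstdio>
#include <memory>
#include <string>
#include <vector>

// Toy expression tree: a node is either a named symbol reference or an
// operator over child expressions.
struct Expr {
  std::string Symbol;                        // non-empty for symbol refs
  std::vector<std::unique_ptr<Expr>> Kids;   // operands of compound exprs
};

// Recursively check whether any symbol reference in the expression names
// _GLOBAL_OFFSET_TABLE_, in the spirit of SparcAsmParser's hasGOTReference.
static bool hasGOTReference(const Expr &E) {
  if (E.Symbol == "_GLOBAL_OFFSET_TABLE_")
    return true;
  for (const auto &Kid : E.Kids)
    if (hasGOTReference(*Kid))
      return true;
  return false;
}

int main() {
  // Model the payload of %hi(_GLOBAL_OFFSET_TABLE_ + 4) as an add node
  // with a symbol leaf and a constant leaf.
  auto Sym = std::make_unique<Expr>();
  Sym->Symbol = "_GLOBAL_OFFSET_TABLE_";
  auto Cst = std::make_unique<Expr>();
  Expr Add;
  Add.Kids.push_back(std::move(Sym));
  Add.Kids.push_back(std::move(Cst));
  std::printf("GOT reference: %s\n", hasGOTReference(Add) ? "yes" : "no");
  return 0;
}
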
+ switch(VK) { default: break; case SparcMCExpr::VK_Sparc_LO: diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 3e56b9e..59f011a 100644 --- a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -41,7 +41,7 @@ public: raw_ostream &VStream, raw_ostream &CStream) const override; }; -} +} // namespace namespace llvm { extern Target TheSparcTarget, TheSparcV9Target, TheSparcelTarget; diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 9388527..d1d7aaa 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -297,10 +297,8 @@ namespace { } // end anonymous namespace - MCAsmBackend *llvm::createSparcAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, - StringRef CPU) { - return new ELFSparcAsmBackend(T, Triple(TT).getOS()); + const Triple &TT, StringRef CPU) { + return new ELFSparcAsmBackend(T, TT.getOS()); } diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 4f07ae2..800a5f2 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -31,8 +31,12 @@ namespace { protected: unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; + + bool needsRelocateWithSymbol(const MCSymbol &Sym, + unsigned Type) const override; + }; -} +} // namespace unsigned SparcELFObjectWriter::GetRelocType(const MCValue &Target, const MCFixup &Fixup, @@ -105,6 +109,27 @@ unsigned SparcELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_SPARC_NONE; } +bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, + unsigned Type) const { + switch (Type) { + default: + return false; + + // All relocations that use a GOT need a symbol, not an offset, as + // the offset of the symbol within the section is irrelevant to + // where the GOT entry is. Don't need to list all the TLS entries, + // as they're all marked as requiring a symbol anyways. 
+ case ELF::R_SPARC_GOT10: + case ELF::R_SPARC_GOT13: + case ELF::R_SPARC_GOT22: + case ELF::R_SPARC_GOTDATA_HIX22: + case ELF::R_SPARC_GOTDATA_LOX10: + case ELF::R_SPARC_GOTDATA_OP_HIX22: + case ELF::R_SPARC_GOTDATA_OP_LOX10: + return true; + } +} + MCObjectWriter *llvm::createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, bool IsLittleEndian, diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h index 8d79396..34c58da 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h @@ -91,7 +91,7 @@ namespace llvm { LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; - } -} + } // namespace Sparc +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index d34c879..91d2eee 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -63,12 +63,11 @@ static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) { return X; } -static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { +static MCSubtargetInfo * +createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); - Triple TheTriple(TT); if (CPU.empty()) - CPU = (TheTriple.getArch() == Triple::sparcv9) ? "v9" : "v8"; + CPU = (TT.getArch() == Triple::sparcv9) ? "v9" : "v8"; InitSparcMCSubtargetInfo(X, TT, CPU, FS); return X; } diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index 28e2119..8f62de4 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -25,6 +25,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class Target; +class Triple; class StringRef; class raw_pwrite_stream; class raw_ostream; @@ -37,10 +38,10 @@ MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createSparcAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); MCObjectWriter *createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, bool IsLIttleEndian, uint8_t OSABI); -} // End llvm namespace +} // namespace llvm // Defines symbolic names for Sparc registers. This defines a mapping from // register name to register number. diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.h b/contrib/llvm/lib/Target/Sparc/Sparc.h index 96378d5..133af86 100644 --- a/contrib/llvm/lib/Target/Sparc/Sparc.h +++ b/contrib/llvm/lib/Target/Sparc/Sparc.h @@ -33,7 +33,7 @@ namespace llvm { void LowerSparcMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP); -} // end namespace llvm; +} // namespace llvm namespace llvm { // Enums corresponding to Sparc condition codes, both icc's and fcc's. 
These @@ -74,7 +74,7 @@ namespace llvm { FCC_ULE = 14+16, // Unordered or Less or Equal FCC_O = 15+16 // Ordered }; - } + } // namespace SPCC inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) { switch (CC) { diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h index bb3b788..3d73bbd 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h @@ -55,6 +55,6 @@ private: }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h index b6bc3d2..a4b9c79 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -49,7 +49,7 @@ namespace llvm { TLS_LD, TLS_CALL }; - } + } // namespace SPISD class SparcTargetLowering : public TargetLowering { const SparcSubtarget *Subtarget; diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 4b70f16..f87cee4 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -229,7 +229,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, unsigned SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h index 6e08418..b59dd89 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -73,8 +73,7 @@ public: unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; void copyPhysReg(MachineBasicBlock &MBB, @@ -97,6 +96,6 @@ public: unsigned getGlobalBaseReg(MachineFunction *MF) const; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h index 1047442..0471443 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h @@ -51,6 +51,6 @@ namespace llvm { void setLeafProc(bool rhs) { IsLeafProc = rhs; } bool isLeafProc() const { return IsLeafProc; } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h index 6818291..2ceae82 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h @@ -26,6 +26,6 @@ public: ~SparcSelectionDAGInfo() override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp index ce1105f..479b25d 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -49,7 +49,7 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, return 
*this; } -SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, +SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, TargetMachine &TM, bool is64Bit) : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h index e6cf460..983b119 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -43,7 +43,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo { SparcFrameLowering FrameLowering; public: - SparcSubtarget(const std::string &TT, const std::string &CPU, + SparcSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, TargetMachine &TM, bool is64bit); const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; } diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index d43cd9e..725d7f0 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -54,13 +54,13 @@ static std::string computeDataLayout(const Triple &T, bool is64Bit) { /// SparcTargetMachine ctor - Create an ILP32 architecture model /// -SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, +SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64bit) - : LLVMTargetMachine(T, computeDataLayout(Triple(TT), is64bit), TT, CPU, FS, - Options, RM, CM, OL), + : LLVMTargetMachine(T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options, + RM, CM, OL), TLOF(make_unique<SparcELFTargetObjectFile>()), Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); @@ -106,19 +106,16 @@ void SparcPassConfig::addPreEmitPass(){ void SparcV8TargetMachine::anchor() { } -SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, - StringRef TT, StringRef CPU, - StringRef FS, +SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, - CodeModel::Model CM, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { -} + : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} void SparcV9TargetMachine::anchor() { } -SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, StringRef TT, +SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, @@ -127,7 +124,7 @@ SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, StringRef TT, void SparcelTargetMachine::anchor() {} -SparcelTargetMachine::SparcelTargetMachine(const Target &T, StringRef TT, +SparcelTargetMachine::SparcelTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h index fd05b8c..903c2d1 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h +++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h @@ -24,10 +24,10 @@ class SparcTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; 
SparcSubtarget Subtarget; public: - SparcTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool is64bit); + SparcTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, + bool is64bit); ~SparcTargetMachine() override; const SparcSubtarget *getSubtargetImpl(const Function &) const override { @@ -46,9 +46,8 @@ public: class SparcV8TargetMachine : public SparcTargetMachine { virtual void anchor(); public: - SparcV8TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, + SparcV8TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -58,7 +57,7 @@ public: class SparcV9TargetMachine : public SparcTargetMachine { virtual void anchor(); public: - SparcV9TargetMachine(const Target &T, StringRef TT, StringRef CPU, + SparcV9TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); @@ -68,7 +67,7 @@ class SparcelTargetMachine : public SparcTargetMachine { virtual void anchor(); public: - SparcelTargetMachine(const Target &T, StringRef TT, StringRef CPU, + SparcelTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index 0e8a680..57eebe1 100644 --- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -111,7 +111,7 @@ bool SystemZMCAsmBackend::writeNopData(uint64_t Count, MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); + const Triple &TT, StringRef CPU) { + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); return new SystemZMCAsmBackend(OSABI); } diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 92681cf..8188210 100644 --- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -154,9 +154,8 @@ static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) { return X; } -static MCSubtargetInfo *createSystemZMCSubtargetInfo(StringRef TT, - StringRef CPU, - StringRef FS) { +static MCSubtargetInfo * +createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); InitSystemZMCSubtargetInfo(X, TT, CPU, FS); return X; diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index 36ea750..0db48fe 100644 --- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -23,6 +23,7 @@ class MCRegisterInfo; class MCSubtargetInfo; class StringRef; class 
Target; +class Triple; class raw_pwrite_stream; class raw_ostream; @@ -84,7 +85,7 @@ MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII, MCAsmBackend *createSystemZMCAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); MCObjectWriter *createSystemZObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI); } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 6399293..0eb3d65 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1113,8 +1113,8 @@ bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store, if (V1 == V2 && End1 == End2) return false; - return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getAAInfo()), - AliasAnalysis::Location(V2, End2, Store->getAAInfo())); + return !AA->alias(MemoryLocation(V1, End1, Load->getAAInfo()), + MemoryLocation(V2, End2, Store->getAAInfo())); } bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const { diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 91e12c2..7584579 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -3292,7 +3292,7 @@ struct Permute { unsigned Operand; unsigned char Bytes[SystemZ::VectorBytes]; }; -} +} // namespace static const Permute PermuteForms[] = { // VMRHG @@ -3574,7 +3574,7 @@ struct GeneralShuffle { // The type of the shuffle result. EVT VT; }; -} +} // namespace // Add an extra undefined element to the shuffle. void GeneralShuffle::addUndef() { diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 4346850..5d4a34f 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -362,7 +362,7 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { unsigned SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // In this function we output 32-bit branches, which should always // have enough range. 
They can be shortened and relaxed by later code @@ -530,8 +530,7 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, } bool SystemZInstrInfo:: -PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const { +PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const { assert(Pred.size() == 2 && "Invalid condition"); unsigned CCValid = Pred[0].getImm(); unsigned CCMask = Pred[1].getImm(); diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index e47f2ee..31c9db2 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -149,8 +149,7 @@ public: bool AllowModify) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const override; @@ -167,8 +166,7 @@ public: unsigned NumCyclesF, unsigned ExtraPredCyclesF, const BranchProbability &Probability) const override; bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const - override; + ArrayRef<MachineOperand> Pred) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp index 05aede3..eb5e5c0 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -32,8 +32,7 @@ SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { return *this; } -SystemZSubtarget::SystemZSubtarget(const std::string &TT, - const std::string &CPU, +SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), @@ -41,9 +40,9 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT, HasPopulationCount(false), HasFastSerialization(false), HasInterlockedAccess1(false), HasMiscellaneousExtensions(false), HasTransactionalExecution(false), HasProcessorAssist(false), - HasVector(false), - TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - TLInfo(TM, *this), TSInfo(*TM.getDataLayout()), FrameLowering() {} + HasVector(false), TargetTriple(TT), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), + TSInfo(*TM.getDataLayout()), FrameLowering() {} // Return true if GV binds locally under reloc model RM. 
static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) { diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h index 9a1f593..f7eaf01c 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h @@ -56,7 +56,7 @@ private: SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); public: - SystemZSubtarget(const std::string &TT, const std::string &CPU, + SystemZSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM); const TargetFrameLowering *getFrameLowering() const override { diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index a34cdaf..00cbbd1 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -43,9 +43,8 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) { return VectorABI; } -static std::string computeDataLayout(StringRef TT, StringRef CPU, +static std::string computeDataLayout(const Triple &TT, StringRef CPU, StringRef FS) { - const Triple Triple(TT); bool VectorABI = UsesVectorABI(CPU, FS); std::string Ret = ""; @@ -53,7 +52,7 @@ static std::string computeDataLayout(StringRef TT, StringRef CPU, Ret += "E"; // Data mangling. - Ret += DataLayout::getManglingComponent(Triple); + Ret += DataLayout::getManglingComponent(TT); // Make sure that global data has at least 16 bits of alignment by // default, so that we can refer to it using LARL. We don't have any @@ -79,13 +78,13 @@ static std::string computeDataLayout(StringRef TT, StringRef CPU, return Ret; } -SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT, +SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), - TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options, + RM, CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h index 5ded07c..0a81e1f 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h @@ -27,7 +27,7 @@ class SystemZTargetMachine : public LLVMTargetMachine { SystemZSubtarget Subtarget; public: - SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU, + SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp index d498bb1..19b5e2a 100644 --- a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -44,8 +44,8 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx, const TargetMachine &TM) { Ctx = &ctx; DL = TM.getDataLayout(); - InitMCObjectFileInfo(TM.getTargetTriple(), - TM.getRelocationModel(), TM.getCodeModel(), *Ctx); + InitMCObjectFileInfo(TM.getTargetTriple(), TM.getRelocationModel(), + TM.getCodeModel(), *Ctx); } 
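Aside on the InsertBranch/PredicateInstruction hunks above (SparcInstrInfo and SystemZInstrInfo): they replace const SmallVectorImpl<MachineOperand>& with ArrayRef<MachineOperand>. A minimal standalone sketch of why ArrayRef is the more permissive parameter type; countImms and the int payload are illustrative stand-ins, not LLVM APIs:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <vector>

using namespace llvm;

// Stand-in for a hook that only reads its condition operands.
static unsigned countImms(ArrayRef<int> Cond) {
  return static_cast<unsigned>(Cond.size());
}

int main() {
  SmallVector<int, 4> SV;          // what the old signature required
  SV.push_back(1);
  SV.push_back(2);
  std::vector<int> V = {1, 2, 3};  // now equally acceptable
  int Single = 7;                  // so is a single element

  // All three bind to the same ArrayRef parameter without copies.
  return (countImms(SV) + countImms(V) + countImms(Single) == 6) ? 0 : 1;
}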
TargetLoweringObjectFile::~TargetLoweringObjectFile() { diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp index 2824250..0b05303 100644 --- a/contrib/llvm/lib/Target/TargetMachine.cpp +++ b/contrib/llvm/lib/Target/TargetMachine.cpp @@ -38,7 +38,7 @@ using namespace llvm; // TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, - StringRef TT, StringRef CPU, StringRef FS, + const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options) : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), CodeGenInfo(nullptr), AsmInfo(nullptr), MRI(nullptr), @@ -70,7 +70,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { RESET_OPTION(UnsafeFPMath, "unsafe-fp-math"); RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); - RESET_OPTION(DisableTailCalls, "disable-tail-calls"); } /// getRelocationModel - Returns the code generation relocation model. The diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp index 623b3e8..7199235 100644 --- a/contrib/llvm/lib/Target/TargetMachineC.cpp +++ b/contrib/llvm/lib/Target/TargetMachineC.cpp @@ -156,7 +156,7 @@ LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) { } char* LLVMGetTargetMachineTriple(LLVMTargetMachineRef T) { - std::string StringRep = unwrap(T)->getTargetTriple(); + std::string StringRep = unwrap(T)->getTargetTriple().str(); return strdup(StringRep.c_str()); } diff --git a/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp b/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp index b2bb59e..87df7af 100644 --- a/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp +++ b/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp @@ -40,7 +40,7 @@ bool TargetSubtargetInfo::enableRALocalReassignment( return true; } -bool TargetSubtargetInfo::enablePostMachineScheduler() const { +bool TargetSubtargetInfo::enablePostRAScheduler() const { return getSchedModel().PostRAScheduler; } diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 9eee4a0..6ba897b 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -1080,4 +1080,4 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, return new X86AsmInstrumentation(STI); } -} // End llvm namespace +} // namespace llvm diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 19ebcc4..341fc81 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -61,6 +61,6 @@ protected: unsigned InitialFrameReg; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index e896571..418f043 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -42,15 +42,16 @@ namespace { static const char OpPrecedence[] = { 0, // IC_OR - 1, // IC_AND - 2, // IC_LSHIFT - 2, // IC_RSHIFT - 3, // IC_PLUS - 3, // IC_MINUS - 4, // IC_MULTIPLY - 4, // IC_DIVIDE - 5, // IC_RPAREN - 6, // IC_LPAREN + 1, // IC_XOR + 2, // IC_AND + 3, // IC_LSHIFT + 3, // IC_RSHIFT + 4, // IC_PLUS + 4, 
// IC_MINUS + 5, // IC_MULTIPLY + 5, // IC_DIVIDE + 6, // IC_RPAREN + 7, // IC_LPAREN 0, // IC_IMM 0 // IC_REGISTER }; @@ -70,6 +71,7 @@ private: enum InfixCalculatorTok { IC_OR = 0, + IC_XOR, IC_AND, IC_LSHIFT, IC_RSHIFT, @@ -204,6 +206,12 @@ private: Val = Op1.second | Op2.second; OperandStack.push_back(std::make_pair(IC_IMM, Val)); break; + case IC_XOR: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Xor operation with an immediate and a register!"); + Val = Op1.second ^ Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; case IC_AND: assert (Op1.first == IC_IMM && Op2.first == IC_IMM && "And operation with an immediate and a register!"); @@ -232,6 +240,7 @@ private: enum IntelExprState { IES_OR, + IES_XOR, IES_AND, IES_LSHIFT, IES_RSHIFT, @@ -297,6 +306,21 @@ private: } PrevState = CurrState; } + void onXor() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_XOR; + IC.pushOperator(IC_XOR); + break; + } + PrevState = CurrState; + } void onAnd() { IntelExprState CurrState = State; switch (State) { @@ -473,6 +497,7 @@ private: case IES_MINUS: case IES_NOT: case IES_OR: + case IES_XOR: case IES_AND: case IES_LSHIFT: case IES_RSHIFT: @@ -496,7 +521,7 @@ private: PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT) && + PrevState == IES_NOT || PrevState == IES_XOR) && CurrState == IES_MINUS) { // Unary minus. No need to pop the minus operand because it was never // pushed. @@ -506,7 +531,7 @@ private: PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT) && + PrevState == IES_NOT || PrevState == IES_XOR) && CurrState == IES_NOT) { // Unary not. No need to pop the not operand because it was never // pushed. 
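The X86AsmParser hunks above teach the Intel-syntax expression state machine a '^' (XOR) operator, slotted between '|' and '&' in the OpPrecedence table, with matching IC_XOR/IES_XOR states and AsmToken::Caret handling. As a sanity check of that ordering, here is a small standalone evaluator with the same precedence ranks ('|' < '^' < '&' < shifts < '+'/'-' < '*'/'/'); it is an illustrative re-implementation, not the parser's actual stack-based InfixCalculator:

#include <cassert>
#include <cctype>
#include <cstdint>
#include <string>

namespace {
struct Parser {
  const std::string &S;
  size_t Pos = 0;
  explicit Parser(const std::string &Str) : S(Str) {}

  void skipSpace() {
    while (Pos < S.size() && std::isspace(static_cast<unsigned char>(S[Pos])))
      ++Pos;
  }
  bool eat(const char *Tok) {
    skipSpace();
    size_t Len = std::char_traits<char>::length(Tok);
    if (S.compare(Pos, Len, Tok) == 0) {
      Pos += Len;
      return true;
    }
    return false;
  }
  int64_t parsePrimary() {
    skipSpace();
    if (eat("(")) { int64_t V = parseOr(); eat(")"); return V; }
    int64_t V = 0;
    while (Pos < S.size() && std::isdigit(static_cast<unsigned char>(S[Pos])))
      V = V * 10 + (S[Pos++] - '0');
    return V;
  }
  // One function per precedence tier; tighter-binding tiers are parsed deeper.
  int64_t parseMul() {
    int64_t V = parsePrimary();
    for (;;) {
      if (eat("*")) V *= parsePrimary();
      else if (eat("/")) V /= parsePrimary();
      else return V;
    }
  }
  int64_t parseAdd() {
    int64_t V = parseMul();
    for (;;) {
      if (eat("+")) V += parseMul();
      else if (eat("-")) V -= parseMul();
      else return V;
    }
  }
  int64_t parseShift() {
    int64_t V = parseAdd();
    for (;;) {
      if (eat("<<")) V <<= parseAdd();
      else if (eat(">>")) V >>= parseAdd();
      else return V;
    }
  }
  int64_t parseAnd() { int64_t V = parseShift(); while (eat("&")) V &= parseShift(); return V; }
  int64_t parseXor() { int64_t V = parseAnd();   while (eat("^")) V ^= parseAnd();   return V; } // '^' sits between '&' and '|'
  int64_t parseOr()  { int64_t V = parseXor();   while (eat("|")) V |= parseXor();   return V; }
};

int64_t eval(const std::string &Expr) { return Parser(Expr).parseOr(); }
} // namespace

int main() {
  // '&' evaluates first, then '^', then '|'.
  assert(eval("8 | 1 ^ 3 & 6") == (8 | (1 ^ (3 & 6))));
  // Shifts bind less tightly than '+' and '*', per the table above.
  assert(eval("1 + 2 << 2 * 2") == ((1 + 2) << (2 * 2)));
  return 0;
}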
@@ -593,6 +618,7 @@ private: case IES_MINUS: case IES_NOT: case IES_OR: + case IES_XOR: case IES_AND: case IES_LSHIFT: case IES_RSHIFT: @@ -605,7 +631,7 @@ private: PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT) && + PrevState == IES_NOT || PrevState == IES_XOR) && (CurrState == IES_MINUS || CurrState == IES_NOT)) { State = IES_ERROR; break; @@ -1217,6 +1243,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::Star: SM.onStar(); break; case AsmToken::Slash: SM.onDivide(); break; case AsmToken::Pipe: SM.onOr(); break; + case AsmToken::Caret: SM.onXor(); break; case AsmToken::Amp: SM.onAnd(); break; case AsmToken::LessLess: SM.onLShift(); break; diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 6e99c37..5b53fbe 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -69,7 +69,7 @@ namespace X86 { extern Target TheX86_32Target, TheX86_64Target; -} +} // namespace llvm static bool translateInstruction(MCInst &target, InternalInstruction &source, diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 62b6b73..ac484f3 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -140,6 +140,6 @@ public: private: bool HasCustomInstComment; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6e371da..2bee518 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -159,6 +159,6 @@ public: } }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 1ac656d..2d85f84 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -426,7 +426,7 @@ namespace CU { UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF }; -} // end CU namespace +} // namespace CU class DarwinX86AsmBackend : public X86AsmBackend { const MCRegisterInfo &MRI; @@ -790,10 +790,8 @@ public: MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, + const Triple &TheTriple, StringRef CPU) { - Triple TheTriple(TT); - if (TheTriple.isOSBinFormatMachO()) return new DarwinX86_32AsmBackend(T, MRI, CPU); @@ -806,10 +804,8 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, + const Triple &TheTriple, StringRef CPU) { - Triple TheTriple(TT); - if (TheTriple.isOSBinFormatMachO()) { MachO::CPUSubTypeX86 CS = StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 85b0006..69e9c7b 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -41,7 +41,7 @@ 
namespace X86 { /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; -} // end namespace X86; +} // namespace X86 /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. @@ -271,7 +271,7 @@ namespace X86II { /// register DI/EDI/ESI. RawFrmDst = 9, - /// RawFrmSrc - This form is for instructions that use the the source index + /// RawFrmSrc - This form is for instructions that use the source index /// register SI/ESI/ERI with a possible segment override, and also the /// destination index register DI/ESI/RDI. RawFrmDstSrc = 10, @@ -762,8 +762,8 @@ namespace X86II { return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } -} +} // namespace X86II -} // end namespace llvm; +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index a33468d..512afeb 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -28,7 +28,7 @@ namespace { unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} +} // namespace X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 2943dd3..7c09e5d 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -32,7 +32,8 @@ public: StringRef SymName; SymI->getName(SymName); uint64_t SymAddr; SymI->getAddress(SymAddr); uint64_t SymSize = SymI->getSize(); - int64_t Addend; getELFRelocationAddend(Rel, Addend); + auto *Obj = cast<ELFObjectFileBase>(Rel.getObjectFile()); + int64_t Addend = *Obj->getRelocationAddend(Rel.getRawDataRefImpl()); MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. 
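The signature churn that dominates this diff -- createSparcAsmBackend, createSystemZMCAsmBackend, createX86_32/64AsmBackend, the *MCSubtargetInfo factories and the Subtarget/TargetMachine constructors -- is the same change everywhere: the triple arrives as a pre-parsed const Triple& instead of a StringRef that each callee re-parsed with Triple(TT). A minimal sketch of the before/after shape, assuming an LLVM tree of this vintage to build against; pickCPUOld and pickCPUNew are made-up names, not LLVM APIs:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include <string>

using namespace llvm;

// Old shape: every callee pays for (and duplicates) the string parse.
static std::string pickCPUOld(StringRef TT, StringRef CPU) {
  Triple TheTriple(TT);                 // re-parse on every call
  if (CPU.empty())
    return TheTriple.getArch() == Triple::sparcv9 ? "v9" : "v8";
  return CPU.str();
}

// New shape: the caller (the TargetRegistry boundary) parses once and the
// structured Triple is threaded down by const reference.
static std::string pickCPUNew(const Triple &TT, StringRef CPU) {
  if (CPU.empty())
    return TT.getArch() == Triple::sparcv9 ? "v9" : "v8";
  return CPU.str();
}

int main() {
  Triple TT("sparcv9-unknown-linux-gnu");                        // parsed exactly once
  return pickCPUOld(TT.str(), "") == pickCPUNew(TT, "") ? 0 : 1; // both yield "v9"
}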
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 4899900..a523a32 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -28,7 +28,7 @@ enum Fixups { LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; -} -} +} // namespace X86 +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index cc98e55..431010d 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -42,12 +42,11 @@ using namespace llvm; #define GET_SUBTARGETINFO_MC_DESC #include "X86GenSubtargetInfo.inc" -std::string X86_MC::ParseX86Triple(StringRef TT) { - Triple TheTriple(TT); +std::string X86_MC::ParseX86Triple(const Triple &TT) { std::string FS; - if (TheTriple.getArch() == Triple::x86_64) + if (TT.getArch() == Triple::x86_64) FS = "+64bit-mode,-32bit-mode,-16bit-mode"; - else if (TheTriple.getEnvironment() != Triple::CODE16) + else if (TT.getEnvironment() != Triple::CODE16) FS = "-64bit-mode,+32bit-mode,-16bit-mode"; else FS = "-64bit-mode,-32bit-mode,+16bit-mode"; @@ -55,7 +54,7 @@ std::string X86_MC::ParseX86Triple(StringRef TT) { return FS; } -unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) { +unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) { if (TT.getArch() == Triple::x86_64) return DWARFFlavour::X86_64; @@ -75,8 +74,8 @@ void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) { } } -MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { +MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { std::string ArchFS = X86_MC::ParseX86Triple(TT); if (!FS.empty()) { if (!ArchFS.empty()) @@ -219,15 +218,14 @@ static MCInstPrinter *createX86MCInstPrinter(const Triple &T, return nullptr; } -static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT, +static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple, MCContext &Ctx) { - Triple TheTriple(TT); if (TheTriple.isOSBinFormatMachO() && TheTriple.getArch() == Triple::x86_64) return createX86_64MachORelocationInfo(Ctx); else if (TheTriple.isOSBinFormatELF()) return createX86_64ELFRelocationInfo(Ctx); // Default to the stock relocation info. - return llvm::createMCRelocationInfo(TT, Ctx); + return llvm::createMCRelocationInfo(TheTriple, Ctx); } static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index dcdae1d..020803b 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -52,26 +52,26 @@ namespace N86 { } namespace X86_MC { - std::string ParseX86Triple(StringRef TT); +std::string ParseX86Triple(const Triple &TT); - unsigned getDwarfRegFlavour(Triple TT, bool isEH); +unsigned getDwarfRegFlavour(const Triple &TT, bool isEH); - void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); +void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); - /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. - /// do not need to go through TargetRegistry. 
- MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS); -} +/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. +/// do not need to go through TargetRegistry. +MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS); +} // namespace X86_MC MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); /// Construct an X86 Windows COFF machine code streamer which will generate /// PE/COFF format object files. @@ -98,7 +98,7 @@ MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); /// Construct X86-64 ELF relocation info. MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); -} // End llvm namespace +} // namespace llvm // Defines symbolic names for X86 registers. This defines a mapping from diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 95acc07..773fbf4 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -69,7 +69,7 @@ public: FixedValue); } }; -} +} // namespace static bool isFixupKindRIPRel(unsigned Kind) { return Kind == X86::reloc_riprel_4byte || @@ -205,7 +205,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( if (Symbol->isTemporary() && Value) { const MCSection &Sec = Symbol->getSection(); if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) - Asm.addLocalUsedInReloc(*Symbol); + Symbol->setUsedInReloc(); } RelSymbol = Asm.getAtom(*Symbol); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index bd1bc99..7d262cd 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -31,7 +31,7 @@ namespace { bool IsCrossSection, const MCAsmBackend &MAB) const override; }; -} +} // namespace X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 92f42b6..dc6dd66 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -46,7 +46,7 @@ void X86WinCOFFStreamer::FinishImpl() { MCWinCOFFStreamer::FinishImpl(); } -} +} // namespace MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index ef3318b..1e7d942 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -431,4 +431,4 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { for (unsigned i = 1; i < NumElts; i++) Mask.push_back(IsLoad ? 
static_cast<int>(SM_SentinelZero) : i); } -} // llvm namespace +} // namespace llvm diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 14b6943..0139297 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -100,6 +100,6 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a scalar float move instruction as a shuffle mask. void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &ShuffleMask); -} // llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 8403ae6..80f4579 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -80,6 +80,6 @@ FunctionPass *createX86WinEHStatePass(); /// must run after prologue/epilogue insertion and before lowering /// the MachineInstr to MC. FunctionPass *createX86ExpandPseudoPass(); -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index 64fc6d0..2051401 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -511,7 +511,7 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { - Triple TT(TM.getTargetTriple()); + const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatMachO()) OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); @@ -585,7 +585,7 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { SmallString<128> Directive; raw_svector_ostream OS(Directive); StringRef Name = Sym->getName(); - Triple TT(TM.getTargetTriple()); + const Triple &TT = TM.getTargetTriple(); if (TT.isKnownWindowsMSVCEnvironment()) OS << " /EXPORT:"; @@ -610,7 +610,7 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { } void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { - Triple TT(TM.getTargetTriple()); + const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatMachO()) { // All darwin targets use mach-o. @@ -674,6 +674,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } SM.serializeToStackMapSection(); + FM.serializeToFaultMapSection(); // Funny Darwin hack: This flag tells the linker that no global symbols // contain code that falls through to other global symbols (e.g. 
the obvious @@ -726,8 +727,10 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } } - if (TT.isOSBinFormatELF()) + if (TT.isOSBinFormatELF()) { SM.serializeToStackMapSection(); + FM.serializeToFaultMapSection(); + } } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index 3beeb17..acba211 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -12,6 +12,7 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/FaultMaps.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/Target/TargetMachine.h" @@ -27,6 +28,7 @@ class MCSymbol; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget; StackMaps SM; + FaultMaps FM; void GenerateExportDirective(const MCSymbol *Sym, bool IsData); @@ -83,13 +85,15 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI); public: explicit X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)), SM(*this), SMShadowTracker(TM) {} + : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this), + SMShadowTracker(TM) {} const char *getPassName() const override { return "X86 Assembly / Object Emitter"; diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 4412125..6d6831b 100644 --- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -99,7 +99,7 @@ private: }; char X86CallFrameOptimization::ID = 0; -} +} // namespace FunctionPass *llvm::createX86CallFrameOptimization() { return new X86CallFrameOptimization(); diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index 0eb2494..a377eb6 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -42,7 +42,7 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 1b00997..6a5a28e 100644 --- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -84,19 +84,9 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, int StackAdj = StackAdjust.getImm(); if (StackAdj) { - bool Is64Bit = STI->is64Bit(); - // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. - const bool Uses64BitFramePtr = - STI->isTarget64BitLP64() || STI->isTargetNaCl64(); - // Check if we should use LEA for SP. - bool UseLEAForSP = STI->useLeaForSP() && - X86FL->canUseLEAForSPInEpilogue(*MBB.getParent()); - unsigned StackPtr = TRI->getStackRegister(); // Check for possible merge with preceding ADD instruction. 
- StackAdj += X86FrameLowering::mergeSPUpdates(MBB, MBBI, StackPtr, true); - X86FrameLowering::emitSPUpdate(MBB, MBBI, StackPtr, StackAdj, Is64Bit, - Uses64BitFramePtr, UseLEAForSP, *TII, - *TRI); + StackAdj += X86FL->mergeSPUpdates(MBB, MBBI, true); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true); } // Jump to label or value in register. diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index b39c5ab..8305a04 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -44,7 +44,7 @@ class FixupLEAPass : public MachineFunctionPass { /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, /// try to replace it with an equivalent LEA instruction. - /// If replacement succeeds, then also process the the newly created + /// If replacement succeeds, then also process the newly created /// instruction. void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); @@ -91,7 +91,7 @@ private: const X86InstrInfo *TII; // Machine instruction info. }; char FixupLEAPass::ID = 0; -} +} // namespace MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 3b0bd03..6f1d8e5 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -279,7 +279,7 @@ namespace { void setKillFlags(MachineBasicBlock &MBB) const; }; char FPS::ID = 0; -} +} // namespace FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } @@ -544,7 +544,7 @@ namespace { return V < TE.from; } }; -} +} // namespace #ifndef NDEBUG static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index db58d9c..85c5b64 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -37,6 +37,20 @@ using namespace llvm; // FIXME: completely move here. extern cl::opt<bool> ForceStackAlign; +X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, + unsigned StackAlignOverride) + : TargetFrameLowering(StackGrowsDown, StackAlignOverride, + STI.is64Bit() ? -8 : -4), + STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { + // Cache a bunch of frame-related predicates for this subtarget. + SlotSize = TRI->getSlotSize(); + Is64Bit = STI.is64Bit(); + IsLP64 = STI.isTarget64BitLP64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + StackPtr = TRI->getStackRegister(); +} + bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects() && !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); @@ -48,11 +62,9 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { /// Use a more nuanced condition. 
bool X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { - const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *> - (MF.getSubtarget().getRegisterInfo()); return hasReservedCallFrame(MF) || - (hasFP(MF) && !TRI->needsStackRealignment(MF)) - || TRI->hasBasePointer(MF); + (hasFP(MF) && !TRI->needsStackRealignment(MF)) || + TRI->hasBasePointer(MF); } // needsFrameIndexResolution - Do we need to perform FI resolution for @@ -74,10 +86,9 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || - RegInfo->needsStackRealignment(MF) || + TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || @@ -137,7 +148,7 @@ static unsigned getLEArOpcode(unsigned IsLP64) { /// to this register without worry about clobbering it. static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const TargetRegisterInfo &TRI, + const TargetRegisterInfo *TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); @@ -176,7 +187,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, unsigned Reg = MO.getReg(); if (!Reg) continue; - for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) Uses.insert(*AI); } @@ -203,23 +214,36 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } +/// Check whether or not the terminators of \p MBB needs to read EFLAGS. +static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { + for (const MachineInstr &MI : MBB.terminators()) { + bool BreakNext = false; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg != X86::EFLAGS) + continue; + + // This terminator needs an eflag that is not defined + // by a previous terminator. + if (!MO.isDef()) + return true; + BreakNext = true; + } + if (BreakNext) + break; + } + return false; +} + /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, int64_t NumBytes, - bool Is64BitTarget, bool Is64BitStackPtr, - bool UseLEA, const TargetInstrInfo &TII, - const TargetRegisterInfo &TRI) { + int64_t NumBytes, bool InEpilogue) const { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; - unsigned Opc; - if (UseLEA) - Opc = getLEArOpcode(Is64BitStackPtr); - else - Opc = isSub - ? getSUBriOpcode(Is64BitStackPtr, Offset) - : getADDriOpcode(Is64BitStackPtr, Offset); uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); @@ -231,17 +255,17 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, unsigned Reg = 0; if (isSub && !isEAXLiveIn(*MBB.getParent())) - Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX); + Reg = (unsigned)(Is64Bit ? 
X86::RAX : X86::EAX); else - Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); + Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); if (Reg) { - Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri; + unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) .addImm(Offset); Opc = isSub - ? getSUBrrOpcode(Is64BitTarget) - : getADDrrOpcode(Is64BitTarget); + ? getSUBrrOpcode(Is64Bit) + : getADDrrOpcode(Is64Bit); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addReg(Reg); @@ -252,15 +276,15 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, } uint64_t ThisVal = std::min(Offset, Chunk); - if (ThisVal == (Is64BitTarget ? 8 : 4)) { + if (ThisVal == (Is64Bit ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub - ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX) - : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); + ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) + : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); if (Reg) { - Opc = isSub - ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r) - : (Is64BitTarget ? X86::POP64r : X86::POP32r); + unsigned Opc = isSub + ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) + : (Is64Bit ? X86::POP64r : X86::POP32r); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) @@ -270,25 +294,59 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, } } - MachineInstr *MI = nullptr; - - if (UseLEA) { - MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), - StackPtr, false, isSub ? -ThisVal : ThisVal); - } else { - MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr) - .addImm(ThisVal); - MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. - } - + MachineInstrBuilder MI = BuildStackAdjustment( + MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue); if (isSub) - MI->setFlag(MachineInstr::FrameSetup); + MI.setMIFlag(MachineInstr::FrameSetup); Offset -= ThisVal; } } +MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, + int64_t Offset, bool InEpilogue) const { + assert(Offset != 0 && "zero offset stack adjustment requested"); + + // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue + // is tricky. + bool UseLEA; + if (!InEpilogue) { + UseLEA = STI.useLeaForSP(); + } else { + // If we can use LEA for SP but we shouldn't, check that none + // of the terminators uses the eflags. Otherwise we will insert + // a ADD that will redefine the eflags and break the condition. + // Alternatively, we could move the ADD, but this may not be possible + // and is an optimization anyway. + UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); + if (UseLEA && !STI.useLeaForSP()) + UseLEA = terminatorsNeedFlagsAsInput(MBB); + // If that assert breaks, that means we do not do the right thing + // in canUseAsEpilogue. + assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) && + "We shouldn't have allowed this insertion point"); + } + + MachineInstrBuilder MI; + if (UseLEA) { + MI = addRegOffset(BuildMI(MBB, MBBI, DL, + TII.get(getLEArOpcode(Uses64BitFramePtr)), + StackPtr), + StackPtr, false, Offset); + } else { + bool IsSub = Offset < 0; + uint64_t AbsOffset = IsSub ? -Offset : Offset; + unsigned Opc = IsSub ? 
getSUBriOpcode(Uses64BitFramePtr, AbsOffset) + : getADDriOpcode(Uses64BitFramePtr, AbsOffset); + MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(AbsOffset); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + return MI; +} + /// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. static void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, @@ -315,8 +373,7 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, - bool doMergeWithPrevious) { + bool doMergeWithPrevious) const { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; @@ -345,6 +402,15 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, return Offset; } +void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + MCCFIInstruction CFIInst) const { + MachineFunction &MF = *MBB.getParent(); + unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); +} + void X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -353,7 +419,6 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); @@ -366,11 +431,8 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, unsigned Reg = I->getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = - MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, - Offset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); } } @@ -394,10 +456,7 @@ static bool usesTheStack(const MachineFunction &MF) { void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL) { - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); - bool Is64Bit = STI.is64Bit(); + DebugLoc DL) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; unsigned CallOp; @@ -463,13 +522,10 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) { // info, we need to know the ABI stack alignment as well in case we // have a call out. Otherwise just make sure we have some alignment - we'll // go with the minimum SlotSize. -static uint64_t calculateMaxStackAlign(const MachineFunction &MF) { +uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. 
- const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); - unsigned SlotSize = RegInfo->getSlotSize(); - unsigned StackAlign = STI.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = getStackAlignment(); if (ForceStackAlign) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; @@ -479,6 +535,22 @@ static uint64_t calculateMaxStackAlign(const MachineFunction &MF) { return MaxAlign; } +void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + uint64_t MaxAlign) const { + uint64_t Val = -MaxAlign; + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), + StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); +} + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to @@ -565,40 +637,32 @@ static uint64_t calculateMaxStackAlign(const MachineFunction &MF) { void X86FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { + assert(&STI == &MF.getSubtarget<X86Subtarget>() && + "MF used frame lowering for wrong subtarget"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); - bool Is64Bit = STI.is64Bit(); - // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. - const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); - bool IsWin64 = STI.isCallingConvWin64(Fn->getCallingConv()); - // Not necessarily synonymous with IsWin64. - bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); + bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry(); bool NeedsDwarfCFI = - !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); - bool UseLEA = STI.useLeaForSP(); - unsigned SlotSize = RegInfo->getSlotSize(); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + unsigned FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; - unsigned StackPtr = RegInfo->getStackRegister(); - unsigned BasePtr = RegInfo->getBaseRegister(); + unsigned BasePtr = TRI->getBaseRegister(); DebugLoc DL; // Add RETADDR move area to callee saved frame size. 
int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); - if (TailCallReturnAddrDelta && IsWinEH) + if (TailCallReturnAddrDelta && IsWin64Prologue) report_fatal_error("Can't handle guaranteed tail call under win64 yet"); if (TailCallReturnAddrDelta < 0) @@ -621,10 +685,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && - !RegInfo->needsStackRealignment(MF) && + !TRI->needsStackRealignment(MF) && !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. - !IsWin64 && // Win64 has no Red Zone + !IsWin64CC && // Win64 has no Red Zone !usesTheStack(MF) && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); @@ -637,14 +701,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // applies to tail call optimized functions where the callee argument stack // size is bigger than the callers. if (TailCallReturnAddrDelta < 0) { - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, - TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)), - StackPtr) - .addReg(StackPtr) - .addImm(-TailCallReturnAddrDelta) + BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta, + /*InEpilogue=*/false) .setMIFlag(MachineInstr::FrameSetup); - MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } // Mapping for machine moves: @@ -674,7 +733,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); // Callee-saved registers are pushed on stack before the stack is realigned. - if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) NumBytes = RoundUpToAlignment(NumBytes, MaxAlign); // Get the offset of the stack slot for the EBP register, which is @@ -691,27 +750,22 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); // Change the rule for the FramePtr to be an "offset" rule. - unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, - DwarfFramePtr, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( + nullptr, DwarfFramePtr, 2 * stackGrowth)); } - if (NeedsWinEH) { + if (NeedsWinCFI) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } - if (!IsWinEH) { + if (!IsWin64Prologue) { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), @@ -723,11 +777,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. 
- unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); } // Mark the FramePtr as live-in in every block. @@ -752,14 +804,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); StackOffset += stackGrowth; } - if (NeedsWinEH) { + if (NeedsWinCFI) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( MachineInstr::FrameSetup); } @@ -768,24 +818,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Don't do this for Win64, it needs to realign the stack after the prologue. - if (!IsWinEH && RegInfo->needsStackRealignment(MF)) { + if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - uint64_t Val = -MaxAlign; - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), - StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); - - // The EFLAGS implicit def is dead. - MI->getOperand(3).setIsDead(); + BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); } // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and // the callee has more arguments then the caller. - NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); + NumBytes -= mergeSPUpdates(MBB, MBBI, true); // Adjust stack pointer: ESP -= numbytes. @@ -798,7 +839,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. uint64_t AlignedNumBytes = NumBytes; - if (IsWinEH && RegInfo->needsStackRealignment(MF)) + if (IsWin64Prologue && TRI->needsStackRealignment(MF)) AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. 
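The comments above describe why a large Windows frame cannot simply move the stack pointer in one step: pages are only committed as the guard page is touched. A minimal sketch of that idea, for illustration only (the helper name and the 4096-byte page size are assumptions, not the probe sequence the backend actually emits):

static void probeStackPages(char *OldSP, unsigned long long FrameSize) {
  const unsigned long long PageSize = 4096; // assumed guard-page granularity
  // Touch one byte in every page between the old and the new stack pointer so
  // the OS guard page is hit in order and each page is committed before use.
  for (unsigned long long Off = PageSize; Off <= FrameSize; Off += PageSize)
    *(volatile char *)(OldSP - Off) = 0;
}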
@@ -859,17 +900,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, MBB.insert(MBBI, MI); } } else if (NumBytes) { - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr, - UseLEA, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false); } - if (NeedsWinEH && NumBytes) + if (NeedsWinCFI && NumBytes) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); int SEHFrameOffset = 0; - if (IsWinEH && HasFP) { + if (IsWin64Prologue && HasFP) { SEHFrameOffset = calculateSetFPREG(NumBytes); if (SEHFrameOffset) addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), @@ -877,7 +917,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, else BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); - if (NeedsWinEH) + if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) @@ -888,7 +928,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, const MachineInstr *FrameInstr = &*MBBI; ++MBBI; - if (NeedsWinEH) { + if (NeedsWinCFI) { int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { @@ -904,32 +944,23 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } } - if (NeedsWinEH) + if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); // Realign stack after we spilled callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Win64 requires aligning the stack after the prologue. - if (IsWinEH && RegInfo->needsStackRealignment(MF)) { + if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - uint64_t Val = -MaxAlign; - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), - StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); - - // The EFLAGS implicit def is dead. - MI->getOperand(3).setIsDead(); + BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); } // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. - if (RegInfo->hasBasePointer(MF)) { + if (TRI->hasBasePointer(MF)) { // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) @@ -950,12 +981,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. assert(StackSize); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, - -StackSize + stackGrowth)); - - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( + nullptr, -StackSize + stackGrowth)); } // Emit DWARF info specifying the offsets of the callee-saved registers. @@ -975,65 +1002,24 @@ bool X86FrameLowering::canUseLEAForSPInEpilogue( return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); } -/// Check whether or not the terminators of \p MBB needs to read EFLAGS. 
-static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { - for (const MachineInstr &MI : MBB.terminators()) { - bool BreakNext = false; - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (Reg != X86::EFLAGS) - continue; - - // This terminator needs an eflag that is not defined - // by a previous terminator. - if (!MO.isDef()) - return true; - BreakNext = true; - } - if (BreakNext) - break; - } - return false; -} - void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); - bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. - const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); const bool Is64BitILP32 = STI.isTarget64BitILP32(); - unsigned SlotSize = RegInfo->getSlotSize(); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; - unsigned StackPtr = RegInfo->getStackRegister(); - - bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); - bool UseLEAForSP = canUseLEAForSPInEpilogue(MF); - // If we can use LEA for SP but we shouldn't, check that none - // of the terminators uses the eflags. Otherwise we will insert - // a ADD that will redefine the eflags and break the condition. - // Alternatively, we could move the ADD, but this may not be possible - // and is an optimization anyway. - if (UseLEAForSP && !MF.getSubtarget<X86Subtarget>().useLeaForSP()) - UseLEAForSP = terminatorsNeedFlagsAsInput(MBB); - // If that assert breaks, that means we do not do the right thing - // in canUseAsEpilogue. - assert((UseLEAForSP || !terminatorsNeedFlagsAsInput(MBB)) && - "We shouldn't have allowed this insertion point"); + + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinCFI = + IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); @@ -1048,7 +1034,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Callee-saved registers were pushed on stack before the stack was // realigned. - if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) NumBytes = RoundUpToAlignment(FrameSize, MaxAlign); // Pop EBP. @@ -1083,11 +1069,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was // realigned. 
- if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { - if (RegInfo->needsStackRealignment(MF)) + if (TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); - uint64_t LEAAmount = IsWinEH ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; + uint64_t LEAAmount = + IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; // There are only two legal forms of epilogue: // - add SEHAllocationSize, %rsp @@ -1109,8 +1096,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, - UseLEAForSP, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true); --MBBI; } @@ -1120,7 +1106,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // into the epilogue. To cope with that, we insert an epilogue marker here, // then replace it with a 'nop' if it ends up immediately after a CALL in the // final emitted code. - if (NeedsWinEH) + if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); // Add the return addr area delta back since we are not tail calling. @@ -1130,16 +1116,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MBBI = MBB.getFirstTerminator(); // Check for possible merge with preceding ADD instruction. - Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, - UseLEAForSP, TII, *RegInfo); + Offset += mergeSPUpdates(MBB, MBBI, true); + emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true); } } int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { - const X86RegisterInfo *RegInfo = - MF.getSubtarget<X86Subtarget>().getRegisterInfo(); const MachineFrameInfo *MFI = MF.getFrameInfo(); // Offset will hold the offset from the stack pointer at function entry to the // object. @@ -1149,12 +1132,11 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI->getStackSize(); - unsigned SlotSize = RegInfo->getSlotSize(); bool HasFP = hasFP(MF); - bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int64_t FPDelta = 0; - if (IsWinEH) { + if (IsWin64Prologue) { assert(!MFI->hasCalls() || (StackSize % 16) == 8); // Calculate required stack adjustment. @@ -1178,7 +1160,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, } - if (RegInfo->hasBasePointer(MF)) { + if (TRI->hasBasePointer(MF)) { assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. @@ -1187,7 +1169,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } - } else if (RegInfo->needsStackRealignment(MF)) { + } else if (TRI->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. 
return Offset + SlotSize + FPDelta; @@ -1214,17 +1196,15 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const X86RegisterInfo *RegInfo = - MF.getSubtarget<X86Subtarget>().getRegisterInfo(); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. - if (RegInfo->hasBasePointer(MF)) - FrameReg = RegInfo->getBaseRegister(); - else if (RegInfo->needsStackRealignment(MF)) - FrameReg = RegInfo->getStackRegister(); + if (TRI->hasBasePointer(MF)) + FrameReg = TRI->getBaseRegister(); + else if (TRI->needsStackRealignment(MF)) + FrameReg = TRI->getStackRegister(); else - FrameReg = RegInfo->getFrameRegister(MF); + FrameReg = TRI->getFrameRegister(MF); return getFrameIndexOffset(MF, FI); } @@ -1235,8 +1215,6 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F const uint64_t StackSize = MFI->getStackSize(); { #ifndef NDEBUG - const X86RegisterInfo *RegInfo = - MF.getSubtarget<X86Subtarget>().getRegisterInfo(); // Note: LLVM arranges the stack as: // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) // > "Stack Slots" (<--SP) @@ -1248,7 +1226,7 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. - assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); // We don't handle tail calls, and shouldn't be seeing them // either. @@ -1293,11 +1271,9 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const X86RegisterInfo *RegInfo = - MF.getSubtarget<X86Subtarget>().getRegisterInfo(); - assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - FrameReg = RegInfo->getStackRegister(); + FrameReg = TRI->getStackRegister(); return getFrameIndexOffsetFromSP(MF, FI); } @@ -1305,9 +1281,6 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86RegisterInfo *RegInfo = - MF.getSubtarget<X86Subtarget>().getRegisterInfo(); - unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CalleeSavedFrameSize = 0; @@ -1321,7 +1294,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. 
- unsigned FPReg = RegInfo->getFrameRegister(MF); + unsigned FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); @@ -1352,7 +1325,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; - const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); // ensure alignment SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); // spill into slot @@ -1372,10 +1345,6 @@ bool X86FrameLowering::spillCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); - MachineFunction &MF = *MBB.getParent(); - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); - // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { @@ -1419,10 +1388,6 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); - MachineFunction &MF = *MBB.getParent(); - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); - // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); @@ -1451,9 +1416,6 @@ void X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86RegisterInfo *RegInfo = - MF.getSubtarget<X86Subtarget>().getRegisterInfo(); - unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); @@ -1473,8 +1435,8 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } // Spill the BasePtr if it's used. 
- if (RegInfo->hasBasePointer(MF)) - MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); + if (TRI->hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(TRI->getBaseRegister()); } static bool @@ -1532,11 +1494,7 @@ static const uint64_t kSplitStackAvailable = 256; void X86FrameLowering::adjustForSegmentedStacks( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); uint64_t StackSize; - bool Is64Bit = STI.is64Bit(); - const bool IsLP64 = STI.isTarget64BitLP64(); unsigned TlsReg, TlsOffset; DebugLoc DL; @@ -1782,12 +1740,7 @@ void X86FrameLowering::adjustForSegmentedStacks( /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const unsigned SlotSize = STI.getRegisterInfo()->getSlotSize(); - const bool Is64Bit = STI.is64Bit(); - const bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL; // HiPE-specific values const unsigned HipeLeafWords = 24; @@ -1915,14 +1868,9 @@ void X86FrameLowering::adjustForHiPEPrologue( void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); - const X86RegisterInfo &RegInfo = *STI.getRegisterInfo(); - unsigned StackPtr = RegInfo.getStackRegister(); bool reserveCallFrame = hasReservedCallFrame(MF); unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); - bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; @@ -1941,54 +1889,29 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = getStackAlignment(); Amount = RoundUpToAlignment(Amount, StackAlign); - MachineInstr *New = nullptr; - // Factor out the amount that gets handled inside the sequence // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; if (Amount) { - if (Opcode == TII.getCallFrameSetupOpcode()) { - New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) - .addReg(StackPtr).addImm(Amount); - } else { - assert(Opcode == TII.getCallFrameDestroyOpcode()); - - unsigned Opc = getADDriOpcode(IsLP64, Amount); - New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(Amount); - } + // Add Amount to SP to destroy a frame, and subtract to setup. + int Offset = isDestroy ? Amount : -Amount; + BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); } - - if (New) { - // The EFLAGS implicit def is dead. - New->getOperand(3).setIsDead(); - - // Replace the pseudo instruction with a new instruction. - MBB.insert(I, New); - } - return; } - if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) { + if (isDestroy && InternalAmt) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. 
- unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt); - MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(InternalAmt); - - // The EFLAGS implicit def is dead. - New->getOperand(3).setIsDead(); - // We are not tracking the stack pointer adjustment by the callee, so make // sure we restore the stack pointer immediately after the call, there may // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. MachineBasicBlock::iterator B = MBB.begin(); while (I != B && !std::prev(I)->isCall()) --I; - MBB.insert(I, New); + BuildStackAdjustment(MBB, I, DL, -InternalAmt, /*InEpilogue=*/false); } } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 5d03b4d..2858e86 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -18,16 +18,40 @@ namespace llvm { +class MachineInstrBuilder; +class MCCFIInstruction; +class X86Subtarget; +class X86RegisterInfo; + class X86FrameLowering : public TargetFrameLowering { public: - explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) - : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} + X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride); + + // Cached subtarget predicates. + + const X86Subtarget &STI; + const TargetInstrInfo &TII; + const X86RegisterInfo *TRI; + + unsigned SlotSize; + + /// Is64Bit implies that x86_64 instructions are available. + bool Is64Bit; + + bool IsLP64; + + /// True if the 64-bit frame or stack pointer should be used. True for most + /// 64-bit targets with the exception of x32. If this is false, 32-bit + /// instruction operands should be used to manipulate StackPtr and FramePtr. + bool Uses64BitFramePtr; + + unsigned StackPtr; /// Emit a call to the target's stack probe function. This is required for all /// large stack allocations on Windows. The caller is required to materialize /// the number of bytes to probe in RAX/EAX. - static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL); + void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL) const; void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -83,18 +107,13 @@ public: /// it is an ADD/SUB/LEA instruction it is deleted argument and the /// stack adjustment is returned as a positive value for ADD/LEA and /// a negative for SUB. - static int mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, bool doMergeWithPrevious); + int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + bool doMergeWithPrevious) const; /// Emit a series of instructions to increment / decrement the stack /// pointer by a constant value. - static void emitSPUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, unsigned StackPtr, - int64_t NumBytes, bool Is64BitTarget, - bool Is64BitStackPtr, bool UseLEA, - const TargetInstrInfo &TII, - const TargetRegisterInfo &TRI); + void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + int64_t NumBytes, bool InEpilogue) const; /// Check that LEA can be used on SP in an epilogue sequence for \p MF. 
bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const; @@ -115,8 +134,25 @@ private: MachineBasicBlock &MBB, MachineBasicBlock::iterator I, uint64_t Amount) const; + + uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; + + /// Wraps up getting a CFI index and building a MachineInstr for it. + void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, MCCFIInstruction CFIInst) const; + + /// Aligns the stack pointer by ANDing it with -MaxAlign. + void BuildStackAlignAND(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + uint64_t MaxAlign) const; + + /// Adjusts the stack pointer using LEA, SUB, or ADD. + MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, int64_t Offset, + bool InEpilogue) const; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index de59109..f6785e1 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -138,7 +138,7 @@ namespace { } #endif }; -} +} // namespace namespace { //===--------------------------------------------------------------------===// @@ -310,7 +310,7 @@ namespace { return true; } }; -} +} // namespace bool diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index e3ec288..ce1ca20 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -915,6 +915,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); // As there is no 64-bit GPR available, we need build a special custom @@ -2233,7 +2235,9 @@ static bool IsCCallConvention(CallingConv::ID CC) { } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; CallSite CS(CI); @@ -2762,8 +2766,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); - if (MF.getTarget().Options.DisableTailCalls) + if (Attr.getValueAsString() == "true") isTailCall = false; if (Subtarget->isPICStyleGOT() && @@ -5441,7 +5446,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop -/// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1. +/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. 
/// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI @@ -6353,7 +6358,7 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for -/// shuffling 8 lanes. +/// shuffling 8 lanes. static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, SelectionDAG &DAG) { assert(Mask.size() <= 8 && @@ -9380,6 +9385,30 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, DAG.getConstant(PermMask, DL, MVT::i8)); } +/// \brief Handle lowering 4-lane 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> WidenedMask, + SelectionDAG &DAG) { + + assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!"); + // form a 128-bit permutation. + // convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if(WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // use first element in place of undef musk + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; + PermMask |= (Imm % 4) << (i * 2); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// @@ -10173,6 +10202,10 @@ static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + if(SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG)) + return Op; // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. 
if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) @@ -11023,9 +11056,8 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { if (Idx2->getZExtValue() == 0) { SDValue Ops[] = { SubVec2, SubVec }; - SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false); - if (LD.getNode()) - return LD; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; } } } @@ -11617,15 +11649,21 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (SrcVT.isVector()) { + if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getUNDEF(SrcVT))); + } if (SrcVT.getVectorElementType() == MVT::i1) { MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, - Op.getOperand(0))); + DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); } return SDValue(); } @@ -13018,11 +13056,11 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, RecipOp = "vec-sqrtf"; else return SDValue(); - + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; if (!Recips.isEnabled(RecipOp)) return SDValue(); - + RefinementSteps = Recips.getRefinementSteps(RecipOp); UseOneConstNR = false; return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); @@ -13035,7 +13073,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, unsigned &RefinementSteps) const { EVT VT = Op.getValueType(); const char *RecipOp; - + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). 
// It is likely not profitable to do this for f64 because a double-precision @@ -13050,7 +13088,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, RecipOp = "vec-divf"; else return SDValue(); - + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; if (!Recips.isEnabled(RecipOp)) return SDValue(); @@ -13236,13 +13274,13 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(-1, dl, VT)); switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: - // (x != y) -> ~(x ^ y) + case ISD::SETEQ: + // (x == y) -> ~(x ^ y) return DAG.getNode(ISD::XOR, dl, VT, DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), DAG.getConstant(-1, dl, VT)); - case ISD::SETEQ: - // (x == y) -> (x ^ y) + case ISD::SETNE: + // (x != y) -> (x ^ y) return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); case ISD::SETUGT: case ISD::SETGT: @@ -15107,7 +15145,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, RoundingMode), Mask, PassThru, Subtarget, DAG); @@ -15687,14 +15725,49 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } +static SDValue LowerEXCEPTIONINFO(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc dl(Op); + SDValue FnOp = Op.getOperand(2); + SDValue FPOp = Op.getOperand(3); + + // Compute the symbol for the parent EH registration. We know it'll get + // emitted later. + auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(FnOp)->getGlobal()); + MCSymbol *ParentFrameSym = + MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + StringRef Name = ParentFrameSym->getName(); + assert(Name.data()[Name.size()] == '\0' && "not null terminated"); + + // Create a TargetExternalSymbol for the label to avoid any target lowering + // that would make this PC relative. + MVT PtrVT = Op.getSimpleValueType(); + SDValue OffsetSym = DAG.getTargetExternalSymbol(Name.data(), PtrVT); + SDValue OffsetVal = + DAG.getNode(ISD::FRAME_ALLOC_RECOVER, dl, PtrVT, OffsetSym); + + // Add the offset to the FP. + SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, FPOp, OffsetVal); + + // Load the second field of the struct, which is 4 bytes in. See + // WinEHStatePass for more info. 
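// A minimal sketch of what the nodes below compute, assuming the
// registration-node layout described in WinEHStatePass (the helper name is
// hypothetical):
static void *parentExceptionInfo(char *ParentFP, long long ParentFrameOffset) {
  char *RegNode = ParentFP + ParentFrameOffset; // FRAME_ALLOC_RECOVER result
  return *(void **)(RegNode + 4);               // second field, 4 bytes in
}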
+ Add = DAG.getNode(ISD::ADD, dl, PtrVT, Add, DAG.getConstant(4, dl, PtrVT)); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Add, MachinePointerInfo(), + false, false, false, 0); +} static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); - if (!IntrData) + if (!IntrData) { + if (IntNo == Intrinsic::x86_seh_exceptioninfo) + return LowerEXCEPTIONINFO(Op, Subtarget, DAG); return SDValue(); + } SDLoc dl(Op); switch(IntrData->Type) { @@ -16464,6 +16537,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); + SDValue AhiBlo = Ahi; + SDValue AloBhi = Bhi; // Bit cast to 32-bit vectors for MULUDQ EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; @@ -16473,11 +16548,15 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, Bhi = DAG.getBitcast(MulVT, Bhi); SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); - SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); - SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); - - AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); - AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); + // After shifting right const values the result may be all-zero. + if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) { + AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); + } + if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) { + AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); + } SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); @@ -16992,36 +17071,111 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } } - if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { - // Turn 'a' into a mask suitable for VSELECT: a = a << 5; - Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, dl, VT)); - - SDValue VSelM = DAG.getConstant(0x80, dl, VT); - SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); - OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); - - // r = VSELECT(r, shl(r, 4), a); - SDValue M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(4, dl, VT)); - R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); - - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); - OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { + MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); + unsigned ShiftOpcode = Op->getOpcode(); - // r = VSELECT(r, shl(r, 2), a); - M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(2, dl, VT)); - R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. 
+ if (Subtarget->hasSSE41()) { + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } + // On pre-SSE41 targets we test for the sign bit by comparing to + // zero - a negative value will set all bits of the lanes to true + // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. + SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); + SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); + return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + }; - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); - OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + // Turn 'a' into a mask suitable for VSELECT: a = a << 5; + // We can safely do this using i16 shifts as we're only interested in + // the 3 lower bits of each byte. + Amt = DAG.getBitcast(ExtVT, Amt); + Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); + Amt = DAG.getBitcast(VT, Amt); + + if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { + // r = VSELECT(r, shift(r, 4), a); + SDValue M = + DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + return R; + } - // return VSELECT(r, r+r, a); - R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, - DAG.getNode(ISD::ADD, dl, VT, R, R), R); - return R; + if (Op->getOpcode() == ISD::SRA) { + // For SRA we need to unpack each byte to the higher byte of a i16 vector + // so we can correctly sign extend. We don't care what happens to the + // lower byte. 
+ SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); + ALo = DAG.getBitcast(ExtVT, ALo); + AHi = DAG.getBitcast(ExtVT, AHi); + RLo = DAG.getBitcast(ExtVT, RLo); + RHi = DAG.getBitcast(ExtVT, RHi); + + // r = VSELECT(r, shift(r, 4), a); + SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(4, dl, ExtVT)); + SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(4, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 2), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(2, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(2, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 1), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(1, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(1, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // Logical shift the result back to the lower byte, leaving a zero upper + // byte + // meaning that we can safely pack with PACKUSWB. + RLo = + DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); + RHi = + DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + } } // It's worth extending once and using the v8i32 shifts for 16-bit types, but @@ -17055,6 +17209,67 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); } + if (VT == MVT::v8i16) { + unsigned ShiftOpcode = Op->getOpcode(); + + auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. + if (Subtarget->hasSSE41()) { + MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); + V0 = DAG.getBitcast(ExtVT, V0); + V1 = DAG.getBitcast(ExtVT, V1); + Sel = DAG.getBitcast(ExtVT, Sel); + return DAG.getBitcast( + VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + } + // On pre-SSE41 targets we splat the sign bit - a negative value will + // set all bits of the lanes to true and VSELECT uses that in + // its OR(AND(V0,C),AND(V1,~C)) lowering. + SDValue C = + DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); + return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + }; + + // Turn 'a' into a mask suitable for VSELECT: a = a << 12; + if (Subtarget->hasSSE41()) { + // On SSE41 targets we need to replicate the shift mask in both + // bytes for PBLENDVB. 
+ Amt = DAG.getNode( + ISD::OR, dl, VT, + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); + } else { + Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); + } + + // r = VSELECT(r, shift(r, 8), a); + SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 4), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(Amt, M, R); + return R; + } + // Decompose 256-bit shifts into smaller 128-bit shifts. if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); @@ -18290,6 +18505,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; + case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -18404,6 +18620,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; + case X86ISD::AVG: return "X86ISD::AVG"; + case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; + case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; } return nullptr; } @@ -19464,7 +19683,8 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, assert(!Subtarget->isTargetMachO()); - X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL); + Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI, + DL); MI->eraseFromParent(); // The pseudo instruction is gone now. 
return BB; @@ -24019,7 +24239,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); - EVT InVT = N0->getValueType(0); + EVT InVT = N0.getValueType(); EVT InSVT = InVT.getScalarType(); SDLoc DL(N); @@ -24037,7 +24257,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, } if (!DCI.isBeforeLegalizeOps()) { - if (N0.getValueType() == MVT::i1) { + if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); @@ -24048,7 +24268,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (VT.isVector()) { auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) { - EVT InVT = N->getValueType(0); + EVT InVT = N.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), 128 / InVT.getScalarSizeInBits()); SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(), @@ -24470,18 +24690,19 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT InVT = Op0->getValueType(0); - // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) - if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) + // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8 || + InVT == MVT::v8i16 || InVT == MVT::v4i16) { SDLoc dl(N); - MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + MVT DstVT = MVT::getVectorVT(MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); } @@ -24490,7 +24711,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); - EVT VT = Ld->getValueType(0); + EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 if (N->getValueType(0) == MVT::f16) @@ -24498,9 +24719,9 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, if (!Ld->isVolatile() && !N->getValueType(0).isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !Subtarget->is64Bit() && VT == MVT::i64) { + !Subtarget->is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( - SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); + SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index b5d062f..9c98333 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -218,7 +218,8 @@ namespace llvm { // Integer add/sub with signed saturation. ADDS, SUBS, - + // Unsigned Integer average + AVG, /// Integer horizontal add. 
HADD, @@ -293,6 +294,9 @@ namespace llvm { // Vector FP round. VFPROUND, + // Vector signed integer to double. + CVTDQ2PD, + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -417,6 +421,10 @@ namespace llvm { COMPRESS, EXPAND, + //Convert Unsigned/Integer to Scalar Floating-Point Value + //with rounding mode + SINT_TO_FP_RND, + UINT_TO_FP_RND, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, @@ -508,7 +516,7 @@ namespace llvm { // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be // thought as target memory ops! }; - } + } // namespace X86ISD /// Define some predicates that are used for node matching. namespace X86 { @@ -575,7 +583,7 @@ namespace llvm { TO_ZERO = 3, CUR_DIRECTION = 4 }; - } + } // namespace X86 //===--------------------------------------------------------------------===// // X86 Implementation of the TargetLowering interface @@ -1112,6 +1120,6 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } -} +} // namespace llvm #endif // X86ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index c1d0aef..de6a835 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1058,118 +1058,87 @@ def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), (VPERMILPDZri VR512:$src1, imm:$imm)>; // -- VPERM2I - 3 source operands form -- -multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC, - PatFrag mem_frag, X86MemOperand x86memop, - SDNode OpNode, ValueType OpVT, RegisterClass KRC> { +multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _> { let Constraints = "$src1 = $dst" in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, - EVEX_4V; - - def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst {${mask}}|" - "$dst {${mask}}, $src2, $src3}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode RC:$src1, RC:$src2, - RC:$src3), - RC:$src1)))]>, - EVEX_4V, EVEX_K; - - let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> - def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst {${mask}} {z} |", - "$dst {${mask}} {z}, $src2, $src3}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode RC:$src1, RC:$src2, - RC:$src3), - (OpVT (bitconvert - (v16i32 immAllZerosV))))))]>, - EVEX_4V, EVEX_KZ; + defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, x86memop:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3))))]>, EVEX_4V; + let mayLoad = 1 in + defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", 
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + EVEX_4V, AVX5128IBase; + } +} +multiclass avx512_perm_3src_mb<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (OpNode _.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; +} - def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst {${mask}}|" - "$dst {${mask}}, $src2, $src3}"), - [(set RC:$dst, - (OpVT (vselect KRC:$mask, - (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3)), - RC:$src1)))]>, - EVEX_4V, EVEX_K; - - let AddedComplexity = 10 in // Prefer over the rrkz variant - def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst {${mask}} {z}|" - "$dst {${mask}} {z}, $src2, $src3}"), - [(set RC:$dst, - (OpVT (vselect KRC:$mask, - (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3)), - (OpVT (bitconvert - (v16i32 immAllZerosV))))))]>, - EVEX_4V, EVEX_KZ; +multiclass avx512_perm_3src_sizes<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, + avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>, + avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, + avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, loadv16i32, - i512mem, X86VPermiv3, v16i32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, loadv8i64, - i512mem, X86VPermiv3, v8i64, VK8WM>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, loadv16f32, - i512mem, X86VPermiv3, v16f32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, loadv8f64, - i512mem, X86VPermiv3, v8f64, VK8WM>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC, - PatFrag mem_frag, X86MemOperand x86memop, - SDNode OpNode, ValueType OpVT, RegisterClass KRC, - ValueType MaskVT, RegisterClass MRC> : - avx512_perm_3src<opc, "vpermt2"##Suffix, RC, mem_frag, x86memop, OpNode, - OpVT, KRC> { - def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512") - VR512:$idx, VR512:$src1, VR512:$src2, -1)), - (!cast<Instruction>(NAME#rr) VR512:$src1, VR512:$idx, VR512:$src2)>; - - def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512") - VR512:$idx, VR512:$src1, VR512:$src2, MRC:$mask)), - (!cast<Instruction>(NAME#rrk) VR512:$src1, - (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>; -} - -defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, loadv16i32, i512mem, - X86VPermv3, v16i32, VK16WM, v16i1, GR16>, - EVEX_V512, 
EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, loadv8i64, i512mem, - X86VPermv3, v8i64, VK8WM, v8i1, GR8>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, loadv16f32, i512mem, - X86VPermv3, v16f32, VK16WM, v16i1, GR16>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, loadv8f64, i512mem, - X86VPermv3, v8f64, VK8WM, v8i1, GR8>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_perm_3src_sizes_w<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, + avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>, + avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, + avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + } +} +defm VPERMI2D : avx512_perm_3src_sizes<0x76, "vpermi2d", X86VPermiv3, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_3src_sizes<0x76, "vpermi2q", X86VPermiv3, + avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2PS : avx512_perm_3src_sizes<0x77, "vpermi2ps", X86VPermiv3, + avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_3src_sizes<0x77, "vpermi2pd", X86VPermiv3, + avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPERMT2D : avx512_perm_3src_sizes<0x7E, "vpermt2d", X86VPermv3, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_3src_sizes<0x7E, "vpermt2q", X86VPermv3, + avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2PS : avx512_perm_3src_sizes<0x7F, "vpermt2ps", X86VPermv3, + avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_3src_sizes<0x7F, "vpermt2pd", X86VPermv3, + avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPERMT2W : avx512_perm_3src_sizes_w<0x7D, "vpermt2w", X86VPermv3, + avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2W : avx512_perm_3src_sizes_w<0x75, "vpermi2w", X86VPermiv3, + avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask @@ -2044,11 +2013,11 @@ defm : avx512_binop_pat<xor, KXORWrr>; def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)), (KXNORWrr VK16:$src1, VK16:$src2)>; def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), - (KXNORBrr VK8:$src1, VK8:$src2)>; + (KXNORBrr VK8:$src1, VK8:$src2)>, Requires<[HasDQI]>; def : Pat<(xor (xor VK32:$src1, VK32:$src2), (v32i1 immAllOnesV)), - (KXNORDrr VK32:$src1, VK32:$src2)>; + (KXNORDrr VK32:$src1, VK32:$src2)>, Requires<[HasBWI]>; def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)), - (KXNORQrr VK64:$src1, VK64:$src2)>; + (KXNORQrr VK64:$src1, VK64:$src2)>, Requires<[HasBWI]>; let Predicates = [NoDQI] in def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), @@ -3157,7 +3126,8 @@ defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; - +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, + SSE_INTALU_ITINS_P, HasBWI, 1>; multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, 
OpndItins itins, SDNode OpNode, bit IsCommutable = 0> { @@ -3278,30 +3248,6 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin, defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), - (VPMAXSDZrr VR512:$src1, VR512:$src2)>; -def : Pat <(v16i32 (int_x86_avx512_mask_pmaxu_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), - (VPMAXUDZrr VR512:$src1, VR512:$src2)>; -def : Pat <(v8i64 (int_x86_avx512_mask_pmaxs_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), - (VPMAXSQZrr VR512:$src1, VR512:$src2)>; -def : Pat <(v8i64 (int_x86_avx512_mask_pmaxu_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), - (VPMAXUQZrr VR512:$src1, VR512:$src2)>; -def : Pat <(v16i32 (int_x86_avx512_mask_pmins_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), - (VPMINSDZrr VR512:$src1, VR512:$src2)>; -def : Pat <(v16i32 (int_x86_avx512_mask_pminu_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), - (VPMINUDZrr VR512:$src1, VR512:$src2)>; -def : Pat <(v8i64 (int_x86_avx512_mask_pmins_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), - (VPMINSQZrr VR512:$src1, VR512:$src2)>; -def : Pat <(v8i64 (int_x86_avx512_mask_pminu_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), - (VPMINUQZrr VR512:$src1, VR512:$src2)>; //===----------------------------------------------------------------------===// // AVX-512 - Unpack Instructions //===----------------------------------------------------------------------===// @@ -4191,29 +4137,72 @@ defm VFNMSUBSDZ : avx512_fma3s_rm<0xAF, "vfnmsub213sd", X86Fnmsub, FR64X, // AVX-512 Scalar convert from sign integer to float/double //===----------------------------------------------------------------------===// -multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - X86MemOperand x86memop, string asm> { -let hasSideEffects = 0 in { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), +multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, X86MemOperand x86memop, + PatFrag ld_frag, string asm> { + let hasSideEffects = 0 in { + def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; - let mayLoad = 1 in - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), - (ins DstRC:$src1, x86memop:$src), + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, x86memop:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; -} // hasSideEffects = 0 + } // hasSideEffects = 0 + let isCodeGenOnly = 1 in { + def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 FROUND_CURRENT)))]>, EVEX_4V; + + def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, x86memop:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT 
DstVT.RC:$src1), + (ld_frag addr:$src2), + (i32 FROUND_CURRENT)))]>, EVEX_4V; + }//isCodeGenOnly = 1 +} + +multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, string asm> { + def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), + !strconcat(asm, + "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC; +} + +multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, X86MemOperand x86memop, + PatFrag ld_frag, string asm> { + defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>, + avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>, + VEX_LIG; } let Predicates = [HasAVX512] in { -defm VCVTSI2SSZ : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}">, - XS, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}">, - XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; -defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}">, - XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}">, - XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; +defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, + v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, + v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, + v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">, + XD, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, + v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -4233,14 +4222,18 @@ def : Pat<(f64 (sint_to_fp GR32:$src)), def : Pat<(f64 (sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; -defm VCVTUSI2SSZ : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}">, - XS, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}">, - XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; -defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}">, +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR32, + v4f32x_info, i32mem, loadi32, + "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64, + v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86SuintToFpRnd, GR32, v2f64x_info, + i32mem, loadi32, "cvtusi2sd{l}">, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}">, - XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64, + v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -4321,18 +4314,9 @@ let isCodeGenOnly = 1 in { int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W; - defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, 
VR128X, - int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}", - SSE_CVT_Scalar, 0>, XS, EVEX_4V; - defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, - int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}", - SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W; defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", SSE_CVT_Scalar, 0>, XD, EVEX_4V; - defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, - int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}", - SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W; } // isCodeGenOnly = 1 // Convert float/double to signed/unsigned int 32/64 with truncation diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index 2056056..eb4dc48 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -179,6 +179,6 @@ addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0); } -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index dfe58ef..16ae77d 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -72,6 +72,9 @@ def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; +def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -184,6 +187,7 @@ def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>; def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>; def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; +def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; @@ -350,6 +354,12 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; +def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; + +def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; +def X86SuintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b7a929..4aa0ae6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3456,11 +3456,11 @@ bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { return !isPredicated(MI); } -bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { +bool X86InstrInfo::AnalyzeBranchImpl( + MachineBasicBlock &MBB, 
MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const { + // Start from the bottom of the block and work up, examining the // terminator instructions. MachineBasicBlock::iterator I = MBB.end(); @@ -3558,6 +3558,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, FBB = TBB; TBB = I->getOperand(0).getMBB(); Cond.push_back(MachineOperand::CreateImm(BranchCode)); + CondBranches.push_back(I); continue; } @@ -3595,11 +3596,90 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // Update the MachineOperand. Cond[0].setImm(BranchCode); + CondBranches.push_back(I); } return false; } +bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + SmallVector<MachineInstr *, 4> CondBranches; + return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify); +} + +bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB, + MachineBranchPredicate &MBP, + bool AllowModify) const { + using namespace std::placeholders; + + SmallVector<MachineOperand, 4> Cond; + SmallVector<MachineInstr *, 4> CondBranches; + if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches, + AllowModify)) + return true; + + if (Cond.size() != 1) + return true; + + assert(MBP.TrueDest && "expected!"); + + if (!MBP.FalseDest) + MBP.FalseDest = MBB.getNextNode(); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + MachineInstr *ConditionDef = nullptr; + bool SingleUseCondition = true; + + for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) { + if (I->modifiesRegister(X86::EFLAGS, TRI)) { + ConditionDef = &*I; + break; + } + + if (I->readsRegister(X86::EFLAGS, TRI)) + SingleUseCondition = false; + } + + if (!ConditionDef) + return true; + + if (SingleUseCondition) { + for (auto *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + SingleUseCondition = false; + } + + MBP.ConditionDef = ConditionDef; + MBP.SingleUseCondition = SingleUseCondition; + + // Currently we only recognize the simple pattern: + // + // test %reg, %reg + // je %label + // + const unsigned TestOpcode = + Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr; + + if (ConditionDef->getOpcode() == TestOpcode && + ConditionDef->getNumOperands() == 3 && + ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) && + (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) { + MBP.LHS = ConditionDef->getOperand(0); + MBP.RHS = MachineOperand::CreateImm(0); + MBP.Predicate = Cond[0].getImm() == X86::COND_NE + ? MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; + return false; + } + + return true; +} + unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); unsigned Count = 0; @@ -3622,8 +3702,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const { // Shouldn't be a fall through. 
assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -3671,7 +3750,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, bool X86InstrInfo:: canInsertSelect(const MachineBasicBlock &MBB, - const SmallVectorImpl<MachineOperand> &Cond, + ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. @@ -3708,8 +3787,7 @@ canInsertSelect(const MachineBasicBlock &MBB, void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl<MachineOperand> &Cond, + unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); assert(Cond.size() == 1 && "Invalid Cond array"); @@ -3967,6 +4045,36 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, } } +bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + const MCInstrDesc &Desc = MemOp->getDesc(); + int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode()); + if (MemRefBegin < 0) + return false; + + MemRefBegin += X86II::getOperandBias(Desc); + + BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg(); + if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) + return false; + + if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != + X86::NoRegister) + return false; + + const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp); + + // Displacement can be symbolic + if (!DispMO.isImm()) + return false; + + Offset = DispMO.getImm(); + + return (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() == + X86::NoRegister); +} + static unsigned getStoreRegOpcode(unsigned SrcReg, const TargetRegisterClass *RC, bool isStackAligned, @@ -6219,13 +6327,217 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { } bool X86InstrInfo:: -hasHighOperandLatency(const InstrItineraryData *ItinData, +hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const { return isHighLatencyDef(DefMI->getOpcode()); } +/// If the input instruction is part of a chain of dependent ops that are +/// suitable for reassociation, return the earlier instruction in the sequence +/// that defines its first operand, otherwise return a nullptr. +/// If the instruction's operands must be commuted to be considered a +/// reassociation candidate, Commuted will be set to true. +static MachineInstr *isReassocCandidate(const MachineInstr &Inst, + unsigned AssocOpcode, + bool checkPrevOneUse, + bool &Commuted) { + if (Inst.getOpcode() != AssocOpcode) + return nullptr; + + MachineOperand Op1 = Inst.getOperand(1); + MachineOperand Op2 = Inst.getOperand(2); + + const MachineBasicBlock *MBB = Inst.getParent(); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + // We need virtual register definitions. + MachineInstr *MI1 = nullptr; + MachineInstr *MI2 = nullptr; + if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) + MI1 = MRI.getUniqueVRegDef(Op1.getReg()); + if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) + MI2 = MRI.getUniqueVRegDef(Op2.getReg()); + + // And they need to be in the trace (otherwise, they won't have a depth). 
+ if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB) + return nullptr; + + Commuted = false; + if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) { + std::swap(MI1, MI2); + Commuted = true; + } + + // Avoid reassociating operands when it won't provide any benefit. If both + // operands are produced by instructions of this type, we may already + // have the optimal sequence. + if (MI2->getOpcode() == AssocOpcode) + return nullptr; + + // The instruction must only be used by the other instruction that we + // reassociate with. + if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) + return nullptr; + + // We must match a simple chain of dependent ops. + // TODO: This check is not necessary for the earliest instruction in the + // sequence. Instead of a sequence of 3 dependent instructions with the same + // opcode, we only need to find a sequence of 2 dependent instructions with + // the same opcode plus 1 other instruction that adds to the height of the + // trace. + if (MI1->getOpcode() != AssocOpcode) + return nullptr; + + return MI1; +} + +/// Select a pattern based on how the operands of each associative operation +/// need to be commuted. +static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev, + bool CommuteRoot) { + if (CommutePrev) { + if (CommuteRoot) + return MachineCombinerPattern::MC_REASSOC_XA_YB; + return MachineCombinerPattern::MC_REASSOC_XA_BY; + } else { + if (CommuteRoot) + return MachineCombinerPattern::MC_REASSOC_AX_YB; + return MachineCombinerPattern::MC_REASSOC_AX_BY; + } +} + +bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { + if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) + return false; + + // TODO: There are many more associative instruction types to match: + // 1. Other forms of scalar FP add (non-AVX) + // 2. Other data types (double, integer, vectors) + // 3. Other math / logic operations (mul, and, or) + unsigned AssocOpcode = X86::VADDSSrr; + + // TODO: There is nothing x86-specific here except the instruction type. + // This logic could be hoisted into the machine combiner pass itself. + bool CommuteRoot; + if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true, + CommuteRoot)) { + bool CommutePrev; + if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) { + // We found a sequence of instructions that may be suitable for a + // reassociation of operands to increase ILP. + Patterns.push_back(getPattern(CommutePrev, CommuteRoot)); + return true; + } + } + + return false; +} + +/// Attempt the following reassociation to reduce critical path length: +/// B = A op X (Prev) +/// C = B op Y (Root) +/// ===> +/// B = X op Y +/// C = A op B +static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, + MachineCombinerPattern::MC_PATTERN Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { + MachineFunction *MF = Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + + // This array encodes the operand index for each parameter because the + // operands may be commuted. 
Each row corresponds to a pattern value, + // and each column specifies the index of A, B, X, Y. + unsigned OpIdx[4][4] = { + { 1, 1, 2, 2 }, + { 1, 2, 2, 1 }, + { 2, 1, 1, 2 }, + { 2, 2, 1, 1 } + }; + + MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]); + MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]); + MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); + MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); + MachineOperand &OpC = Root.getOperand(0); + + unsigned RegA = OpA.getReg(); + unsigned RegB = OpB.getReg(); + unsigned RegX = OpX.getReg(); + unsigned RegY = OpY.getReg(); + unsigned RegC = OpC.getReg(); + + if (TargetRegisterInfo::isVirtualRegister(RegA)) + MRI.constrainRegClass(RegA, RC); + if (TargetRegisterInfo::isVirtualRegister(RegB)) + MRI.constrainRegClass(RegB, RC); + if (TargetRegisterInfo::isVirtualRegister(RegX)) + MRI.constrainRegClass(RegX, RC); + if (TargetRegisterInfo::isVirtualRegister(RegY)) + MRI.constrainRegClass(RegY, RC); + if (TargetRegisterInfo::isVirtualRegister(RegC)) + MRI.constrainRegClass(RegC, RC); + + // Create a new virtual register for the result of (X op Y) instead of + // recycling RegB because the MachineCombiner's computation of the critical + // path requires a new register definition rather than an existing one. + unsigned NewVR = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + + unsigned Opcode = Root.getOpcode(); + bool KillA = OpA.isKill(); + bool KillX = OpX.isKill(); + bool KillY = OpY.isKill(); + + // Create new instructions for insertion. + MachineInstrBuilder MIB1 = + BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) + .addReg(RegX, getKillRegState(KillX)) + .addReg(RegY, getKillRegState(KillY)); + InsInstrs.push_back(MIB1); + + MachineInstrBuilder MIB2 = + BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) + .addReg(RegA, getKillRegState(KillA)) + .addReg(NewVR, getKillRegState(true)); + InsInstrs.push_back(MIB2); + + // Record old instructions for deletion. + DelInstrs.push_back(&Prev); + DelInstrs.push_back(&Root); +} + +void X86InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, + MachineCombinerPattern::MC_PATTERN Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { + MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); + + // Select the previous instruction in the sequence based on the input pattern. + MachineInstr *Prev = nullptr; + if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY || + Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY) + Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB || + Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB) + Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); + else + llvm_unreachable("Unknown pattern for machine combiner"); + + reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); + return; +} + namespace { /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. 
@@ -6292,7 +6604,7 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace char CGBR::ID = 0; FunctionPass* @@ -6404,7 +6716,7 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); } }; -} +} // namespace char LDTLSCleanup::ID = 0; FunctionPass* diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index ac1b2d4..4912951 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -26,6 +26,19 @@ namespace llvm { class X86RegisterInfo; class X86Subtarget; + namespace MachineCombinerPattern { + enum MC_PATTERN : int { + // These are commutative variants for reassociating a computation chain + // of the form: + // B = A op X (Prev) + // C = B op Y (Root) + MC_REASSOC_AX_BY = 0, + MC_REASSOC_AX_YB = 1, + MC_REASSOC_XA_BY = 2, + MC_REASSOC_XA_YB = 3, + }; + } // end namespace MachineCombinerPattern + namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. @@ -77,7 +90,7 @@ namespace X86 { /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(CondCode CC); -} // end namespace X86; +} // namespace X86 /// isGlobalStubReference - Return true if the specified TargetFlag operand is @@ -166,6 +179,12 @@ class X86InstrInfo final : public X86GenInstrInfo { virtual void anchor(); + bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + SmallVectorImpl<MachineInstr *> &CondBranches, + bool AllowModify) const; + public: explicit X86InstrInfo(X86Subtarget &STI); @@ -254,18 +273,23 @@ public: MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + + bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + bool AnalyzeBranchPredicate(MachineBasicBlock &MBB, + TargetInstrInfo::MachineBranchPredicate &MBP, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; - bool canInsertSelect(const MachineBasicBlock&, - const SmallVectorImpl<MachineOperand> &Cond, + bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond, unsigned, unsigned, int&, int&, int&) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl<MachineOperand> &Cond, + unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, @@ -423,12 +447,32 @@ public: bool isHighLatencyDef(int opc) const override; - bool hasHighOperandLatency(const InstrItineraryData *ItinData, + bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const override; + + bool useMachineCombiner() const override { + return true; + } + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. 
All potential patterns are + /// output in the <Pattern> array. + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override; + + /// When getMachineCombinerPatterns() finds a pattern, this function generates + /// the instructions that could replace the original code sequence. + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; + /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. Return true if the comparison instruction @@ -468,6 +512,6 @@ private: int &FrameIndex) const; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 8294e38..9562918 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -2234,14 +2234,27 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>; -// AVX 256-bit register conversion intrinsics +// AVX register conversion intrinsics let Predicates = [HasAVX] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (VCVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PDrm addr:$src)>; + def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PDYrr VR128:$src)>; def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PDYrm addr:$src)>; } // Predicates = [HasAVX] +// SSE2 register conversion intrinsics +let Predicates = [HasSSE2] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (CVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (CVTDQ2PDrm addr:$src)>; +} // Predicates = [HasSSE2] + // Convert packed double to packed single // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. 
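The X86cvtdq2pd patterns above let the new X86ISD::CVTDQ2PD node (v4i32 -> v2f64) select directly onto the existing (V)CVTDQ2PDrr/rm instructions under both the HasAVX and HasSSE2 predicates. At the source level the node models the SSE2 conversion of the two low signed 32-bit elements to two doubles; a minimal sketch in plain intrinsics (illustrative only, not part of the patch; the function name is made up):

#include <emmintrin.h>   // SSE2

// Widen the two low signed 32-bit elements of a 128-bit vector to two
// doubles. This is the operation X86ISD::CVTDQ2PD (v4i32 -> v2f64) models;
// it compiles to cvtdq2pd, or vcvtdq2pd when AVX is enabled.
__m128d widen_low_two(__m128i v) {
  return _mm_cvtepi32_pd(v);   // elements 2 and 3 of v are ignored
}
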
diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 0268066..2b82930 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -242,6 +242,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), @@ -469,6 +476,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pandn_q_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), X86_INTRINSIC_DATA(avx512_mask_pandn_q_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), X86_INTRINSIC_DATA(avx512_mask_pandn_q_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), @@ -493,6 +506,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index ff1436a..64135e0 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ 
b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -17,6 +17,7 @@ #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "Utils/X86ShuffleDecode.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -50,6 +51,8 @@ class X86MCInstLower { public: X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter); + Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI, + const MachineOperand &MO) const; void Lower(const MachineInstr *MI, MCInst &OutMI) const; MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; @@ -109,7 +112,7 @@ namespace llvm { OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); SMShadowTracker.count(Inst, getSubtargetInfo()); } -} // end llvm namespace +} // namespace llvm X86MCInstLower::X86MCInstLower(const MachineFunction &mf, X86AsmPrinter &asmprinter) @@ -402,47 +405,43 @@ static unsigned getRetOpcode(const X86Subtarget &Subtarget) { return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; } +Optional<MCOperand> +X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, + const MachineOperand &MO) const { + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) + return None; + return MCOperand::createReg(MO.getReg()); + case MachineOperand::MO_Immediate: + return MCOperand::createImm(MO.getImm()); + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + return LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); + case MachineOperand::MO_JumpTableIndex: + return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex())); + case MachineOperand::MO_ConstantPoolIndex: + return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex())); + case MachineOperand::MO_BlockAddress: + return LowerSymbolOperand( + MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); + case MachineOperand::MO_RegisterMask: + // Ignore call clobbers. + return None; + } +} + void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - - MCOperand MCOp; - switch (MO.getType()) { - default: - MI->dump(); - llvm_unreachable("unknown operand type"); - case MachineOperand::MO_Register: - // Ignore all implicit register operands. - if (MO.isImplicit()) continue; - MCOp = MCOperand::createReg(MO.getReg()); - break; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); - break; - case MachineOperand::MO_JumpTableIndex: - MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex())); - break; - case MachineOperand::MO_ConstantPoolIndex: - MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex())); - break; - case MachineOperand::MO_BlockAddress: - MCOp = LowerSymbolOperand(MO, - AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); - break; - case MachineOperand::MO_RegisterMask: - // Ignore call clobbers. 
- continue; - } - - OutMI.addOperand(MCOp); - } + for (const MachineOperand &MO : MI->operands()) + if (auto MaybeMCOp = LowerMachineOperand(MI, MO)) + OutMI.addOperand(MaybeMCOp.getValue()); // Handle a few special cases to eliminate operand modifiers. ReSimplify: @@ -865,6 +864,28 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, SM.recordStatepoint(MI); } +void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, + X86MCInstLower &MCIL) { + // FAULTING_LOAD_OP <def>, <handler label>, <load opcode>, <load operands> + + unsigned LoadDefRegister = MI.getOperand(0).getReg(); + MCSymbol *HandlerLabel = MI.getOperand(1).getMCSymbol(); + unsigned LoadOpcode = MI.getOperand(2).getImm(); + unsigned LoadOperandsBeginIdx = 3; + + FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel); + + MCInst LoadMI; + LoadMI.setOpcode(LoadOpcode); + LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, + E = MI.operands_end(); + I != E; ++I) + if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I)) + LoadMI.addOperand(MaybeOperand.getValue()); + + OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo()); +} // Lower a stackmap of the form: // <id>, <shadowBytes>, ... @@ -1120,6 +1141,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case TargetOpcode::STATEPOINT: return LowerSTATEPOINT(*MI, MCInstLowering); + case TargetOpcode::FAULTING_LOAD_OP: + return LowerFAULTING_LOAD_OP(*MI, MCInstLowering); + case TargetOpcode::STACKMAP: return LowerSTACKMAP(*MI); diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index d598b55..342d26a 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -179,6 +179,6 @@ public: } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index 143e70b..33aa78f 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -84,7 +84,7 @@ namespace { }; char PadShortFunc::ID = 0; -} +} // namespace FunctionPass *llvm::createX86PadShortFunctions() { return new PadShortFunc(); diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index e9b6bfc..00e2134 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -419,6 +419,22 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { + // Check if the EFLAGS register is marked as live-out. This shouldn't happen, + // because the calling convention defines the EFLAGS register as NOT + // preserved. + // + // Unfortunatelly the EFLAGS show up as live-out after branch folding. Adding + // an assert to track this and clear the register afterwards to avoid + // unnecessary crashes during release builds. + assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) && + "EFLAGS are not live-out from a patchpoint."); + + // Also clean other registers that don't need preserving (IP). 
+ for (auto Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP}) + Mask[Reg / 32] &= ~(1U << (Reg % 32)); +} + //===----------------------------------------------------------------------===// // Stack Frame Processing methods //===----------------------------------------------------------------------===// @@ -765,4 +781,4 @@ unsigned get512BitSuperRegister(unsigned Reg) { llvm_unreachable("Unexpected SIMD register"); } -} +} // namespace llvm diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index a714c2a..459ecf7 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -104,6 +104,8 @@ public: /// register scavenger to determine what registers are free. BitVector getReservedRegs(const MachineFunction &MF) const override; + void adjustStackMapLiveOutMask(uint32_t *Mask) const override; + bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; @@ -134,6 +136,6 @@ unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false) //get512BitRegister - X86 utility - returns 512-bit super register unsigned get512BitSuperRegister(unsigned Reg); -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h index eb7e0ed..25606d3 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -48,6 +48,6 @@ public: MachinePointerInfo SrcPtrInfo) const override; }; -} +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index 74af29f..3b25d30 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -287,7 +287,7 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } -X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, +X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const X86TargetMachine &TM, unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), @@ -300,8 +300,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, TargetTriple.getEnvironment() == Triple::CODE16), TSInfo(*TM.getDataLayout()), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), - is64Bit() ? -8 : -4) { + FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. if (TM.getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index a476f7a..6934061 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -253,9 +253,8 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. 
/// - X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, const X86TargetMachine &TM, - unsigned StackAlignOverride); + X86Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const X86TargetMachine &TM, unsigned StackAlignOverride); const X86TargetLowering *getTargetLowering() const override { return &TLInfo; @@ -491,6 +490,6 @@ public: } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index 646cff7..3d6eb4f7 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -24,6 +24,10 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); @@ -90,13 +94,14 @@ static std::string computeDataLayout(const Triple &TT) { /// X86TargetMachine ctor - Create an X86 target. /// -X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, +X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, computeDataLayout(Triple(TT)), TT, CPU, FS, Options, - RM, CM, OL), - TLOF(createTLOF(Triple(getTargetTriple()))), + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OL), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { // Windows stack unwinder gets confused when execution flow "falls through" // after a call to 'noreturn' function. @@ -213,7 +218,7 @@ bool X86PassConfig::addInstSelector() { addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); // For ELF, cleanup any local-dynamic TLS accesses. - if (Triple(TM->getTargetTriple()).isOSBinFormatELF() && + if (TM->getTargetTriple().isOSBinFormatELF() && getOptLevel() != CodeGenOpt::None) addPass(createCleanupLocalDynamicTLSPass()); @@ -224,12 +229,14 @@ bool X86PassConfig::addInstSelector() { bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); return true; } bool X86PassConfig::addPreISel() { // Only add this pass for 32-bit x86 Windows. 
- Triple TT(TM->getTargetTriple()); + const Triple &TT = TM->getTargetTriple(); if (TT.isOSWindows() && TT.getArch() == Triple::x86) addPass(createX86WinEHStatePass()); return true; diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h index c9833ed..be56888 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -29,8 +29,8 @@ class X86TargetMachine final : public LLVMTargetMachine { mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; public: - X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, + X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; @@ -44,6 +44,6 @@ public: } }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index bbfeba8..13384fa 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -153,13 +153,13 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SHL, MVT::v4i64, 1 }, { ISD::SRL, MVT::v4i64, 1 }, - { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence. + { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized. + { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized. + { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. @@ -253,19 +253,19 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // to ISel. The cost model must return worst case assumptions because it is // used for vectorization and we don't want to make vectorized code worse // than scalar code. - { ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence. - { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized. - { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. - { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized. - { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized. + { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized. { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. - { ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized. - { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized. + { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized. { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. 
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 6925b27..71ce45b 100644
--- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -86,7 +86,7 @@ namespace {
 };
 char VZeroUpperInserter::ID = 0;
-}
+} // namespace

 FunctionPass *llvm::createX86IssueVZeroUpperPass() {
   return new VZeroUpperInserter();
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
index ce69ea7..c9e8094 100644
--- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -60,9 +60,10 @@ public:
 private:
   void emitExceptionRegistrationRecord(Function *F);
-  void linkExceptionRegistration(IRBuilder<> &Builder, Value *Handler);
+  void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler);
   void unlinkExceptionRegistration(IRBuilder<> &Builder);
   void addCXXStateStores(Function &F, MachineModuleInfo &MMI);
+  void addSEHStateStores(Function &F, MachineModuleInfo &MMI);
   void addCXXStateStoresToFunclet(Value *ParentRegNode, WinEHFuncInfo &FuncInfo,
                                   Function &F, int BaseState);
   void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State);
@@ -104,7 +105,7 @@ private:
   /// The linked list node subobject inside of RegNode.
   Value *Link = nullptr;
 };
-}
+} // namespace

 FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
@@ -145,16 +146,10 @@ bool WinEHStatePass::runOnFunction(Function &F) {
     return false;

   // Check the personality. Do nothing if this is not an MSVC personality.
-  LandingPadInst *LP = nullptr;
-  for (BasicBlock &BB : F) {
-    LP = BB.getLandingPadInst();
-    if (LP)
-      break;
-  }
-  if (!LP)
+  if (!F.hasPersonalityFn())
     return false;
   PersonalityFn =
-      dyn_cast<Function>(LP->getPersonalityFn()->stripPointerCasts());
+      dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
   if (!PersonalityFn)
     return false;
   Personality = classifyEHPersonality(PersonalityFn);
@@ -171,8 +166,10 @@ bool WinEHStatePass::runOnFunction(Function &F) {
   auto *MMIPtr = getAnalysisIfAvailable<MachineModuleInfo>();
   assert(MMIPtr && "MachineModuleInfo should always be available");
   MachineModuleInfo &MMI = *MMIPtr;
-  if (Personality == EHPersonality::MSVC_CXX) {
-    addCXXStateStores(F, MMI);
+  switch (Personality) {
+  default: llvm_unreachable("unexpected personality function");
+  case EHPersonality::MSVC_CXX: addCXXStateStores(F, MMI); break;
+  case EHPersonality::MSVC_X86SEH: addSEHStateStores(F, MMI); break;
   }

   // Reset per-function state.
@@ -258,7 +255,6 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
   if (Personality == EHPersonality::MSVC_CXX) {
     RegNodeTy = getCXXEHRegistrationType();
     RegNode = Builder.CreateAlloca(RegNodeTy);
-    // FIXME: We can skip this in -GS- mode, when we figure that out.
     // SavedESP = llvm.stacksave()
     Value *SP = Builder.CreateCall(
         Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
@@ -360,11 +356,14 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
 }

 void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
-                                               Value *Handler) {
+                                               Function *Handler) {
+  // Emit the .safeseh directive for this function.
+  Handler->addFnAttr("safeseh");
+
   Type *LinkTy = getEHLinkRegistrationType();
   // Handler = Handler
-  Handler = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy());
-  Builder.CreateStore(Handler, Builder.CreateStructGEP(LinkTy, Link, 1));
+  Value *HandlerI8 = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy());
+  Builder.CreateStore(HandlerI8, Builder.CreateStructGEP(LinkTy, Link, 1));
   // Next = [fs:00]
   Constant *FSZero =
       Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
@@ -472,6 +471,74 @@ void WinEHStatePass::addCXXStateStoresToFunclet(Value *ParentRegNode,
   }
 }

+/// Assign every distinct landingpad a unique state number for SEH. Unlike C++
+/// EH, we can use this very simple algorithm while C++ EH cannot because catch
+/// handlers aren't outlined and the runtime doesn't have to figure out which
+/// catch handler frame to unwind to.
+/// FIXME: __finally blocks are outlined, so this approach may break down there.
+void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) {
+  WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F);
+
+  // Remember and return the index that we used. We save it in WinEHFuncInfo so
+  // that we can lower llvm.x86.seh.exceptioninfo later in filter functions
+  // without too much trouble.
+  int RegNodeEscapeIndex = escapeRegNode(F);
+  FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex;
+
+  // Iterate all the instructions and emit state number stores.
+  int CurState = 0;
+  SmallPtrSet<BasicBlock *, 4> ExceptBlocks;
+  for (BasicBlock &BB : F) {
+    for (auto I = BB.begin(), E = BB.end(); I != E; ++I) {
+      if (auto *CI = dyn_cast<CallInst>(I)) {
+        auto *Intrin = dyn_cast<IntrinsicInst>(CI);
+        if (Intrin) {
+          // Calls that "don't throw" are considered to be able to throw asynch
+          // exceptions, but intrinsics cannot.
+          continue;
+        }
+        insertStateNumberStore(RegNode, CI, -1);
+      } else if (auto *II = dyn_cast<InvokeInst>(I)) {
+        // Look up the state number of the landingpad this unwinds to.
+        LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst();
+        auto InsertionPair =
+            FuncInfo.LandingPadStateMap.insert(std::make_pair(LPI, CurState));
+        auto Iter = InsertionPair.first;
+        int &State = Iter->second;
+        bool Inserted = InsertionPair.second;
+        if (Inserted) {
+          // Each action consumes a state number.
+          auto *EHActions = cast<IntrinsicInst>(LPI->getNextNode());
+          SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList;
+          parseEHActions(EHActions, ActionList);
+          assert(!ActionList.empty());
+          CurState += ActionList.size();
+          State += ActionList.size() - 1;
+
+          // Remember all the __except block targets.
+          for (auto &Handler : ActionList) {
+            if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) {
+              auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc());
+              ExceptBlocks.insert(BA->getBasicBlock());
+            }
+          }
+        }
+        insertStateNumberStore(RegNode, II, State);
+      }
+    }
+  }
+
+  // Insert llvm.stackrestore into each __except block.
+  Function *StackRestore =
+      Intrinsic::getDeclaration(TheModule, Intrinsic::stackrestore);
+  for (BasicBlock *ExceptBB : ExceptBlocks) {
+    IRBuilder<> Builder(ExceptBB->begin());
+    Value *SP =
+        Builder.CreateLoad(Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+    Builder.CreateCall(StackRestore, {SP});
+  }
+}
+
 void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode,
                                             Instruction *IP, int State) {
   IRBuilder<> Builder(IP);
diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 2e44ac9..e1baeac 100644
--- a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -40,7 +40,7 @@ public:
                               raw_ostream &VStream,
                               raw_ostream &CStream) const override;
 };
-}
+} // namespace

 static bool readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
                               uint64_t &Size, uint16_t &Insn) {
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index f0e4596..8699ce8 100644
--- a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -46,8 +46,8 @@ static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) {
   return X;
 }

-static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
-                                                   StringRef FS) {
+static MCSubtargetInfo *
+createXCoreMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
   MCSubtargetInfo *X = new MCSubtargetInfo();
   InitXCoreMCSubtargetInfo(X, TT, CPU, FS);
   return X;
 }
@@ -123,7 +123,7 @@ void XCoreTargetAsmStreamer::emitCCBottomData(StringRef Name) {
 void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
   OS << "\t.cc_bottom " << Name << ".function\n";
 }
-}
+} // namespace

 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
diff --git a/contrib/llvm/lib/Target/XCore/XCore.h b/contrib/llvm/lib/Target/XCore/XCore.h
index ba6ca84..eb8b5ec 100644
--- a/contrib/llvm/lib/Target/XCore/XCore.h
+++ b/contrib/llvm/lib/Target/XCore/XCore.h
@@ -32,6 +32,6 @@ namespace llvm {
                                    CodeGenOpt::Level OptLevel);
   ModulePass *createXCoreLowerThreadLocalPass();

-} // end namespace llvm;
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
index 607c772..116e89a 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
@@ -58,6 +58,6 @@ namespace llvm {
       return 4;
     }
   };
-}
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
index 77292c4..8d96105 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -34,7 +34,7 @@ namespace {
     }
   };
   char XCoreFTAOElim::ID = 0;
-}
+} // namespace

 /// createXCoreFrameToArgsOffsetEliminationPass - returns an instance of the
 /// Frame to args offset elimination pass
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
index 97f0494..9c49a8d 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -85,7 +85,7 @@ namespace llvm {
       // Memory barrier.
      MEMBARRIER
    };
-  }
+  } // namespace XCoreISD

  //===--------------------------------------------------------------------===//
  // TargetLowering Implementation
@@ -215,6 +215,6 @@ namespace llvm {
                         const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
                         LLVMContext &Context) const override;
  };
-}
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index c310aa3..a6e974e 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -41,7 +41,7 @@ namespace XCore {
     COND_INVALID
   };
 }
-}
+} // namespace llvm

 // Pin the vtable to this file.
 void XCoreInstrInfo::anchor() {}
@@ -281,7 +281,7 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
 unsigned XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
                                       MachineBasicBlock *FBB,
-                                      const SmallVectorImpl<MachineOperand> &Cond,
+                                      ArrayRef<MachineOperand> Cond,
                                       DebugLoc DL)const{
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
index 60bb3f8..70beb41 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -56,8 +56,7 @@ public:
                      bool AllowModify) const override;

   unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                        MachineBasicBlock *FBB,
-                        const SmallVectorImpl<MachineOperand> &Cond,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         DebugLoc DL) const override;

   unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
@@ -89,6 +88,6 @@ public:
                            unsigned Reg, uint64_t Value) const;
 };
-}
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 996c6f5..f866ab0 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -50,7 +50,7 @@ namespace {
     bool runOnModule(Module &M) override;
   };
-}
+} // namespace

 char XCoreLowerThreadLocal::ID = 0;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
index 5691478..74a7f20 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
@@ -37,6 +37,6 @@ private:
   MCOperand LowerSymbolOperand(const MachineOperand &MO,
                                MachineOperandType MOTy, unsigned Offset) const;
 };
-}
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
index 078ffde..8cce75f 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -101,6 +101,6 @@ public:
     return SpillLabels;
   }
 };
-} // End llvm namespace
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
index cfd80b3..6224843 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -35,6 +35,6 @@ public:
                                   MachinePointerInfo SrcPtrInfo) const override;
 };
-}
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
index 7996020..c98518b 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -25,7 +25,7 @@ using namespace llvm;

 void XCoreSubtarget::anchor() { }

-XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &CPU,
+XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS, const TargetMachine &TM)
     : XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
       TLInfo(TM, *this), TSInfo(*TM.getDataLayout()) {}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
index da51ef1..74ee594 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
@@ -40,9 +40,9 @@ public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
-  XCoreSubtarget(const std::string &TT, const std::string &CPU,
+  XCoreSubtarget(const Triple &TT, const std::string &CPU,
                  const std::string &FS, const TargetMachine &TM);
-
+
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options. Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -61,6 +61,6 @@ public:
     return &InstrInfo.getRegisterInfo();
   }
 };
-} // End llvm namespace
+} // namespace llvm

 #endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index 228dc1c..370b64b 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -22,7 +22,7 @@ using namespace llvm;

 /// XCoreTargetMachine ctor - Create an ILP32 architecture model
 ///
-XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
                                        const TargetOptions &Options,
                                        Reloc::Model RM, CodeModel::Model CM,
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
index 0d324ab..a8addfc 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
@@ -23,8 +23,8 @@ class XCoreTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   XCoreSubtarget Subtarget;
 public:
-  XCoreTargetMachine(const Target &T, StringRef TT,
-                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+  XCoreTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL);
   ~XCoreTargetMachine() override;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
index 3563dbc..a82702f 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
@@ -22,6 +22,6 @@ public:
   virtual void emitCCBottomData(StringRef Name) = 0;
   virtual void emitCCBottomFunction(StringRef Name) = 0;
 };
-}
+} // namespace llvm

 #endif
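A change that recurs across these target machine and subtarget constructors is that the triple is now passed as an already-parsed const Triple & rather than as a StringRef or std::string. The sketch below is illustrative only and uses a simplified stand-in class, not llvm::Triple: the point is that the string is parsed once at construction, so later queries such as isOSWindows() are cheap field reads instead of repeated re-parsing at every call site.

    #include <string>

    namespace sketch {
    enum class OSType { Linux, Win32, Unknown };

    class Triple {
      std::string Str;
      OSType OS = OSType::Unknown;

    public:
      explicit Triple(std::string S) : Str(std::move(S)) {
        // Parse once at construction; a real parser would split
        // arch-vendor-os-environment instead of substring matching.
        if (Str.find("windows") != std::string::npos)
          OS = OSType::Win32;
        else if (Str.find("linux") != std::string::npos)
          OS = OSType::Linux;
      }
      bool isOSWindows() const { return OS == OSType::Win32; }
    };

    // Callees take const Triple&, as the patched constructors above do, so no
    // temporary Triple has to be rebuilt from the string per query.
    bool needsWinEHStatePass(const Triple &TT) { return TT.isOSWindows(); }
    } // namespace sketch

This mirrors the pattern visible in the hunks above, where calls like Triple(TM->getTargetTriple()).isOSBinFormatELF() were replaced by direct queries on the cached triple.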