Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86FastISel.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86FastISel.cpp | 331
1 file changed, 240 insertions, 91 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index dfe3c80..c890fdd 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -170,6 +170,12 @@ private: const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB, X86AddressMode &AM); + + unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill, unsigned Op3, + bool Op3IsKill); }; } // end anonymous namespace. @@ -182,18 +188,18 @@ getX86ConditionCode(CmpInst::Predicate Predicate) { default: break; // Floating-point Predicates case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; - case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OGT: CC = X86::COND_A; break; - case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; - case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_ULT: CC = X86::COND_B; break; - case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; case CmpInst::FCMP_UNO: CC = X86::COND_P; break; case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; - case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH; case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; // Integer Predicates @@ -229,15 +235,15 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) { switch (Predicate) { default: llvm_unreachable("Unexpected predicate"); case CmpInst::FCMP_OEQ: CC = 0; break; - case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OLT: CC = 1; break; - case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OLE: CC = 2; break; case CmpInst::FCMP_UNO: CC = 3; break; case CmpInst::FCMP_UNE: CC = 4; break; - case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_UGE: CC = 5; break; - case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_UGT: CC = 6; break; case CmpInst::FCMP_ORD: CC = 7; break; case CmpInst::FCMP_UEQ: @@ -351,6 +357,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool HasSSE41 = Subtarget->hasSSE41(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); + bool HasAVX512 = Subtarget->hasAVX512(); + bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); // Get opcode and regclass of the output for the given load instruction. @@ -378,7 +386,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, break; case MVT::f32: if (X86ScalarSSEf32) { - Opc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; + Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? 
X86::VMOVSSrm : X86::MOVSSrm; RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; @@ -387,7 +395,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, break; case MVT::f64: if (X86ScalarSSEf64) { - Opc = HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; + Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; @@ -399,20 +407,26 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, return false; case MVT::v4f32: if (IsNonTemporal && Alignment >= 16 && HasSSE41) - Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + Opc = HasVLX ? X86::VMOVNTDQAZ128rm : + HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) - Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm; + Opc = HasVLX ? X86::VMOVAPSZ128rm : + HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm; else - Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; + Opc = HasVLX ? X86::VMOVUPSZ128rm : + HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; RC = &X86::VR128RegClass; break; case MVT::v2f64: if (IsNonTemporal && Alignment >= 16 && HasSSE41) - Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + Opc = HasVLX ? X86::VMOVNTDQAZ128rm : + HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) - Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm; + Opc = HasVLX ? X86::VMOVAPDZ128rm : + HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm; else - Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; + Opc = HasVLX ? X86::VMOVUPDZ128rm : + HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; RC = &X86::VR128RegClass; break; case MVT::v4i32: @@ -420,27 +434,34 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, case MVT::v8i16: case MVT::v16i8: if (IsNonTemporal && Alignment >= 16) - Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + Opc = HasVLX ? X86::VMOVNTDQAZ128rm : + HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) - Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm; + Opc = HasVLX ? X86::VMOVDQA64Z128rm : + HasAVX ? X86::VMOVDQArm : X86::MOVDQArm; else - Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; + Opc = HasVLX ? X86::VMOVDQU64Z128rm : + HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; RC = &X86::VR128RegClass; break; case MVT::v8f32: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) - Opc = X86::VMOVNTDQAYrm; + Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; + else if (Alignment >= 32) + Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm; else - Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm; + Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm; RC = &X86::VR256RegClass; break; case MVT::v4f64: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = X86::VMOVNTDQAYrm; + else if (Alignment >= 32) + Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm; else - Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm; + Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm; RC = &X86::VR256RegClass; break; case MVT::v8i32: @@ -450,12 +471,14 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = X86::VMOVNTDQAYrm; + else if (Alignment >= 32) + Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm; else - Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm; + Opc = HasVLX ? 
X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm; RC = &X86::VR256RegClass; break; case MVT::v16f32: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (IsNonTemporal && Alignment >= 64) Opc = X86::VMOVNTDQAZrm; else @@ -463,7 +486,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, RC = &X86::VR512RegClass; break; case MVT::v8f64: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (IsNonTemporal && Alignment >= 64) Opc = X86::VMOVNTDQAZrm; else @@ -474,7 +497,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); // Note: There are a lot more choices based on type with AVX-512, but // there's really no advantage when the load isn't masked. if (IsNonTemporal && Alignment >= 64) @@ -504,6 +527,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, bool HasSSE2 = Subtarget->hasSSE2(); bool HasSSE4A = Subtarget->hasSSE4A(); bool HasAVX = Subtarget->hasAVX(); + bool HasAVX512 = Subtarget->hasAVX512(); + bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); // Get opcode and regclass of the output for the given store instruction. @@ -518,8 +543,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, TII.get(X86::AND8ri), AndResult) .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; + LLVM_FALLTHROUGH; // handle i1 as i8. } - // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; case MVT::i32: @@ -534,7 +559,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSS; else - Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + Opc = HasAVX512 ? X86::VMOVSSZmr : + HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; } else Opc = X86::ST_Fp32m; break; @@ -543,27 +569,34 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSD; else - Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + Opc = HasAVX512 ? X86::VMOVSDZmr : + HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; } else Opc = X86::ST_Fp64m; break; case MVT::v4f32: if (Aligned) { if (IsNonTemporal) - Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + Opc = HasVLX ? X86::VMOVNTPSZ128mr : + HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; else - Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + Opc = HasVLX ? X86::VMOVAPSZ128mr : + HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; } else - Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; + Opc = HasVLX ? X86::VMOVUPSZ128mr : + HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: if (Aligned) { if (IsNonTemporal) - Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + Opc = HasVLX ? X86::VMOVNTPDZ128mr : + HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; else - Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + Opc = HasVLX ? X86::VMOVAPDZ128mr : + HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; } else - Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; + Opc = HasVLX ? X86::VMOVUPDZ128mr : + HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: @@ -571,45 +604,57 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, case MVT::v16i8: if (Aligned) { if (IsNonTemporal) - Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + Opc = HasVLX ? X86::VMOVNTDQZ128mr : + HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; else - Opc = HasAVX ? 
X86::VMOVDQAmr : X86::MOVDQAmr; + Opc = HasVLX ? X86::VMOVDQA64Z128mr : + HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; } else - Opc = HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr; + Opc = HasVLX ? X86::VMOVDQU64Z128mr : + HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr; break; case MVT::v8f32: assert(HasAVX); - if (Aligned) - Opc = IsNonTemporal ? X86::VMOVNTPSYmr : X86::VMOVAPSYmr; - else - Opc = X86::VMOVUPSYmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr; + else + Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr; + } else + Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr; break; case MVT::v4f64: assert(HasAVX); if (Aligned) { - Opc = IsNonTemporal ? X86::VMOVNTPDYmr : X86::VMOVAPDYmr; + if (IsNonTemporal) + Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr; + else + Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr; } else - Opc = X86::VMOVUPDYmr; + Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr; break; case MVT::v8i32: case MVT::v4i64: case MVT::v16i16: case MVT::v32i8: assert(HasAVX); - if (Aligned) - Opc = IsNonTemporal ? X86::VMOVNTDQYmr : X86::VMOVDQAYmr; - else - Opc = X86::VMOVDQUYmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr; + else + Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr; + } else + Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr; break; case MVT::v16f32: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (Aligned) Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr; else Opc = X86::VMOVUPSZmr; break; case MVT::v8f64: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (Aligned) { Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr; } else @@ -619,7 +664,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); // Note: There are a lot more choices based on type with AVX-512, but // there's really no advantage when the store isn't masked. if (Aligned) @@ -659,7 +704,9 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, bool Signed = true; switch (VT.getSimpleVT().SimpleTy) { default: break; - case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. + case MVT::i1: + Signed = false; + LLVM_FALLTHROUGH; // Handle as i8. case MVT::i8: Opc = X86::MOV8mi; break; case MVT::i16: Opc = X86::MOV16mi; break; case MVT::i32: Opc = X86::MOV32mi; break; @@ -895,7 +942,7 @@ redo_gep: for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i, ++GTI) { const Value *Op = *i; - if (StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = GTI.getStructTypeOrNull()) { const StructLayout *SL = DL.getStructLayout(STy); Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); continue; @@ -1454,11 +1501,11 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. - static unsigned SETFOpcTable[2][3] = { + static const uint16_t SETFOpcTable[2][3] = { { X86::SETEr, X86::SETNPr, X86::AND8rr }, { X86::SETNEr, X86::SETPr, X86::OR8rr } }; - unsigned *SETFOpc = nullptr; + const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; @@ -1511,7 +1558,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { // Handle zero-extension from i1 to i8, which is common. 
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); - if (SrcVT.SimpleTy == MVT::i1) { + if (SrcVT == MVT::i1) { // Set the high bits to zero. ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; @@ -1601,7 +1648,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: - std::swap(TrueMBB, FalseMBB); // fall-through + std::swap(TrueMBB, FalseMBB); + LLVM_FALLTHROUGH; case CmpInst::FCMP_UNE: NeedExtraBranch = true; Predicate = CmpInst::FCMP_ONE; @@ -1651,6 +1699,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (TestOpc) { unsigned OpReg = getRegForValue(TI->getOperand(0)); if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) .addReg(OpReg).addImm(1); @@ -1688,8 +1737,17 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; + // In case OpReg is a K register, COPY to a GPR + if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) { + unsigned KOpReg = OpReg; + OpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), OpReg) + .addReg(KOpReg); + } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(OpReg).addImm(1); + .addReg(OpReg) + .addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); @@ -1875,15 +1933,15 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { // Copy the zero into the appropriate sub/super/identical physical // register. Unfortunately the operations needed are not uniform enough // to fit neatly into the table above. - if (VT.SimpleTy == MVT::i16) { + if (VT == MVT::i16) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) .addReg(Zero32, 0, X86::sub_16bit); - } else if (VT.SimpleTy == MVT::i32) { + } else if (VT == MVT::i32) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) .addReg(Zero32); - } else if (VT.SimpleTy == MVT::i64) { + } else if (VT == MVT::i64) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); @@ -1953,11 +2011,11 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
- static unsigned SETFOpcTable[2][3] = { + static const uint16_t SETFOpcTable[2][3] = { { X86::SETNPr, X86::SETEr , X86::TEST8rr }, { X86::SETPr, X86::SETNEr, X86::OR8rr } }; - unsigned *SETFOpc = nullptr; + const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: @@ -2023,8 +2081,17 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return false; bool CondIsKill = hasTrivialKill(Cond); + // In case OpReg is a K register, COPY to a GPR + if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { + unsigned KCondReg = CondReg; + CondReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CondReg) + .addReg(KCondReg, getKillRegState(CondIsKill)); + } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + .addReg(CondReg, getKillRegState(CondIsKill)) + .addImm(1); } const Value *LHS = I->getOperand(1); @@ -2087,12 +2154,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { std::swap(CmpLHS, CmpRHS); // Choose the SSE instruction sequence based on data type (float or double). - static unsigned OpcTable[2][4] = { - { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, - { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr } + static const uint16_t OpcTable[2][4] = { + { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr }, + { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr } }; - unsigned *Opc = nullptr; + const uint16_t *Opc = nullptr; switch (RetVT.SimpleTy) { default: return false; case MVT::f32: Opc = &OpcTable[0][0]; break; @@ -2119,9 +2186,36 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned ResultReg; - - if (Subtarget->hasAVX()) { - const TargetRegisterClass *FR32 = &X86::FR32RegClass; + + if (Subtarget->hasAVX512()) { + // If we have AVX512 we can use a mask compare and masked movss/sd. + const TargetRegisterClass *VR128X = &X86::VR128XRegClass; + const TargetRegisterClass *VK1 = &X86::VK1RegClass; + + unsigned CmpOpcode = + (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr; + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + + // Need an IMPLICIT_DEF for the input that is used to generate the upper + // bits of the result register since its not based on any of the inputs. + unsigned ImplicitDefReg = createResultReg(VR128X); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + + // Place RHSReg is the passthru of the masked movss/sd operation and put + // LHS in the input. The mask input comes from the compare. + unsigned MovOpcode = + (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk; + unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill, + CmpReg, true, ImplicitDefReg, true, + LHSReg, LHSIsKill); + + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg); + + } else if (Subtarget->hasAVX()) { const TargetRegisterClass *VR128 = &X86::VR128RegClass; // If we have AVX, create 1 blendv instead of 3 logic instructions. @@ -2130,11 +2224,11 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. 
unsigned CmpOpcode = - (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; + (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; unsigned BlendOpcode = - (RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr; - - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, + (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr; + + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CmpReg, true); @@ -2142,14 +2236,18 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { + const TargetRegisterClass *VR128 = &X86::VR128RegClass; unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false, LHSReg, LHSIsKill); - unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true, RHSReg, RHSIsKill); - ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, - AndReg, /*IsKill=*/true); + unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg); } updateValueMap(I, ResultReg); return true; @@ -2195,8 +2293,18 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); + + // In case OpReg is a K register, COPY to a GPR + if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { + unsigned KCondReg = CondReg; + CondReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CondReg) + .addReg(KCondReg, getKillRegState(CondIsKill)); + } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + .addReg(CondReg, getKillRegState(CondIsKill)) + .addImm(1); } const Value *LHS = I->getOperand(1); @@ -2522,8 +2630,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // This needs to be set before we call getPtrSizedFrameRegister, otherwise // we get the wrong frame register. 
- MachineFrameInfo *MFI = MF->getFrameInfo(); - MFI->setFrameAddressIsTaken(true); + MachineFrameInfo &MFI = MF->getFrameInfo(); + MFI.setFrameAddressIsTaken(true); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF); @@ -2698,7 +2806,9 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { const Function *Callee = II->getCalledFunction(); auto *Ty = cast<StructType>(Callee->getReturnType()); Type *RetTy = Ty->getTypeAtIndex(0U); - Type *CondTy = Ty->getTypeAtIndex(1); + assert(Ty->getTypeAtIndex(1)->isIntegerTy() && + Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 && + "Overflow value expected to be an i1"); MVT VT; if (!isTypeLegal(RetTy, VT)) @@ -2808,7 +2918,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { if (!ResultReg) return false; - unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + // Assign to a GPR since the overflow return value is lowered to a SETcc. + unsigned ResultReg2 = createResultReg(&X86::GR8RegClass); assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), ResultReg2); @@ -2966,7 +3077,7 @@ bool X86FastISel::fastLowerArguments() { default: llvm_unreachable("Unexpected value type."); case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; - case MVT::f32: // fall-through + case MVT::f32: LLVM_FALLTHROUGH; case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); @@ -3140,7 +3251,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); - if (ArgVT.SimpleTy == MVT::i1) + if (ArgVT == MVT::i1) return false; bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, @@ -3154,7 +3265,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { "Unexpected extend"); // Handle zero-extension from i1 to i8, which is common. - if (ArgVT.SimpleTy == MVT::i1) { + if (ArgVT == MVT::i1) { // Set the high bits to zero. 
ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false); ArgVT = MVT::i8; @@ -3456,8 +3567,14 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { if (!SrcVT.isSimple() || !DstVT.isSimple()) return false; - if (!SrcVT.is128BitVector() && - !(Subtarget->hasAVX() && SrcVT.is256BitVector())) + MVT SVT = SrcVT.getSimpleVT(); + MVT DVT = DstVT.getSimpleVT(); + + if (!SVT.is128BitVector() && + !(Subtarget->hasAVX() && SVT.is256BitVector()) && + !(Subtarget->hasAVX512() && SVT.is512BitVector() && + (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 && + DVT.getScalarSizeInBits() >= 32)))) return false; unsigned Reg = getRegForValue(I->getOperand(0)); @@ -3505,7 +3622,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { unsigned Opc = 0; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); - case MVT::i1: VT = MVT::i8; // fall-through + case MVT::i1: VT = MVT::i8; LLVM_FALLTHROUGH; case MVT::i8: Opc = X86::MOV8ri; break; case MVT::i16: Opc = X86::MOV16ri; break; case MVT::i32: Opc = X86::MOV32ri; break; @@ -3775,6 +3892,38 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, return true; } +unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill, + unsigned Op3, bool Op3IsKill) { + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); + Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2); + Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 3); + + if (II.getNumDefs() >= 1) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)) + .addReg(Op2, getKillRegState(Op2IsKill)) + .addReg(Op3, getKillRegState(Op3IsKill)); + else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)) + .addReg(Op2, getKillRegState(Op2IsKill)) + .addReg(Op3, getKillRegState(Op3IsKill)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); + } + return ResultReg; +} + namespace llvm { FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo, |
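The bulk of the load and store hunks above follow a single pattern: when the subtarget has AVX-512VL, pick the EVEX-encoded Z128/Z256 opcode; otherwise fall back to the VEX (AVX) form, and finally to the legacy SSE form. As a reading aid, here is a minimal sketch of that selection for the MVT::v4f32 load case; the helper name and free-function form are hypothetical, since the committed code assigns to a local Opc inline inside X86FastEmitLoad.

// Illustration only: mirrors the opcode choice X86FastEmitLoad makes for a
// MVT::v4f32 load after this change.  The free-function form is hypothetical;
// the real code sets a local 'Opc' inline.
static unsigned selectV4F32LoadOpc(bool HasVLX, bool HasAVX, bool HasSSE41,
                                   bool IsNonTemporal, unsigned Alignment) {
  if (IsNonTemporal && Alignment >= 16 && HasSSE41)
    return HasVLX ? X86::VMOVNTDQAZ128rm
                  : HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
  if (Alignment >= 16)
    return HasVLX ? X86::VMOVAPSZ128rm
                  : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
  return HasVLX ? X86::VMOVUPSZ128rm
                : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
}

The same three-way choice (VLX, then AVX, then legacy SSE) repeats for v2f64 and the 128-bit integer types, and with Z256/Ymm opcodes for the 256-bit types.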
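The branch and select hunks add essentially the same guard in three places (X86SelectBranch, X86FastEmitCMoveSelect, and X86FastEmitPseudoSelect): an i1 condition produced by an AVX-512 compare may live in a VK1 mask register, which TEST8ri cannot read, so it is first copied into an 8-bit GPR. A condensed sketch, assuming the usual X86FastISel members (FuncInfo, MRI, TII, DbgLoc) and a CondReg/CondIsKill pair from the caller; the X86SelectBranch variant omits the kill flag on the copy.

// If the condition is in a k-register (VK1), move it to a GR8 first so the
// TEST8ri below has a legal operand.
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
  unsigned KCondReg = CondReg;
  CondReg = createResultReg(&X86::GR8RegClass);
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
          TII.get(TargetOpcode::COPY), CondReg)
      .addReg(KCondReg, getKillRegState(CondIsKill));
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
    .addReg(CondReg, getKillRegState(CondIsKill))
    .addImm(1);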
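The new AVX-512 path in X86FastEmitSSESelect, together with the fastEmitInst_rrrr helper added at the end of the diff, lowers a scalar FP select to a mask compare plus a masked move instead of the CMP/AND/ANDN/OR or blendv sequences. A condensed sketch of that sequence for the f32 case, reusing the register numbers and kill flags the surrounding function already has:

// VCMPSSZrr writes a VK1 mask; VMOVSSZrrk then picks LHS where the mask is
// set and keeps the passthru (RHS) elsewhere.  Operand order matches the
// fastEmitInst_rrrr call in the diff: passthru, mask, upper-bits input, source.
unsigned CmpReg = fastEmitInst_rri(X86::VCMPSSZrr, &X86::VK1RegClass,
                                   CmpLHSReg, CmpLHSIsKill,
                                   CmpRHSReg, CmpRHSIsKill, CC);
unsigned ImplicitDefReg = createResultReg(&X86::VR128XRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
        TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
unsigned MovReg = fastEmitInst_rrrr(X86::VMOVSSZrrk, &X86::VR128XRegClass,
                                    RHSReg, RHSIsKill,
                                    CmpReg, /*Op1IsKill=*/true,
                                    ImplicitDefReg, /*Op2IsKill=*/true,
                                    LHSReg, LHSIsKill);

The result is then copied back into the scalar register class that updateValueMap expects, exactly as the diff does with a TargetOpcode::COPY.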