Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86InstrInfo.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86InstrInfo.cpp | 3727
1 file changed, 2636 insertions(+), 1091 deletions(-)
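Most of this diff edits the constructor's memory-operand folding tables, which pair a register-form opcode with its memory-form counterpart plus TB_* constraint flags (TB_FOLDED_STORE, TB_ALIGN_16/32/64, TB_NO_REVERSE), and it also raises the undef-reg-clearance default from 64 to 128. The sketch below shows, in rough terms, how a table of this shape can be queried before folding a load; the flag bit values, the stand-in opcode numbers, and the foldLoad helper are illustrative assumptions for this sketch only, not LLVM's actual interface.

// Minimal sketch: entry layout and flag names mirror the table rows in this
// diff, but the numeric values, opcodes, and lookup helper are hypothetical.
#include <cstdint>
#include <cstdio>

namespace sketch {

enum : uint16_t {
  TB_FOLDED_STORE = 1 << 0, // memory form is a store of the register operand
  TB_NO_REVERSE   = 1 << 1, // fold must not be undone (mem form is narrower)
  TB_ALIGN_16     = 1 << 2, // memory operand must be 16-byte aligned
};

struct FoldEntry {
  uint16_t RegOpcode; // register form, e.g. X86::MOVDQArr
  uint16_t MemOpcode; // memory form,   e.g. X86::MOVDQArm
  uint16_t Flags;     // TB_* constraints on the fold
};

// Hypothetical opcode numbers standing in for X86::* enumerators.
enum : uint16_t { MOVDQArr = 100, MOVDQArm = 101, MOVDQUrr = 102, MOVDQUrm = 103 };

constexpr FoldEntry FoldTable[] = {
  { MOVDQArr, MOVDQArm, TB_ALIGN_16 }, // aligned load: fold only if alignment is known
  { MOVDQUrr, MOVDQUrm, 0 },           // unaligned load: no extra constraint
};

// Return the memory-form opcode for RegOpcode, or 0 if no entry exists or the
// required alignment cannot be guaranteed.
uint16_t foldLoad(uint16_t RegOpcode, unsigned KnownAlign) {
  for (const FoldEntry &E : FoldTable) {
    if (E.RegOpcode != RegOpcode)
      continue;
    if ((E.Flags & TB_ALIGN_16) && KnownAlign < 16)
      return 0; // alignment requirement not met, refuse the fold
    return E.MemOpcode;
  }
  return 0;
}

} // namespace sketch

int main() {
  std::printf("MOVDQArr @ align 8  -> %u\n", (unsigned)sketch::foldLoad(sketch::MOVDQArr, 8));  // 0: blocked
  std::printf("MOVDQUrr @ align 8  -> %u\n", (unsigned)sketch::foldLoad(sketch::MOVDQUrr, 8));  // 103: folded
}

Under this reading, an entry such as { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE } added below says the unaligned register-to-register move may be folded into its store form with no alignment constraint, while the TB_NO_REVERSE additions mark folds that must not be unfolded back into the register form.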
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5f0aab9..627b612 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -68,7 +68,7 @@ static cl::opt<unsigned>
 UndefRegClearance("undef-reg-clearance",
                   cl::desc("How many idle instructions we would like before "
                            "certain undef register reads"),
-                  cl::init(64), cl::Hidden);
+                  cl::init(128), cl::Hidden);
 
 enum {
   // Select which memory operand is being unfolded.
@@ -228,12 +228,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
   { X86::SBB64ri32,   X86::SBB64mi32,  0 },
   { X86::SBB64ri8,    X86::SBB64mi8,   0 },
   { X86::SBB64rr,     X86::SBB64mr,    0 },
+  { X86::SHL16r1,     X86::SHL16m1,    0 },
   { X86::SHL16rCL,    X86::SHL16mCL,   0 },
   { X86::SHL16ri,     X86::SHL16mi,    0 },
+  { X86::SHL32r1,     X86::SHL32m1,    0 },
   { X86::SHL32rCL,    X86::SHL32mCL,   0 },
   { X86::SHL32ri,     X86::SHL32mi,    0 },
+  { X86::SHL64r1,     X86::SHL64m1,    0 },
   { X86::SHL64rCL,    X86::SHL64mCL,   0 },
   { X86::SHL64ri,     X86::SHL64mi,    0 },
+  { X86::SHL8r1,      X86::SHL8m1,     0 },
   { X86::SHL8rCL,     X86::SHL8mCL,    0 },
   { X86::SHL8ri,      X86::SHL8mi,     0 },
   { X86::SHLD16rrCL,  X86::SHLD16mrCL, 0 },
@@ -335,6 +339,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
   { X86::MOVAPDrr,    X86::MOVAPDmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::MOVAPSrr,    X86::MOVAPSmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::MOVDQArr,    X86::MOVDQAmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+  { X86::MOVDQUrr,    X86::MOVDQUmr,      TB_FOLDED_STORE },
   { X86::MOVPDI2DIrr, X86::MOVPDI2DImr,   TB_FOLDED_STORE },
   { X86::MOVPQIto64rr,X86::MOVPQI2QImr,   TB_FOLDED_STORE },
   { X86::MOVSDto64rr, X86::MOVSDto64mr,   TB_FOLDED_STORE },
@@ -380,6 +385,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
   { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
   { X86::VMOVDQArr,   X86::VMOVDQAmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+  { X86::VMOVDQUrr,   X86::VMOVDQUmr,     TB_FOLDED_STORE },
   { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr,  TB_FOLDED_STORE },
   { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
   { X86::VMOVSDto64rr,X86::VMOVSDto64mr,  TB_FOLDED_STORE },
@@ -394,10 +400,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
   { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
   { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
   { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+  { X86::VMOVDQUYrr,  X86::VMOVDQUYmr,    TB_FOLDED_STORE },
   { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
   { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
 
   // AVX-512 foldable instructions
+  { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+  { X86::VEXTRACTPSZrr,   X86::VEXTRACTPSZmr,    TB_FOLDED_STORE },
   { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
   { X86::VMOVAPDZrr,      X86::VMOVAPDZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
   { X86::VMOVAPSZrr,      X86::VMOVAPSZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -409,8 +425,27 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
   { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zmr,
TB_FOLDED_STORE }, { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE }, + { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE }, + { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE }, + { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE }, + { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE }, + { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE }, + { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE }, + { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE }, + { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE }, + { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE }, // AVX-512 foldable instructions (256-bit versions) + { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE }, + { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE }, + { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE }, + { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -421,6 +456,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, + { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE }, // AVX-512 foldable instructions (128-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -471,26 +515,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, - { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, - { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, - { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, - { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, - { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, - { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, - { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 }, + { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE }, + { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE }, + { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE }, + { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE }, + { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE }, { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, { 
X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, - { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, - { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 }, - { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 }, - { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 }, - { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 }, - { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 }, + { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE }, + { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE }, + { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE }, + { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE }, + { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE }, + { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE }, { X86::MOV16rr, X86::MOV16rm, 0 }, { X86::MOV32rr, X86::MOV32rm, 0 }, { X86::MOV64rr, X86::MOV64rm, 0 }, @@ -499,10 +543,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOV8rr, X86::MOV8rm, 0 }, { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, - { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, + { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, + { X86::MOVDQUrr, X86::MOVDQUrm, 0 }, { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 }, { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, @@ -511,51 +556,53 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, - { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, + { X86::MOVUPDrr, X86::MOVUPDrm, 0 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, - { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, - { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, - { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, + { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 }, + { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 }, + { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 }, { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, - { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 }, - { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 }, - { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 }, - { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 }, - { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 }, - { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 }, - { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 }, - { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 }, - { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 }, - { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 }, - { X86::PMOVZXWDrr, X86::PMOVZXWDrm, 
TB_ALIGN_16 }, - { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 }, + { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE }, + { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE }, + { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE }, + { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE }, + { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE }, + { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE }, + { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE }, + { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE }, + { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE }, + { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE }, + { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE }, + { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE }, { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, { X86::RCPSSr, X86::RCPSSm, 0 }, - { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 }, + { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE }, { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, + { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, + { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, - { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE }, { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, { X86::SQRTSDr, X86::SQRTSDm, 0 }, - { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE }, { X86::SQRTSSr, X86::SQRTSSm, 0 }, - { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE }, { X86::TEST16rr, X86::TEST16rm, 0 }, { X86::TEST32rr, X86::TEST32rm, 0 }, { X86::TEST64rr, X86::TEST64rm, 0 }, @@ -586,46 +633,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, // AVX 128-bit versions of foldable instructions - { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, - { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, - { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, - { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE }, + { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE }, + { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE }, + { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE }, { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE }, { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE }, { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE }, { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, - { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, - { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, - { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 }, + { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 
TB_NO_REVERSE }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE }, + { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, - { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, - { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, + { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 }, + { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 }, { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, - { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, - { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, + { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, + { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 }, { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, - { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, - { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, - { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE }, + { X86::VPABSBrr, X86::VPABSBrm, 0 }, + { X86::VPABSDrr, X86::VPABSDrm, 0 }, + { X86::VPABSWrr, X86::VPABSWrm, 0 }, { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, @@ -633,18 +681,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, - { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 }, - { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 }, - { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 }, - { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 }, - { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 }, - { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 }, - { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 }, - { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 }, - { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 }, - { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 }, - { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 }, - { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 }, + { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE }, + { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE }, + { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE }, + { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE }, + { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE }, + { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE }, + { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE }, + { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE }, + { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE }, + { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE }, + { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE }, + { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE }, { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, @@ -661,18 +709,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, // AVX 256-bit foldable instructions - { 
X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, - { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE }, { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, + { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 }, { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, @@ -699,31 +748,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, - { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, - { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, - { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 }, - { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 }, - { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 }, - { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 }, - { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 }, - { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 }, - { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 }, - { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 }, + { X86::VPABSBYrr, X86::VPABSBYrm, 0 }, + { X86::VPABSDYrr, X86::VPABSDYrm, 0 }, + { X86::VPABSWYrr, X86::VPABSWYrm, 0 }, + { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE }, + { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE }, + { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE }, + { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE }, + { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE }, + { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE }, + { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE }, + { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE }, { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, { X86::VPERMQYri, X86::VPERMQYmi, 0 }, - { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 }, - { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 }, + { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE }, + { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE }, { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, - { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 }, - { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 }, - { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 }, + { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE }, + { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE }, + { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE }, { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, - { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 }, + { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE }, { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, @@ -817,7 +866,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, // AVX-512 foldable instructions + { 
X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE }, { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, + { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, @@ -831,12 +885,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 }, + { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 }, + { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, + { X86::VPERMQZri, X86::VPERMQZmi, 0 }, + { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 }, + { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE }, + { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 }, + { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 }, + { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 }, + { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 }, + { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 }, + { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE }, + { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 }, + { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 }, + { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 }, + { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 }, + { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 }, + { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 }, + { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, // AVX-512 foldable instructions (256-bit versions) + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, @@ -847,12 +920,29 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 }, + { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 }, + { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 }, + { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 }, + { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 }, + { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 }, + { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 }, + { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE }, + { 
X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 }, + { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 }, + { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 }, + { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE }, + { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 }, + { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 }, + { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 }, // AVX-512 foldable instructions (128-bit versions) + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, @@ -863,8 +953,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 }, + { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 }, + { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE }, + { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 }, + { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 }, + { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 }, + // F16C foldable instructions { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, @@ -896,9 +1002,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, { X86::ADDSDrr, X86::ADDSDrm, 0 }, - { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 }, + { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE }, { X86::ADDSSrr, X86::ADDSSrm, 0 }, - { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 }, + { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE }, { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, { X86::AND16rr, X86::AND16rm, 0 }, @@ -970,24 +1076,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, { X86::DIVSDrr, X86::DIVSDrm, 0 }, - { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 }, + { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE }, { X86::DIVSSrr, X86::DIVSSrm, 0 }, - { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, + { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE }, { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, - - // Do not fold Fs* scalar logical op loads because there are no scalar - // load variants for these instructions. When folded, the load is required - // to be 128-bits, so the load size would not match. 
- - { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, - { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, - { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 }, - { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 }, - { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 }, - { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 }, - { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 }, - { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 }, { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, @@ -995,34 +1088,42 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::IMUL16rr, X86::IMUL16rm, 0 }, { X86::IMUL32rr, X86::IMUL32rm, 0 }, { X86::IMUL64rr, X86::IMUL64rm, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, - { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE }, + { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE }, { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, - { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, + { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE }, { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, + { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, + { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 }, { X86::MAXSDrr, X86::MAXSDrm, 0 }, - { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, + { X86::MAXCSDrr, X86::MAXCSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE }, { X86::MAXSSrr, X86::MAXSSrm, 0 }, - { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, + { X86::MAXCSSrr, X86::MAXCSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE }, { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, + { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 }, { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, + { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 }, { X86::MINSDrr, X86::MINSDrm, 0 }, - { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, + { X86::MINCSDrr, X86::MINCSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE }, { X86::MINSSrr, X86::MINSSrm, 0 }, - { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MINCSSrr, X86::MINCSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE }, { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, { X86::MULSDrr, X86::MULSDrm, 0 }, - { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 }, + { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE }, { X86::MULSSrr, X86::MULSSrm, 0 }, - { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 }, + { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE }, { X86::OR16rr, X86::OR16rm, 0 }, { X86::OR32rr, X86::OR32rm, 0 }, { X86::OR64rr, X86::OR64rm, 0 }, @@ -1067,7 +1168,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PINSRDrr, X86::PINSRDrm, 0 }, { X86::PINSRQrr, X86::PINSRQrm, 0 }, { X86::PINSRWrri, X86::PINSRWrmi, 0 }, - { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, + { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 }, { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, @@ -1082,7 +1183,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PMAXUDrr, X86::PMAXUDrm, 
TB_ALIGN_16 }, { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, - { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, + { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 }, { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, @@ -1119,8 +1220,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, - { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, - { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, + { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE }, + { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE }, { X86::SBB32rr, X86::SBB32rm, 0 }, { X86::SBB64rr, X86::SBB64rm, 0 }, { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, @@ -1132,9 +1233,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, { X86::SUBSDrr, X86::SUBSDrm, 0 }, - { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 }, + { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, - { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 }, + { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, // FIXME: TEST*rr -> swapped operand of TEST*mr. { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, @@ -1240,7 +1341,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX 128-bit versions of foldable instructions { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, - { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, + { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE }, { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, @@ -1250,21 +1351,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, - { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, - { X86::VRCPSSr, X86::VRCPSSm, 0 }, - { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 }, - { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, - { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 }, - { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, - { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 }, - { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, - { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 }, + { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, - { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 }, + { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, - { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 }, + { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE }, { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, @@ -1282,48 +1375,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, - { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 }, + { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, - { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, + { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE }, { X86::VDPPDrri, X86::VDPPDrmi, 0 }, { X86::VDPPSrri, 
X86::VDPPSrmi, 0 }, - // Do not fold VFs* loads because there are no scalar load variants for - // these instructions. When folded, the load is required to be 128-bits, so - // the load size would not match. - { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 }, - { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 }, - { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 }, - { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 }, - { X86::VFvORPDrr, X86::VFvORPDrm, 0 }, - { X86::VFvORPSrr, X86::VFvORPSrm, 0 }, - { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 }, - { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 }, { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, - { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, - { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, + { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE }, + { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE }, + { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, + { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, + { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, + { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 }, { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, - { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, - { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE }, + { X86::VMINCPDrr, X86::VMINCPDrm, 0 }, + { X86::VMINCPSrr, X86::VMINCPSrm, 0 }, + { X86::VMINCSDrr, X86::VMINCSDrm, 0 }, + { X86::VMINCSSrr, X86::VMINCSSrm, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, { X86::VMINPSrr, X86::VMINPSrm, 0 }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, - { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, - { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE }, { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, - { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 }, + { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, - { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 }, + { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE }, { X86::VORPDrr, X86::VORPDrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, @@ -1366,7 +1456,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, - { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, + { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 }, { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, @@ -1381,7 +1471,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, - { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 }, + { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 }, { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, @@ -1418,16 +1508,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, { X86::VPXORrr, X86::VPXORrm, 0 }, + { 
X86::VRCPSSr, X86::VRCPSSm, 0 }, + { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE }, + { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE }, { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, + { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE }, { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, + { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE }, { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, + { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE }, + { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, - { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 }, + { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, - { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 }, + { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE }, { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, @@ -1458,8 +1558,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, + { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 }, + { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 }, { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, + { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 }, + { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 }, { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, @@ -1520,7 +1624,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, - { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 }, + { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 }, { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, @@ -1536,7 +1640,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, - { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 }, + { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 }, { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, @@ -1559,8 +1663,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, - { X86::VPSRAVD_Intrr, X86::VPSRAVD_Intrm, 0 }, - { X86::VPSRAVD_IntYrr, X86::VPSRAVD_IntYrm, 0 }, { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, @@ -1588,37 +1690,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE }, { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE }, { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE }, + { 
X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE }, { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE }, { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE }, + { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE }, { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE }, { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE }, { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE }, { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE }, { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE }, // XOP foldable instructions { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 }, @@ -1678,38 +1788,107 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADOX64rr, X86::ADOX64rm, 0 }, // AVX-512 foldable instructions - { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, - { X86::VADDSSZrr, X86::VADDSSZrm, 0 }, - { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, 0 }, + { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDSDZrr, X86::VADDSDZrm, 0 }, - { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, 0 }, - { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, - { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, - { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, - { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, 0 }, - { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, - { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, 0 }, - { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, - { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, - { X86::VMULSSZrr, 
X86::VMULSSZrm, 0 }, - { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, 0 }, - { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, - { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, 0 }, - { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, + { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE }, + { X86::VADDSSZrr, X86::VADDSSZrm, 0 }, + { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE }, + { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, + { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, + { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 }, + { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 }, + { X86::VANDPDZrr, X86::VANDPDZrm, 0 }, + { X86::VANDPSZrr, X86::VANDPSZrm, 0 }, + { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, + { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, + { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 }, + { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 }, + { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, + { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, + { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, - { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, - { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, 0 }, + { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, - { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, 0 }, - { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, - { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, - { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, + { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE }, + { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, + { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE }, + { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 }, + { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 }, + { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 }, + { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 }, + { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 }, + { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 }, + { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 }, + { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 }, + { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 }, + { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 }, + { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 }, + { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 }, { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, + { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, + { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 }, + { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE }, + { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 }, + { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE }, + { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 }, + { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 }, + { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 }, + { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 }, + { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, + { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, + { X86::VMINSDZrr, X86::VMINSDZrm, 0 }, + { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, + { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, + { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, + { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, + { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, + { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, + { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE }, + { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, + { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, + { X86::VORPDZrr, X86::VORPDZrm, 0 }, + { X86::VORPSZrr, X86::VORPSZrm, 0 }, + { X86::VPADDBZrr, X86::VPADDBZrm, 0 }, { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, - { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, + { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 }, + { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 }, + { 
X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 }, + { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 }, + { X86::VPADDWZrr, X86::VPADDWZrm, 0 }, + { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 }, + { X86::VPANDDZrr, X86::VPANDDZrm, 0 }, + { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 }, + { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 }, + { X86::VPANDQZrr, X86::VPANDQZrm, 0 }, + { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 }, + { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 }, + { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 }, + { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 }, + { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 }, + { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 }, + { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 }, + { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 }, + { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 }, + { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 }, + { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 }, + { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 }, + { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 }, + { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 }, + { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 }, + { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 }, + { X86::VPERMBZrr, X86::VPERMBZrm, 0 }, + { X86::VPERMDZrr, X86::VPERMDZrm, 0 }, + { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 }, + { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 }, + { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 }, { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, + { X86::VPERMQZrr, X86::VPERMQZrm, 0 }, + { X86::VPERMWZrr, X86::VPERMWZrm, 0 }, + { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 }, + { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 }, { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, @@ -1719,31 +1898,297 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, + { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, + { X86::VPORDZrr, X86::VPORDZrm, 0 }, + { X86::VPORQZrr, X86::VPORQZrm, 0 }, + { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 }, { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, + { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 }, { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, + { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 }, + { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 }, + { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 }, + { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 }, + { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 }, + { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 }, + { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 }, + { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 }, + { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 }, + { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 }, + { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 }, + { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 }, + { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 }, + { X86::VPXORDZrr, X86::VPXORDZrm, 0 }, + { X86::VPXORQZrr, X86::VPXORQZrm, 0 }, { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, - { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, - { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, - { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, - { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, - - // AVX-512{F,VL} foldable instructions - { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, - { 
X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, + { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, + { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, + { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE }, + { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, + { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE }, + { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 }, + { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 }, + { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 }, + { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 }, + { X86::VXORPDZrr, X86::VXORPDZrm, 0 }, + { X86::VXORPSZrr, X86::VXORPSZrm, 0 }, // AVX-512{F,VL} foldable instructions { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, + { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 }, + { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 }, + { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 }, + { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 }, + { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 }, + { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 }, + { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 }, + { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 }, + { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 }, + { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 }, + { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 }, + { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 }, + { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, + { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 }, + { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, + { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, + { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 }, + { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 }, + { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 }, + { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 }, + { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 }, + { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 }, + { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 }, + { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 }, + { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 }, + { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 }, + { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 }, + { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 }, + { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 }, + { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 }, + { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 }, + { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 }, + { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 }, + { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 }, + { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 }, + { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 }, + { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 }, + { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 }, + { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 }, + { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 }, + { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 }, + { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 }, + { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 }, + { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 }, + { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 }, + { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 }, + { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 }, + { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 }, + { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, + { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 }, + { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 }, + { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 }, + { 
X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 }, + { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 }, + { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 }, + { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 }, + { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 }, + { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 }, + { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 }, + { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 }, + { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 }, + { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 }, + { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 }, + { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 }, + { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 }, + { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 }, + { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 }, + { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 }, + { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 }, + { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 }, + { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 }, + { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 }, + { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 }, + { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 }, + { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 }, + { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 }, + { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 }, + { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 }, + { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 }, + { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 }, + { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 }, + { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 }, + { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 }, + { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 }, + { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 }, + { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 }, + { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 }, + { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 }, + { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 }, + { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 }, + { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 }, + { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 }, + { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 }, + { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 }, + { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 }, + { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 }, + { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 }, + { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 }, + { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 }, + { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 }, + { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 }, + { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 }, + { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 }, + { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 }, + { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 }, + { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 }, + { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 }, + { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 }, + { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 }, + { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 }, + { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 }, + { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 }, + { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 }, + { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 }, + { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 }, + { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 }, + { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 }, + { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 }, + { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 }, + { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 }, + { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 }, + { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 }, + { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 }, + { 
X86::VPORDZ128rr, X86::VPORDZ128rm, 0 }, + { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 }, + { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 }, + { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 }, + { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 }, + { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 }, + { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 }, + { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 }, + { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 }, + { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 }, + { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 }, + { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 }, + { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 }, + { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 }, + { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 }, + { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 }, + { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 }, + { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 }, + { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 }, + { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 }, + { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 }, + { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 }, + { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 }, + { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 }, + { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 }, + { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 }, + { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 }, + { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 }, + { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 }, + { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 }, + { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 }, + { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 }, + { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 }, + { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 }, + { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 }, + { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 }, + { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 }, + { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 }, + { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 }, + { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 }, + { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 }, + { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 }, + { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 }, + { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, + { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, + { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 }, + { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 }, + { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 }, + { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 }, + { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 }, + { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 }, + { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 }, + { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 }, + { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 }, + { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 }, + { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 }, + { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 }, + { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, + + // AVX-512 masked foldable instructions + { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 }, + { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 }, + { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 }, + { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 }, + { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 }, + { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 }, + { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 }, + { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 }, + { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 }, + { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 }, + { 
X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 }, + { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 }, + { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 }, + { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 }, + { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 }, + { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 }, + { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 }, + + // AVX-512VL 256-bit masked foldable instructions + { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 }, + { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 }, + { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 }, + { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 }, + { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 }, + { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 }, + { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 }, + { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 }, + { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 }, + { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 }, + { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE }, + { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 }, + { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 }, + { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 }, + + // AVX-512VL 128-bit masked foldable instructions + { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 }, + { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 }, + { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE }, + { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 }, + { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 }, + { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 }, // AES foldable instructions { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, @@ -1773,170 +2218,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) } static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { - // FMA foldable instructions - { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, - { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, - { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, - { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, - { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSSr213r, 
X86::VFMADDSSr213m, TB_ALIGN_NONE }, - { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, - { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, - { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, - { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE }, - { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE }, - { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE }, - { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE }, - { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE }, - { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE }, - { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE }, - { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE }, - { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE }, - { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, - - { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, - { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, - { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, - { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, - { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, - { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, - { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, - { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, - { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE }, - { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE }, - { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE }, - { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE }, - { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE }, - { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE }, - { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE }, - { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE }, - { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE }, - { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, - - { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, - { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, - { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, - { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, - { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, - { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, - { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, - { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, - { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE }, - { X86::VFMSUBPDr132r, 
X86::VFMSUBPDr132m, TB_ALIGN_NONE }, - { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE }, - { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE }, - { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE }, - { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE }, - { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE }, - { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, - - { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE }, - - { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE }, - - { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE }, - { 
X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE }, - // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE }, { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE }, { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE }, { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE }, { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE }, { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE }, + { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE }, { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE }, { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE }, { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE }, { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE }, { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE }, { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE }, { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE }, // XOP foldable instructions { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 }, @@ -1947,11 +2269,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMIL2PSrrY, 
X86::VPERMIL2PSrmY, 0 }, { X86::VPPERMrrr, X86::VPPERMrrm, 0 }, - // AVX-512 VPERMI instructions with 3 source operands. - { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, - { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, - { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, - { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, + // AVX-512 instructions with 3 source operands. { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, @@ -1961,45 +2279,349 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, - // AVX-512 arithmetic instructions - { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 }, + { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, + { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, + { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, + { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, + { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 }, + { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 }, + { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 }, + { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 }, + { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 }, + { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 }, + { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 }, + { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 }, + { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 }, + + // AVX-512VL 256-bit instructions with 3 source operands. + { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 }, + { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 }, + { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 }, + { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 }, + { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 }, + { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 }, + { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 }, + { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 }, + { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 }, + { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 }, + { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 }, + { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 }, + { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 }, + { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 }, + + // AVX-512VL 128-bit instructions with 3 source operands. 
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 }, + { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 }, + { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 }, + { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 }, + { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 }, + { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 }, + { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 }, + { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 }, + { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 }, + { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 }, + { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 }, + { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 }, + { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 }, + { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 }, + + // AVX-512 masked instructions { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, - { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, - { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, - { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, - { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, - { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 }, + { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 }, + { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 }, + { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 }, + { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 }, + { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 }, { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, - { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, - { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, - { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 }, + { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 }, + { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 }, + { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 }, + { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 }, + { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 }, + { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 }, + { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 }, + { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 }, + { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, - // AVX-512{F,VL} arithmetic instructions 256-bit - { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, + { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, + { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, + { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, + { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 }, + { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 }, + { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 }, + { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 }, + { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 }, + { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 }, + { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 }, + { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 }, + { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 }, + { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 }, + { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 }, + { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 }, + { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 }, + { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 }, + { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 }, + { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, + { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 }, + { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 }, + { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 }, + { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 }, + { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 
}, + { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 }, + { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 }, + { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 }, + { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 }, + { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 }, + { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 }, + { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 }, + { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 }, + { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 }, + { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 }, + { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 }, + { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 }, + { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 }, + { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 }, + { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 }, + { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 }, + { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 }, + { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 }, + { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 }, + { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 }, + { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 }, + { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 }, + { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 }, + { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 }, + { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 }, + { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, + { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 }, + { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 }, + { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 }, + { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 }, + { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 }, + { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 }, + + // AVX-512{F,VL} masked arithmetic instructions 256-bit { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, - { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, - { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, - { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, - { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, - { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 }, + { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 }, + { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 }, + { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 }, + { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 }, + { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 }, { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, - { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, - { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, - { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 }, + { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 }, + { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 }, + { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 }, + { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 }, + { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 }, { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, - // AVX-512{F,VL} arithmetic instructions 128-bit - { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 }, + { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 }, + { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, + { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, + { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, + { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, + { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, + { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 
}, + { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 }, + { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 }, + { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 }, + { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 }, + { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 }, + { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 }, + { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 }, + { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 }, + { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 }, + { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 }, + { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 }, + { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 }, + { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 }, + { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, + { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 }, + { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 }, + { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 }, + { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 }, + { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 }, + { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 }, + { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 }, + { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 }, + { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 }, + { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 }, + { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 }, + { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 }, + { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 }, + { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 }, + { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 }, + { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 }, + { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 }, + { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 }, + { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 }, + { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 }, + { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 }, + { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 }, + { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 }, + { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 }, + { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 }, + { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 }, + { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 }, + { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 }, + { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 }, + { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 }, + { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, + { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, + { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, + { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 }, + { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 }, + { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 }, + { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 }, + { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 }, + + // AVX-512{F,VL} masked arithmetic instructions 128-bit { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, - { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, - { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, - { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, - { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, - { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 }, + { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 }, + { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 }, + { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 }, + { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 }, + { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 }, { 
X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, - { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, - { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 }, + { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 }, + { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }, { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, - { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } + { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 }, + { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 }, + { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, + { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, + { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, + { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, + { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 }, + { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 }, + { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 }, + { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 }, + { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 }, + { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 }, + { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 }, + { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 }, + { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 }, + { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 }, + { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 }, + { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 }, + { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 }, + { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 }, + { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, + { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 }, + { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 }, + { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 }, + { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, + { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 }, + { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 }, + { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 }, + { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 }, + { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 }, + { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 }, + { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 }, + { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 }, + { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 }, + { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 }, + { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 }, + { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 }, + { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 }, + { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 }, + { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 }, + { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 }, + { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 }, + { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 }, + { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 }, + { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 }, + { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 }, + { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 }, + { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, + { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, + { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, + { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 }, + { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 }, + { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 }, + { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 }, + { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, + + // AVX-512 masked foldable instructions + { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 }, + 
{ X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 }, + { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 }, + { X86::VPERMQZrik, X86::VPERMQZmik, 0 }, + { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 }, + { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE }, + { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 }, + { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 }, + { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 }, + { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 }, + { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 }, + { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE }, + { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 }, + { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 }, + { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 }, + { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 }, + { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 }, + { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 }, + { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 }, + + // AVX-512VL 256-bit masked foldable instructions + { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 }, + { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 }, + { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 }, + { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 }, + { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 }, + { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 }, + { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 }, + { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 }, + { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 }, + { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 }, + { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE }, + { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 }, + { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 }, + { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 }, + + // AVX-512VL 128-bit masked foldable instructions + { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 }, + { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 }, + { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE }, + { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 }, + { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 }, + { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 }, }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { @@ -2008,47 +2630,348 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 3, folded load Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } + auto I = X86InstrFMA3Info::rm_begin(); + auto E = X86InstrFMA3Info::rm_end(); + for (; I != E; ++I) { + if (!I.getGroup()->isKMasked()) { + // Intrinsic forms need to pass TB_NO_REVERSE. 
+ if (I.getGroup()->isIntrinsic()) { + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE); + } else { + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD); + } + } + } static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { - // AVX-512 foldable instructions - { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + // AVX-512 foldable masked instructions { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, - { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, - { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, - { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, - { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, - { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 }, + { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 }, + { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 }, + { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 }, + { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 }, + { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, - { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, - { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, - { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 }, + { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 }, + { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 }, + { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 }, + { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 }, + { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 }, + { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 }, + { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 }, + { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 }, + { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, - // AVX-512{F,VL} foldable instructions 256-bit - { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, + { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, + { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, + { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, + { X86::VORPDZrrk, X86::VORPDZrmk, 0 }, + { X86::VORPSZrrk, X86::VORPSZrmk, 0 }, + { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 }, + { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 }, + { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 }, + { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 }, + { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 }, + { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 }, + { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 }, + { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 }, + { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 }, + { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 }, + { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 }, + { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 }, + { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 }, + { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 }, + { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 }, + { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 }, + { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 }, + { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 }, + { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 }, + { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 }, + { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 }, + { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 }, + { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 }, + { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 }, + { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 }, + { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 }, + { 
X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 }, + { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 }, + { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 }, + { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 }, + { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 }, + { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 }, + { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, + { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, + { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, + { X86::VPORDZrrk, X86::VPORDZrmk, 0 }, + { X86::VPORQZrrk, X86::VPORQZrmk, 0 }, + { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 }, + { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 }, + { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 }, + { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 }, + { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 }, + { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 }, + { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 }, + { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 }, + { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 }, + { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 }, + { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 }, + { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 }, + { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 }, + { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 }, + { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 }, + { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 }, + { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 }, + { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 }, + { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 }, + { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 }, + { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, + { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 }, + { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 }, + { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 }, + { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 }, + { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 }, + { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, + + // AVX-512{F,VL} foldable masked instructions 256-bit { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, - { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, - { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, - { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, - { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, - { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 }, + { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 }, + { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 }, + { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 }, + { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 }, + { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 }, { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, - { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, - { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, - { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 }, + { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 }, + { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 }, + { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 }, + { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 }, + { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 }, { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, + { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 }, + { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 }, + { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, + { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, + { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, + { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, + { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, 
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 }, + { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 }, + { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 }, + { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 }, + { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 }, + { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 }, + { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 }, + { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 }, + { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 }, + { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 }, + { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 }, + { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 }, + { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 }, + { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 }, + { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 }, + { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 }, + { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 }, + { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 }, + { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 }, + { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 }, + { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 }, + { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 }, + { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 }, + { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 }, + { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 }, + { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 }, + { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 }, + { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 }, + { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 }, + { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 }, + { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 }, + { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 }, + { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 }, + { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, + { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, + { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, + { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 }, + { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 }, + { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 }, + { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 }, + { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 }, + { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 }, + { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 }, + { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 }, + { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 }, + { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 }, + { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 }, + { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 }, + { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 }, + { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 }, + { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 }, + { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 }, + { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 }, + { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 }, + { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 }, + { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 }, + { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 }, + { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 }, + { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 }, + { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, + { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, + { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, + { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 }, + { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 }, + { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 }, + { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 }, + { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 }, + // AVX-512{F,VL} foldable instructions 128-bit - { 
X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, - { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, - { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, - { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, - { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, - { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, + { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 }, + { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 }, + { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 }, + { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 }, + { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 }, + { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 }, { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, - { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, - { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 }, + { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 }, + { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }, { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, - { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } + { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 }, + { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 }, + { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, + { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, + { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, + { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, + { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 }, + { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 }, + { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 }, + { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 }, + { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 }, + { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 }, + { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 }, + { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 }, + { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 }, + { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 }, + { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 }, + { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 }, + { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 }, + { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 }, + { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 }, + { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 }, + { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 }, + { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 }, + { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 }, + { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 }, + { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 }, + { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 }, + { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 }, + { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 }, + { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 }, + { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 }, + { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 }, + { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 }, + { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 }, + { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, + { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, + { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, + { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 }, + { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 }, + { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 }, + { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 }, + { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 }, + { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 }, + { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 }, + { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 }, + { X86::VPSUBUSBZ128rrk, 
X86::VPSUBUSBZ128rmk, 0 }, + { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 }, + { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 }, + { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 }, + { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 }, + { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 }, + { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 }, + { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 }, + { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 }, + { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 }, + { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 }, + { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 }, + { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 }, + { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 }, + { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 }, + { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, + { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, + { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, + { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 }, + { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 }, + { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 }, + { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 }, + { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 }, + + // 512-bit three source instructions with zero masking. + { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 }, + { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 }, + { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 }, + { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 }, + { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 }, + { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 }, + { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 }, + { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 }, + { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 }, + { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 }, + { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 }, + { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 }, + { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, + { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, + + // 256-bit three source instructions with zero masking. + { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 }, + { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 }, + { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 }, + { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 }, + { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 }, + { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 }, + { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 }, + { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 }, + { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 }, + { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 }, + { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 }, + { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 }, + { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, + { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, + + // 128-bit three source instructions with zero masking. 
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 }, + { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 }, + { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 }, + { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 }, + { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 }, + { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 }, + { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 }, + { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 }, + { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 }, + { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 }, + { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 }, + { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 }, + { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, + { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { @@ -2057,21 +2980,35 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 4, folded load Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } + for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) { + if (I.getGroup()->isKMasked()) { + // Intrinsics need to pass TB_NO_REVERSE. + if (I.getGroup()->isIntrinsic()) { + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE); + } else { + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD); + } + } + } } void X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable, MemOp2RegOpTableType &M2RTable, uint16_t RegOp, uint16_t MemOp, uint16_t Flags) { - if ((Flags & TB_NO_FORWARD) == 0) { - assert(!R2MTable.count(RegOp) && "Duplicate entry!"); - R2MTable[RegOp] = std::make_pair(MemOp, Flags); - } - if ((Flags & TB_NO_REVERSE) == 0) { - assert(!M2RTable.count(MemOp) && - "Duplicated entries in unfolding maps?"); - M2RTable[MemOp] = std::make_pair(RegOp, Flags); - } + if ((Flags & TB_NO_FORWARD) == 0) { + assert(!R2MTable.count(RegOp) && "Duplicate entry!"); + R2MTable[RegOp] = std::make_pair(MemOp, Flags); + } + if ((Flags & TB_NO_REVERSE) == 0) { + assert(!M2RTable.count(MemOp) && + "Duplicated entries in unfolding maps?"); + M2RTable[MemOp] = std::make_pair(RegOp, Flags); + } } bool @@ -2235,9 +3172,13 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVAPSZrm: case X86::VMOVAPSZ128rm: case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVUPSZrm: case X86::VMOVUPSZ128rm: case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZ128rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZrm: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: @@ -2305,9 +3246,13 @@ static bool isFrameStoreOpcode(int Opcode) { case X86::VMOVUPSZmr: case X86::VMOVUPSZ128mr: case X86::VMOVUPSZ256mr: + case X86::VMOVUPSZ128mr_NOVLX: + case X86::VMOVUPSZ256mr_NOVLX: case X86::VMOVAPSZmr: case X86::VMOVAPSZ128mr: case X86::VMOVAPSZ256mr: + case X86::VMOVAPSZ128mr_NOVLX: + case X86::VMOVAPSZ256mr_NOVLX: case X86::VMOVUPDZmr: case X86::VMOVUPDZ128mr: case X86::VMOVUPDZ256mr: @@ -2409,6 +3354,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, switch (MI.getOpcode()) { default: break; case X86::MOV8rm: + case X86::MOV8rm_NOREX: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: @@ -2418,6 +3364,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: + case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: case 
X86::VMOVSSrm: @@ -2425,25 +3372,27 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: + case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: + case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: - case X86::FsVMOVAPSrm: - case X86::FsVMOVAPDrm: - case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: // AVX-512 + case X86::VMOVSSZrm: + case X86::VMOVSDZrm: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: case X86::VMOVAPDZrm: case X86::VMOVAPSZ128rm: case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVAPSZrm: case X86::VMOVDQA32Z128rm: case X86::VMOVDQA32Z256rm: @@ -2463,15 +3412,20 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::VMOVDQU8Z128rm: case X86::VMOVDQU8Z256rm: case X86::VMOVDQU8Zrm: + case X86::VMOVUPDZ128rm: + case X86::VMOVUPDZ256rm: + case X86::VMOVUPDZrm: case X86::VMOVUPSZ128rm: case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZ128rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. if (MI.getOperand(1 + X86::AddrBaseReg).isReg() && MI.getOperand(1 + X86::AddrScaleAmt).isImm() && MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && - MI.isInvariantLoad(AA)) { + MI.isDereferenceableInvariantLoad(AA)) { unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; @@ -2694,24 +3648,8 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, ImplicitOp.setImplicit(); NewSrc = getX86SubSuperRegister(Src.getReg(), 64); - MachineBasicBlock::LivenessQueryResult LQR = - MI.getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); - - switch (LQR) { - case MachineBasicBlock::LQR_Unknown: - // We can't give sane liveness flags to the instruction, abandon LEA - // formation. - return false; - case MachineBasicBlock::LQR_Live: - isKill = MI.killsRegister(SrcReg); - isUndef = false; - break; - default: - // The physreg itself is dead, so we have to use it as an <undef>. - isKill = false; - isUndef = true; - break; - } + isKill = Src.isKill(); + isUndef = Src.isUndef(); } else { // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. @@ -3079,7 +4017,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) .addOperand(Dest) .addOperand(Src), - MI.getOperand(2).getImm()); + MI.getOperand(2)); break; case X86::ADD32ri: case X86::ADD32ri8: @@ -3102,7 +4040,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (ImplicitOp.getReg() != 0) MIB.addOperand(ImplicitOp); - NewMI = addOffset(MIB, MI.getOperand(2).getImm()); + NewMI = addOffset(MIB, MI.getOperand(2)); break; } case X86::ADD16ri: @@ -3116,7 +4054,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest) .addOperand(Src), - MI.getOperand(2).getImm()); + MI.getOperand(2)); break; } @@ -3133,156 +4071,236 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// Returns true if the given instruction opcode is FMA3. 
-/// Otherwise, returns false. -/// The second parameter is optional and is used as the second return from -/// the function. It is set to true if the given instruction has FMA3 opcode -/// that is used for lowering of scalar FMA intrinsics, and it is set to false -/// otherwise. -static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { - if (IsIntrinsic) - *IsIntrinsic = false; +/// This determines which of three possible cases of a three source commute +/// the source indexes correspond to taking into account any mask operands. +/// All prevents commuting a passthru operand. Returns -1 if the commute isn't +/// possible. +/// Case 0 - Possible to commute the first and second operands. +/// Case 1 - Possible to commute the first and third operands. +/// Case 2 - Possible to commute the second and third operands. +static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, + unsigned SrcOpIdx2) { + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); - switch (Opcode) { - case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: - case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: - case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: - case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: - case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: - case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: - case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: - case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: - - case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: - case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: - case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: - case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: - case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: - case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: - case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: - case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: - - case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: - case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: - case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: - case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: - case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: - case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: - case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: - case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: - - case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: - case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: - case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: - case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: - case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: - case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: - case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: - case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: - - case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: - case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: - case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: - case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: - case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: - case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: - case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: - case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: - case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: - case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: - case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: - case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: - case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: - case 
X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: - case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: - case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: - - case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: - case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: - case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: - case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: - case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: - case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: - case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: - case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: - - case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: - case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: - case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: - case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: - case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: - case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: - case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: - case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: - case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: - case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: - case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: - case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: - case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: - case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: - case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: - case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: - - case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: - case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: - case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: - case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: - case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: - case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: - case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: - case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: - - case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: - case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: - case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: - case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: - case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: - case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: - case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: - case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: - case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: - case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: - case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: - case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: - case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: - case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: - case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: - case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: - return true; + unsigned Op1 = 1, Op2 = 2, Op3 = 3; + if (X86II::isKMasked(TSFlags)) { + // The k-mask operand cannot be commuted. + if (SrcOpIdx1 == 2) + return -1; + + // For k-zero-masked operations it is Ok to commute the first vector + // operand. + // For regular k-masked operations a conservative choice is done as the + // elements of the first vector operand, for which the corresponding bit + // in the k-mask operand is set to 0, are copied to the result of the + // instruction. + // TODO/FIXME: The commute still may be legal if it is known that the + // k-mask operand is set to either all ones or all zeroes. 
+ // It is also Ok to commute the 1st operand if all users of MI use only + // the elements enabled by the k-mask operand. For example, + // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i] + // : v1[i]; + // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 -> + // // Ok, to commute v1 in FMADD213PSZrk. + if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1) + return -1; + Op2++; + Op3++; + } + + if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2) + return 0; + if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3) + return 1; + if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3) + return 2; + return -1; +} - case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: - case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: - case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: - case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: - case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: - case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: - case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: - case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: - - case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: - case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: - case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: - case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: - case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: - case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: - case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: - case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: - - case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: - case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: - case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: - case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: - case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: - case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: - case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: - case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: - if (IsIntrinsic) - *IsIntrinsic = true; - return true; - default: - return false; +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( + const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const { + + unsigned Opc = MI.getOpcode(); + + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis are + // not implemented yet. So, just return 0 in that case. + // When such analysis are available this place will be the right place for + // calling it. + if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1) + return 0; + + // Determine which case this commute is or if it can't be done. + int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2); + if (Case < 0) + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. 
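The FormMapping table defined just below can be sanity-checked with plain arithmetic. The following is a standalone sketch in plain C++ (no LLVM types; the F132/F213/F231 names and the eval() helper are illustrative stand-ins for the patch's Form*Index constants and the FMA semantics), evaluating each form on scalar sources and confirming that every commute case maps to a form computing the same value.

// Standalone sanity check of the 132/213/231 relabeling used when two FMA
// source operands are swapped.  This models only the arithmetic; operand
// tying, masking and opcode selection are handled by the patch itself.
#include <cassert>

enum Form { F132 = 0, F213 = 1, F231 = 2 };

// Value produced by each form from sources (s1, s2, s3); s1 is the tied
// destination source.
static double eval(Form F, double S1, double S2, double S3) {
  switch (F) {
  case F132: return S1 * S3 + S2;
  case F213: return S2 * S1 + S3;
  case F231: return S2 * S3 + S1;
  }
  return 0;
}

// Same shape as the FormMapping array below:
//   case 0 swaps s1/s2, case 1 swaps s1/s3, case 2 swaps s2/s3.
static const Form FormMapping[3][3] = {
    {F231, F213, F132},
    {F132, F231, F213},
    {F213, F132, F231},
};

int main() {
  const double S1 = 2.0, S2 = 3.0, S3 = 5.0;
  for (int F = 0; F < 3; ++F) {
    Form Old = static_cast<Form>(F);
    assert(eval(FormMapping[0][F], S2, S1, S3) == eval(Old, S1, S2, S3));
    assert(eval(FormMapping[1][F], S3, S2, S1) == eval(Old, S1, S2, S3));
    assert(eval(FormMapping[2][F], S1, S3, S2) == eval(Old, S1, S2, S3));
  }
  return 0;
}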
+ const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + unsigned FMAForms[3]; + if (FMA3Group.isRegOpcodeFromGroup(Opc)) { + FMAForms[0] = FMA3Group.getReg132Opcode(); + FMAForms[1] = FMA3Group.getReg213Opcode(); + FMAForms[2] = FMA3Group.getReg231Opcode(); + } else { + FMAForms[0] = FMA3Group.getMem132Opcode(); + FMAForms[1] = FMA3Group.getMem213Opcode(); + FMAForms[2] = FMA3Group.getMem231Opcode(); + } + unsigned FormIndex; + for (FormIndex = 0; FormIndex < 3; FormIndex++) + if (Opc == FMAForms[FormIndex]) + break; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FMAForms[FormIndex]; +} + +static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, + unsigned SrcOpIdx2) { + uint64_t TSFlags = MI.getDesc().TSFlags; + + // Determine which case this commute is or if it can't be done. + int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2); + if (Case < 0) + return false; + + // For each case we need to swap two pairs of bits in the final immediate. + static const uint8_t SwapMasks[3][4] = { + { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5. + { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6. + { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6. + }; + + uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm(); + // Clear out the bits we are swapping. + uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] | + SwapMasks[Case][2] | SwapMasks[Case][3]); + // If the immediate had a bit of the pair set, then set the opposite bit. + if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1]; + if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0]; + if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3]; + if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2]; + MI.getOperand(MI.getNumOperands()-1).setImm(NewImm); + + return true; +} + +// Returns true if this is a VPERMI2 or VPERMT2 instrution that can be +// commuted. 
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) { +#define VPERM_CASES(Suffix) \ + case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \ + case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \ + case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \ + case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \ + case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \ + case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \ + case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \ + case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \ + case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \ + case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \ + case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \ + case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz: + +#define VPERM_CASES_BROADCAST(Suffix) \ + VPERM_CASES(Suffix) \ + case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \ + case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \ + case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \ + case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \ + case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \ + case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz: + + switch (Opcode) { + default: return false; + VPERM_CASES(B) + VPERM_CASES_BROADCAST(D) + VPERM_CASES_BROADCAST(PD) + VPERM_CASES_BROADCAST(PS) + VPERM_CASES_BROADCAST(Q) + VPERM_CASES(W) + return true; } - llvm_unreachable("Opcode not handled by the switch"); +#undef VPERM_CASES_BROADCAST +#undef VPERM_CASES +} + +// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching +// from the I opcod to the T opcode and vice versa. 
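The reason the I and T opcodes trade places in the helper that follows is the operand layout: VPERMI2* keeps the index vector in the tied first source, while VPERMT2* keeps the first data table there, so commuting the first two register operands amounts to switching layouts. Below is a minimal per-lane model of that equivalence, assuming a simplified table-select convention; it is plain C++ for illustration only, not quoted from the SDM or the patch.

// Minimal per-lane model of a two-table dword permute, four elements wide.
// It only illustrates the operand-layout difference between the "I" and "T"
// forms; the exact table-select bit convention is assumed for this sketch.
#include <array>
#include <cassert>
#include <cstdint>

using Vec = std::array<uint32_t, 4>;

// Shared lane computation: low bits pick the element, the next bit picks
// which of the two tables supplies it.
static Vec permute2(const Vec &Idx, const Vec &TblA, const Vec &TblB) {
  Vec R{};
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t Elt = Idx[I] & 3;
    R[I] = ((Idx[I] >> 2) & 1) ? TblB[Elt] : TblA[Elt];
  }
  return R;
}

// "I" layout (VPERMI2-style): first operand holds the indices.
static Vec permI2(const Vec &Idx, const Vec &A, const Vec &B) {
  return permute2(Idx, A, B);
}

// "T" layout (VPERMT2-style): first operand holds the first table, the
// second operand holds the indices.
static Vec permT2(const Vec &A, const Vec &Idx, const Vec &B) {
  return permute2(Idx, A, B);
}

int main() {
  Vec Idx{1, 4, 6, 3}, A{10, 11, 12, 13}, B{20, 21, 22, 23};
  // Swapping the first two register operands while switching I <-> T gives
  // the same result, which is what getCommutedVPERMV3Opcode relies on.
  assert(permI2(Idx, A, B) == permT2(A, Idx, B));
  return 0;
}

Because the per-lane computation is identical and only the slot holding the indices moves, a pure opcode rename plus the generic operand swap is enough to preserve the result.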
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) { +#define VPERM_CASES(Orig, New) \ + case X86::Orig##128rr: return X86::New##128rr; \ + case X86::Orig##128rrkz: return X86::New##128rrkz; \ + case X86::Orig##128rm: return X86::New##128rm; \ + case X86::Orig##128rmkz: return X86::New##128rmkz; \ + case X86::Orig##256rr: return X86::New##256rr; \ + case X86::Orig##256rrkz: return X86::New##256rrkz; \ + case X86::Orig##256rm: return X86::New##256rm; \ + case X86::Orig##256rmkz: return X86::New##256rmkz; \ + case X86::Orig##rr: return X86::New##rr; \ + case X86::Orig##rrkz: return X86::New##rrkz; \ + case X86::Orig##rm: return X86::New##rm; \ + case X86::Orig##rmkz: return X86::New##rmkz; + +#define VPERM_CASES_BROADCAST(Orig, New) \ + VPERM_CASES(Orig, New) \ + case X86::Orig##128rmb: return X86::New##128rmb; \ + case X86::Orig##128rmbkz: return X86::New##128rmbkz; \ + case X86::Orig##256rmb: return X86::New##256rmb; \ + case X86::Orig##256rmbkz: return X86::New##256rmbkz; \ + case X86::Orig##rmb: return X86::New##rmb; \ + case X86::Orig##rmbkz: return X86::New##rmbkz; + + switch (Opcode) { + VPERM_CASES(VPERMI2B, VPERMT2B) + VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D) + VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD) + VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS) + VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q) + VPERM_CASES(VPERMI2W, VPERMT2W) + VPERM_CASES(VPERMT2B, VPERMI2B) + VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D) + VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD) + VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS) + VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q) + VPERM_CASES(VPERMT2W, VPERMI2W) + } + + llvm_unreachable("Unreachable!"); +#undef VPERM_CASES_BROADCAST +#undef VPERM_CASES } MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -3352,6 +4370,39 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::MOVSDrr: + case X86::MOVSSrr: + case X86::VMOVSDrr: + case X86::VMOVSSrr:{ + // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. + if (!Subtarget.hasSSE41()) + return nullptr; + + unsigned Mask, Opc; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; + case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; + case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; + case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + } + + // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy + // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS. + auto &MRI = MI.getParent()->getParent()->getRegInfo(); + auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg()); + unsigned VR128 = MRI.createVirtualRegister(VR128RC); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY), + VR128) + .addReg(MI.getOperand(2).getReg()); + + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + WorkingMI.getOperand(2).setReg(VR128); + WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ // SRC1 64bits = Imm[0] ? 
SRC1[127:64] : SRC1[63:0] @@ -3364,12 +4415,24 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::CMPSDrr: + case X86::CMPSSrr: case X86::CMPPDrri: case X86::CMPPSrri: + case X86::VCMPSDrr: + case X86::VCMPSSrr: case X86::VCMPPDrri: case X86::VCMPPSrri: case X86::VCMPPDYrri: - case X86::VCMPPSYrri: { + case X86::VCMPPSYrri: + case X86::VCMPSDZrr: + case X86::VCMPSSZrr: + case X86::VCMPPDZrri: + case X86::VCMPPSZrri: + case X86::VCMPPDZ128rri: + case X86::VCMPPSZ128rri: + case X86::VCMPPDZ256rri: + case X86::VCMPPSZ256rri: { // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3).getImm() & 0x7; @@ -3383,6 +4446,37 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return nullptr; } } + case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri: + case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri: + case X86::VPCMPBZrri: case X86::VPCMPUBZrri: + case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri: + case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri: + case X86::VPCMPDZrri: case X86::VPCMPUDZrri: + case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri: + case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri: + case X86::VPCMPQZrri: case X86::VPCMPUQZrri: + case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri: + case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri: + case X86::VPCMPWZrri: case X86::VPCMPUWZrri: { + // Flip comparison mode immediate (if necessary). + unsigned Imm = MI.getOperand(3).getImm() & 0x7; + switch (Imm) { + default: llvm_unreachable("Unreachable!"); + case 0x01: Imm = 0x06; break; // LT -> NLE + case 0x02: Imm = 0x05; break; // LE -> NLT + case 0x05: Imm = 0x02; break; // NLT -> LE + case 0x06: Imm = 0x01; break; // NLE -> LT + case 0x00: // EQ + case 0x03: // FALSE + case 0x04: // NE + case 0x07: // TRUE + break; + } + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::VPCOMBri: case X86::VPCOMUBri: case X86::VPCOMDri: case X86::VPCOMUDri: case X86::VPCOMQri: case X86::VPCOMUQri: @@ -3390,6 +4484,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // Flip comparison mode immediate (if necessary). 
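The AVX-512 VPCMP cases above, and the XOP VPCOM case whose body continues right after this note, both rewrite the comparison immediate when the two sources are exchanged. For the VPCMP encoding spelled out in the patch comments (0=EQ, 1=LT, 2=LE, 3=FALSE, 4=NE, 5=NLT, 6=NLE, 7=TRUE), the LT<->NLE and LE<->NLT swaps can be checked exhaustively on scalars; the check below is a standalone sketch, not LLVM code.

// Exhaustive scalar check of the predicate remapping applied when the two
// sources of a VPCMP-style compare are swapped.
#include <cassert>

static bool cmp(unsigned Pred, int A, int B) {
  switch (Pred) {
  case 0: return A == B;   // EQ
  case 1: return A < B;    // LT
  case 2: return A <= B;   // LE
  case 3: return false;    // FALSE
  case 4: return A != B;   // NE
  case 5: return !(A < B); // NLT
  case 6: return !(A <= B);// NLE
  default: return true;    // TRUE
  }
}

// Same table the patch applies in commuteInstructionImpl: LT<->NLE,
// LE<->NLT, everything else is symmetric in its operands.
static unsigned flip(unsigned Pred) {
  switch (Pred) {
  case 1: return 6;
  case 2: return 5;
  case 5: return 2;
  case 6: return 1;
  default: return Pred;
  }
}

int main() {
  for (unsigned Pred = 0; Pred < 8; ++Pred)
    for (int A = -2; A <= 2; ++A)
      for (int B = -2; B <= 2; ++B)
        assert(cmp(Pred, A, B) == cmp(flip(Pred), B, A));
  return 0;
}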
unsigned Imm = MI.getOperand(3).getImm() & 0x7; switch (Imm) { + default: llvm_unreachable("Unreachable!"); case 0x00: Imm = 0x02; break; // LT -> GT case 0x01: Imm = 0x03; break; // LE -> GE case 0x02: Imm = 0x00; break; // GT -> LT @@ -3398,7 +4493,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case 0x05: // NE case 0x06: // FALSE case 0x07: // TRUE - default: break; } auto &WorkingMI = cloneIfNew(MI); @@ -3417,6 +4511,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::MOVHLPSrr: + case X86::UNPCKHPDrr: { + if (!Subtarget.hasSSE2()) + return nullptr; + + unsigned Opc = MI.getOpcode(); + switch (Opc) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break; + case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break; + } + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: @@ -3490,9 +4600,44 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } - default: - if (isFMA3(MI.getOpcode())) { - unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: + case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: + case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: + case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: + case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: + case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: + case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik: + case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik: + case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik: + case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik: + case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik: + case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik: + case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: + case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: + case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: + case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: + case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: + case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: { + auto &WorkingMI = cloneIfNew(MI); + if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2)) + return nullptr; + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + default: { + if (isCommutableVPERMV3Instruction(MI.getOpcode())) { + unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode()); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + + const X86InstrFMA3Group *FMA3Group = + X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); + if (FMA3Group) { + unsigned Opc = + getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group); if (Opc == 0) return nullptr; auto &WorkingMI = cloneIfNew(MI); @@ -3503,22 +4648,54 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool 
NewMI, return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } + } } -bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { +bool X86InstrInfo::findFMA3CommutedOpIndices( + const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const { - unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2)) + return false; + + // Check if we can adjust the opcode to preserve the semantics when + // commute the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0; +} + +bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + uint64_t TSFlags = MI.getDesc().TSFlags; + + unsigned FirstCommutableVecOp = 1; + unsigned LastCommutableVecOp = 3; + unsigned KMaskOp = 0; + if (X86II::isKMasked(TSFlags)) { + // The k-mask operand has index = 2 for masked and zero-masked operations. + KMaskOp = 2; + + // The operand with index = 1 is used as a source for those elements for + // which the corresponding bit in the k-mask is set to 0. + if (X86II::isKMergeMasked(TSFlags)) + FirstCommutableVecOp = 3; + + LastCommutableVecOp++; + } + + if (isMem(MI, LastCommutableVecOp)) + LastCommutableVecOp--; // Only the first RegOpsNum operands are commutable. // Also, the value 'CommuteAnyOperandIndex' is valid here as it means // that the operand is not specified/fixed. if (SrcOpIdx1 != CommuteAnyOperandIndex && - (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp || + SrcOpIdx1 == KMaskOp)) return false; if (SrcOpIdx2 != CommuteAnyOperandIndex && - (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp || + SrcOpIdx2 == KMaskOp)) return false; // Look for two different register operands assumed to be commutable @@ -3533,7 +4710,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, if (SrcOpIdx1 == SrcOpIdx2) // Both of operands are not fixed. By default set one of commutable // operands to the last register operand of the instruction. - CommutableOpIdx2 = RegOpsNum; + CommutableOpIdx2 = LastCommutableVecOp; else if (SrcOpIdx2 == CommuteAnyOperandIndex) // Only one of operands is not fixed. CommutableOpIdx2 = SrcOpIdx1; @@ -3541,7 +4718,12 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); - for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + for (CommutableOpIdx1 = LastCommutableVecOp; + CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { + // Just ignore and skip the k-mask operand. + if (CommutableOpIdx1 == KMaskOp) + continue; + // The commuted operands must have different registers. // Otherwise, the commute transformation does not change anything and // is useless then. @@ -3550,7 +4732,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, } // No appropriate commutable operands were found. 
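The bail-out that the comment above refers to follows right after this note. Since the operand-index bookkeeping in findThreeSrcCommutedOpIndices is fairly dense, here is a standalone sketch (plain C++; the struct and helper names are made up for illustration) of how the commutable range and the mask-operand index fall out of the masking flags and a possible trailing memory operand.

// Illustrative model of the index bookkeeping: which source operands of a
// three-source instruction may be commuted, and where the k-mask sits.
// A KMaskOp of 0 means "no mask operand".
#include <cassert>

struct CommutableRange {
  unsigned First, Last, KMaskOp;
};

static CommutableRange computeRange(bool KMasked, bool MergeMasked,
                                    bool LastOpIsMem) {
  CommutableRange R{1, 3, 0};
  if (KMasked) {
    R.KMaskOp = 2; // mask sits right after the tied destination source
    if (MergeMasked)
      R.First = 3; // operand 1 is the pass-through, not commutable
    ++R.Last;      // the remaining vector sources shift right by one
  }
  if (LastOpIsMem)
    --R.Last;      // a folded load cannot be commuted
  return R;
}

int main() {
  // Plain three-source reg form: operands 1..3 are commutable.
  assert(computeRange(false, false, false).Last == 3);
  // Zero-masked reg form: mask at index 2, vector sources at 1, 3 and 4.
  CommutableRange Z = computeRange(true, false, false);
  assert(Z.KMaskOp == 2 && Z.First == 1 && Z.Last == 4);
  // Merge-masked memory form: only operand 3 remains commutable once the
  // pass-through and the trailing memory operand are excluded.
  CommutableRange M = computeRange(true, true, true);
  assert(M.First == 3 && M.Last == 3);
  return 0;
}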
- if (CommutableOpIdx1 == 0) + if (CommutableOpIdx1 < FirstCommutableVecOp) return false; // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 @@ -3560,208 +4742,34 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, return false; } - // Check if we can adjust the opcode to preserve the semantics when - // commute the register operands. - return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; -} - -unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( - MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2) const { - unsigned Opc = MI.getOpcode(); - - // Define the array that holds FMA opcodes in groups - // of 3 opcodes(132, 213, 231) in each group. - static const uint16_t RegularOpcodeGroups[][3] = { - { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, - { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, - { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, - { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, - { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, - { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, - { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, - { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, - { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, - { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, - { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, - { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, - - { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, - { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, - { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, - { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, - { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, - { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, - { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, - { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, - { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, - { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, - { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, - { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, - - { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, - { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, - { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, - { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, - { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, - { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, - { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, - { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, - { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, - { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, - { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, - { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, - - { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, - { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, - { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, - { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, - { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, 
X86::VFNMSUBPSr231rY }, - { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, - { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, - { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, - { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, - { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, - { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, - { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, - - { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, - { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, - { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY }, - { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, - { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, - { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, - { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, - { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, - - { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, - { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, - { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, - { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, - { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, - { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, - { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, - { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } - }; - - // Define the array that holds FMA*_Int opcodes in groups - // of 3 opcodes(132, 213, 231) in each group. 
- static const uint16_t IntrinOpcodeGroups[][3] = { - { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, - { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, - { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, - { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, - - { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, - { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, - { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, - { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, - - { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, - { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, - { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, - { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, - - { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, - { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, - { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, - { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, - }; - - const unsigned Form132Index = 0; - const unsigned Form213Index = 1; - const unsigned Form231Index = 2; - const unsigned FormsNum = 3; - - bool IsIntrinOpcode; - isFMA3(Opc, &IsIntrinOpcode); - - size_t GroupsNum; - const uint16_t (*OpcodeGroups)[3]; - if (IsIntrinOpcode) { - GroupsNum = array_lengthof(IntrinOpcodeGroups); - OpcodeGroups = IntrinOpcodeGroups; - } else { - GroupsNum = array_lengthof(RegularOpcodeGroups); - OpcodeGroups = RegularOpcodeGroups; - } - - const uint16_t *FoundOpcodesGroup = nullptr; - size_t FormIndex; - - // Look for the input opcode in the corresponding opcodes table. - for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; - ++GroupIndex) { - for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { - if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { - FoundOpcodesGroup = OpcodeGroups[GroupIndex]; - break; - } - } - } - - // The input opcode does not match with any of the opcodes from the tables. - // The unsupported FMA opcode must be added to one of the two opcode groups - // defined above. - assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); - - // Put the lowest index to SrcOpIdx1 to simplify the checks below. - if (SrcOpIdx1 > SrcOpIdx2) - std::swap(SrcOpIdx1, SrcOpIdx2); - - // TODO: Commuting the 1st operand of FMA*_Int requires some additional - // analysis. The commute optimization is legal only if all users of FMA*_Int - // use only the lowest element of the FMA*_Int instruction. Such analysis are - // not implemented yet. So, just return 0 in that case. - // When such analysis are available this place will be the right place for - // calling it. - if (IsIntrinOpcode && SrcOpIdx1 == 1) - return 0; - - unsigned Case; - if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) - Case = 0; - else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) - Case = 1; - else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) - Case = 2; - else - return 0; - - // Define the FMA forms mapping array that helps to map input FMA form - // to output FMA form to preserve the operation semantics after - // commuting the operands. 
- static const unsigned FormMapping[][3] = { - // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; - // FMA132 A, C, b; ==> FMA231 C, A, b; - // FMA213 B, A, c; ==> FMA213 A, B, c; - // FMA231 C, A, b; ==> FMA132 A, C, b; - { Form231Index, Form213Index, Form132Index }, - // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; - // FMA132 A, c, B; ==> FMA132 B, c, A; - // FMA213 B, a, C; ==> FMA231 C, a, B; - // FMA231 C, a, B; ==> FMA213 B, a, C; - { Form132Index, Form231Index, Form213Index }, - // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; - // FMA132 a, C, B; ==> FMA213 a, B, C; - // FMA213 b, A, C; ==> FMA132 b, C, A; - // FMA231 c, A, B; ==> FMA231 c, B, A; - { Form213Index, Form132Index, Form231Index } - }; - - // Everything is ready, just adjust the FMA opcode and return it. - FormIndex = FormMapping[Case][FormIndex]; - return FoundOpcodesGroup[FormIndex]; + return true; } bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { + const MCInstrDesc &Desc = MI.getDesc(); + if (!Desc.isCommutable()) + return false; + switch (MI.getOpcode()) { + case X86::CMPSDrr: + case X86::CMPSSrr: case X86::CMPPDrri: case X86::CMPPSrri: + case X86::VCMPSDrr: + case X86::VCMPSSrr: case X86::VCMPPDrri: case X86::VCMPPSrri: case X86::VCMPPDYrri: - case X86::VCMPPSYrri: { + case X86::VCMPPSYrri: + case X86::VCMPSDZrr: + case X86::VCMPSSZrr: + case X86::VCMPPDZrri: + case X86::VCMPPSZrri: + case X86::VCMPPDZ128rri: + case X86::VCMPPSZ128rri: + case X86::VCMPPDZ256rri: + case X86::VCMPPSZ256rri: { // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3).getImm() & 0x7; @@ -3776,9 +4784,73 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, } return false; } + case X86::MOVSDrr: + case X86::MOVSSrr: + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (Subtarget.hasSSE41()) + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return false; + } + case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: + case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: + case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: + case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: + case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: + case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: + case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik: + case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik: + case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik: + case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik: + case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik: + case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik: + case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: + case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: + case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: + case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: + case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: + case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: + return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); default: - if (isFMA3(MI.getOpcode())) - return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + const X86InstrFMA3Group *FMA3Group = + X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); + if (FMA3Group) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group); + + // Handled masked instructions since we need to skip over the mask input + // and the preserved input. 
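The masked-operand handling described in the comment above continues right after this note. Separately, the VPTERNLOG cases listed earlier in this hunk only choose the operand pair; the immediate itself is rewritten by commuteVPTERNLOG, added earlier in the patch. Its SwapMasks table can be verified exhaustively: the imm8 is a truth table indexed by (src1<<2)|(src2<<1)|src3, so swapping two sources must permute the matching pairs of table bits. The check below is a standalone sketch mirroring the patch's masks, not LLVM code.

// Exhaustive check of the SwapMasks scheme used by commuteVPTERNLOG.
#include <cassert>
#include <cstdint>

// Bit of the truth table selected by one combination of source bits.
static unsigned ternlogBit(uint8_t Imm, unsigned A, unsigned B, unsigned C) {
  return (Imm >> ((A << 2) | (B << 1) | C)) & 1;
}

// Same pair-swapping scheme as the patch: case 0 swaps src1/src2,
// case 1 swaps src1/src3, case 2 swaps src2/src3.
static uint8_t swapImm(uint8_t Imm, int Case) {
  static const uint8_t SwapMasks[3][4] = {
      {0x04, 0x10, 0x08, 0x20}, // bits 2<->4 and 3<->5
      {0x02, 0x10, 0x08, 0x40}, // bits 1<->4 and 3<->6
      {0x02, 0x04, 0x20, 0x40}, // bits 1<->2 and 5<->6
  };
  const uint8_t *M = SwapMasks[Case];
  uint8_t NewImm = Imm & ~(M[0] | M[1] | M[2] | M[3]);
  if (Imm & M[0]) NewImm |= M[1];
  if (Imm & M[1]) NewImm |= M[0];
  if (Imm & M[2]) NewImm |= M[3];
  if (Imm & M[3]) NewImm |= M[2];
  return NewImm;
}

int main() {
  for (unsigned Imm = 0; Imm < 256; ++Imm)
    for (unsigned A = 0; A < 2; ++A)
      for (unsigned B = 0; B < 2; ++B)
        for (unsigned C = 0; C < 2; ++C) {
          uint8_t I = static_cast<uint8_t>(Imm);
          // Swapping two sources and fixing the immediate yields the same
          // truth-table bit as the original immediate on unswapped sources.
          assert(ternlogBit(swapImm(I, 0), B, A, C) == ternlogBit(I, A, B, C));
          assert(ternlogBit(swapImm(I, 1), C, B, A) == ternlogBit(I, A, B, C));
          assert(ternlogBit(swapImm(I, 2), A, C, B) == ternlogBit(I, A, B, C));
        }
  return 0;
}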
+ if (Desc.TSFlags & X86II::EVEX_K) { + // First assume that the first input is the mask operand and skip past it. + unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1; + unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2; + // Check if the first input is tied. If there isn't one then we only + // need to skip the mask operand which we did above. + if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(), + MCOI::TIED_TO) != -1)) { + // If this is zero masking instruction with a tied operand, we need to + // move the first index back to the first input since this must + // be a 3 input instruction and we want the first two non-mask inputs. + // Otherwise this is a 2 input instruction with a preserved input and + // mask, so we need to move the indices to skip one more input. + if (Desc.TSFlags & X86II::EVEX_Z) + --CommutableOpIdx1; + else { + ++CommutableOpIdx1; + ++CommutableOpIdx2; + } + } + + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + + if (!MI.getOperand(SrcOpIdx1).isReg() || + !MI.getOperand(SrcOpIdx2).isReg()) + // No idea. + return false; + return true; + } + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } return false; @@ -4296,7 +5368,10 @@ bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, return true; } -unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { +unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved) const { + assert(!BytesRemoved && "code size not handled"); + MachineBasicBlock::iterator I = MBB.end(); unsigned Count = 0; @@ -4316,15 +5391,17 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return Count; } -unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, +unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const { + const DebugLoc &DL, + int *BytesAdded) const { // Shouldn't be a fall through. - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert(TBB && "insertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && "X86 branch conditions have one component!"); + assert(!BytesAdded && "code size not handled"); if (Cond.empty()) { // Unconditional branch? @@ -4430,16 +5507,63 @@ static bool isHReg(unsigned Reg) { } // Try and copy between VR128/VR64 and GR64 registers. -static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, +static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg, const X86Subtarget &Subtarget) { + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + + // SrcReg(MaskReg) -> DestReg(GR64) + // SrcReg(MaskReg) -> DestReg(GR32) + // SrcReg(MaskReg) -> DestReg(GR16) + // SrcReg(MaskReg) -> DestReg(GR8) + + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + if (X86::VK16RegClass.contains(SrcReg)) { + if (X86::GR64RegClass.contains(DestReg)) { + assert(Subtarget.hasBWI()); + return X86::KMOVQrk; + } + if (X86::GR32RegClass.contains(DestReg)) + return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk; + if (X86::GR16RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVWrk; + } + if (X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return Subtarget.hasDQI() ? 
X86::KMOVBrk : X86::KMOVWrk; + } + } + + // SrcReg(GR64) -> DestReg(MaskReg) + // SrcReg(GR32) -> DestReg(MaskReg) + // SrcReg(GR16) -> DestReg(MaskReg) + // SrcReg(GR8) -> DestReg(MaskReg) + + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + if (X86::VK16RegClass.contains(DestReg)) { + if (X86::GR64RegClass.contains(SrcReg)) { + assert(Subtarget.hasBWI()); + return X86::KMOVQkr; + } + if (X86::GR32RegClass.contains(SrcReg)) + return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr; + if (X86::GR16RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVWkr; + } + if (X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr; + } + } + // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) // SrcReg(GR64) -> DestReg(VR128) // SrcReg(GR64) -> DestReg(VR64) - bool HasAVX = Subtarget.hasAVX(); - bool HasAVX512 = Subtarget.hasAVX512(); if (X86::GR64RegClass.contains(DestReg)) { if (X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. @@ -4479,96 +5603,13 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } -static bool isMaskRegClass(const TargetRegisterClass *RC) { - // All KMASK RegClasses hold the same k registers, can be tested against anyone. - return X86::VK16RegClass.hasSubClassEq(RC); -} - -static bool MaskRegClassContains(unsigned Reg) { - // All KMASK RegClasses hold the same k registers, can be tested against anyone. - return X86::VK16RegClass.contains(Reg); -} - -static bool GRRegClassContains(unsigned Reg) { - return X86::GR64RegClass.contains(Reg) || - X86::GR32RegClass.contains(Reg) || - X86::GR16RegClass.contains(Reg) || - X86::GR8RegClass.contains(Reg); -} -static -unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { - if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { - DestReg = getX86SubSuperRegister(DestReg, 32); - return X86::KMOVBrk; - } - if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { - SrcReg = getX86SubSuperRegister(SrcReg, 32); - return X86::KMOVBkr; - } - return 0; -} - -static -unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { - if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) - return X86::KMOVQkk; - if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) - return X86::KMOVDrk; - if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) - return X86::KMOVQrk; - if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) - return X86::KMOVDkr; - if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) - return X86::KMOVQkr; - return 0; -} - -static -unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, - const X86Subtarget &Subtarget) -{ - if (Subtarget.hasDQI()) - if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) - return Opc; - if (Subtarget.hasBWI()) - if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) - return Opc; - if (X86::VR128XRegClass.contains(DestReg, SrcReg)) { - if (Subtarget.hasVLX()) - return X86::VMOVAPSZ128rr; - DestReg = get512BitSuperRegister(DestReg); - SrcReg = get512BitSuperRegister(SrcReg); - return X86::VMOVAPSZrr; - } - if (X86::VR256XRegClass.contains(DestReg, SrcReg)) { - if (Subtarget.hasVLX()) - return X86::VMOVAPSZ256rr; - DestReg = get512BitSuperRegister(DestReg); - SrcReg = 
get512BitSuperRegister(SrcReg); - return X86::VMOVAPSZrr; - } - if (X86::VR512RegClass.contains(DestReg, SrcReg)) - return X86::VMOVAPSZrr; - if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) - return X86::KMOVWkk; - if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { - SrcReg = getX86SubSuperRegister(SrcReg, 32); - return X86::KMOVWkr; - } - if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { - DestReg = getX86SubSuperRegister(DestReg, 32); - return X86::KMOVWrk; - } - return 0; -} - void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. bool HasAVX = Subtarget.hasAVX(); - bool HasAVX512 = Subtarget.hasAVX512(); + bool HasVLX = Subtarget.hasVLX(); unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; @@ -4590,12 +5631,41 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; - else if (HasAVX512) - Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); - else if (X86::VR128RegClass.contains(DestReg, SrcReg)) - Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; - else if (X86::VR256RegClass.contains(DestReg, SrcReg)) - Opc = X86::VMOVAPSYrr; + else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) { + if (HasVLX) + Opc = X86::VMOVAPSZ128rr; + else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; + else { + // If this an extended register and we don't have VLX we need to use a + // 512-bit move. + Opc = X86::VMOVAPSZrr; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, + &X86::VR512RegClass); + SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, + &X86::VR512RegClass); + } + } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) { + if (HasVLX) + Opc = X86::VMOVAPSZ256rr; + else if (X86::VR256RegClass.contains(DestReg, SrcReg)) + Opc = X86::VMOVAPSYrr; + else { + // If this an extended register and we don't have VLX we need to use a + // 512-bit move. + Opc = X86::VMOVAPSZrr; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, + &X86::VR512RegClass); + SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, + &X86::VR512RegClass); + } + } else if (X86::VR512RegClass.contains(DestReg, SrcReg)) + Opc = X86::VMOVAPSZrr; + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + else if (X86::VK16RegClass.contains(DestReg, SrcReg)) + Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk; if (!Opc) Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); @@ -4708,37 +5778,15 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Cannot emit physreg copy instruction"); } -static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC, - bool load) { - switch (RC->getSize()) { - default: - llvm_unreachable("Unknown spill size"); - case 2: - return load ? X86::KMOVWkm : X86::KMOVWmk; - case 4: - return load ? X86::KMOVDkm : X86::KMOVDmk; - case 8: - return load ? 
X86::KMOVQkm : X86::KMOVQmk; - } -} - static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, bool isStackAligned, const X86Subtarget &STI, bool load) { - if (STI.hasAVX512()) { - if (isMaskRegClass(RC)) - return getLoadStoreMaskRegOpcode(RC, load); - if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) - return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; - if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC)) - return load ? X86::VMOVSDZrm : X86::VMOVSDZmr; - if (X86::VR512RegClass.hasSubClassEq(RC)) - return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; - } - bool HasAVX = STI.hasAVX(); + bool HasAVX512 = STI.hasAVX512(); + bool HasVLX = STI.hasVLX(); + switch (RC->getSize()) { default: llvm_unreachable("Unknown spill size"); @@ -4751,69 +5799,85 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; return load ? X86::MOV8rm : X86::MOV8mr; case 2: + if (X86::VK16RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVWkm : X86::KMOVWmk; assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: if (X86::GR32RegClass.hasSubClassEq(RC)) return load ? X86::MOV32rm : X86::MOV32mr; - if (X86::FR32RegClass.hasSubClassEq(RC)) + if (X86::FR32XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : - (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : + (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); if (X86::RFP32RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp32m : X86::ST_Fp32m; + if (X86::VK32RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVDkm : X86::KMOVDmk; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) return load ? X86::MOV64rm : X86::MOV64mr; - if (X86::FR64RegClass.hasSubClassEq(RC)) + if (X86::FR64XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : - (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : + (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); if (X86::VR64RegClass.hasSubClassEq(RC)) return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; if (X86::RFP64RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp64m : X86::ST_Fp64m; + if (X86::VK64RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVQkm : X86::KMOVQmk; llvm_unreachable("Unknown 8-byte regclass"); case 10: assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { - assert((X86::VR128RegClass.hasSubClassEq(RC) || - X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); + assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. - if (X86::VR128RegClass.hasSubClassEq(RC)) { - if (isStackAligned) - return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) - : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); - else - return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) - : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); - } - assert(STI.hasVLX() && "Using extended register requires VLX"); if (isStackAligned) - return load ? X86::VMOVAPSZ128rm : X86::VMOVAPSZ128mr; + return load ? + (HasVLX ? X86::VMOVAPSZ128rm : + HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : + HasAVX ? X86::VMOVAPSrm : + X86::MOVAPSrm): + (HasVLX ? X86::VMOVAPSZ128mr : + HasAVX512 ? 
X86::VMOVAPSZ128mr_NOVLX : + HasAVX ? X86::VMOVAPSmr : + X86::MOVAPSmr); else - return load ? X86::VMOVUPSZ128rm : X86::VMOVUPSZ128mr; + return load ? + (HasVLX ? X86::VMOVUPSZ128rm : + HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX : + HasAVX ? X86::VMOVUPSrm : + X86::MOVUPSrm): + (HasVLX ? X86::VMOVUPSZ128mr : + HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : + HasAVX ? X86::VMOVUPSmr : + X86::MOVUPSmr); } case 32: - assert((X86::VR256RegClass.hasSubClassEq(RC) || - X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); + assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. - if (X86::VR256RegClass.hasSubClassEq(RC)) { - if (isStackAligned) - return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; - else - return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; - } - assert(STI.hasVLX() && "Using extended register requires VLX"); if (isStackAligned) - return load ? X86::VMOVAPSZ256rm : X86::VMOVAPSZ256mr; + return load ? + (HasVLX ? X86::VMOVAPSZ256rm : + HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX : + X86::VMOVAPSYrm) : + (HasVLX ? X86::VMOVAPSZ256mr : + HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX : + X86::VMOVAPSYmr); else - return load ? X86::VMOVUPSZ256rm : X86::VMOVUPSZ256mr; + return load ? + (HasVLX ? X86::VMOVUPSZ256rm : + HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX : + X86::VMOVUPSYrm) : + (HasVLX ? X86::VMOVUPSZ256mr : + HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX : + X86::VMOVUPSYmr); case 64: assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); - assert(STI.hasVLX() && "Using 512-bit register requires AVX512"); + assert(STI.hasAVX512() && "Using 512-bit register requires AVX512"); if (isStackAligned) return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; else @@ -4851,8 +5915,7 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, Offset = DispMO.getImm(); - return MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() == - X86::NoRegister; + return true; } static unsigned getStoreRegOpcode(unsigned SrcReg, @@ -4876,7 +5939,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && + assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = @@ -4954,6 +6017,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: + if (!MI.getOperand(1).isImm()) + return false; SrcReg = MI.getOperand(0).getReg(); SrcReg2 = 0; CmpMask = ~0; @@ -4985,6 +6050,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: + if (!MI.getOperand(2).isImm()) + return false; SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; @@ -5263,9 +6330,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // If the definition is in this basic block, RE points to the definition; // otherwise, RE is the rend of the basic block. MachineBasicBlock::reverse_iterator - RI = MachineBasicBlock::reverse_iterator(I), + RI = ++I.getReverse(), RE = CmpInstr.getParent() == MI->getParent() - ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ + ? 
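
Annotation: the getLoadStoreRegOpcode changes above replace the "VLX required for extended registers" asserts with a priority chain VLX > AVX-512 (via the new _NOVLX pseudos) > AVX > SSE. A small standalone sketch of that selection, under the assumption that only size, alignment and the three feature bits matter; helper names are hypothetical and opcodes are built as strings.

// Standalone sketch of the spill/reload opcode choice above (not the LLVM function).
#include <cstdio>
#include <string>

struct Features { bool AVX, AVX512, VLX; };

std::string spillOpcode(unsigned Bytes, bool Aligned, bool Load, const Features &F) {
  std::string Base;
  bool NoVLXPseudo = false;             // expanded later to a VEX move or a
  const char *AU = Aligned ? "APS" : "UPS"; // broadcast/extract through ZMM
  switch (Bytes) {
  case 16:
    if (F.VLX)           Base = std::string("VMOV") + AU + "Z128";
    else if (F.AVX512) { Base = std::string("VMOV") + AU + "Z128"; NoVLXPseudo = true; }
    else if (F.AVX)      Base = std::string("VMOV") + AU;
    else                 Base = std::string("MOV") + AU;
    break;
  case 32:
    if (F.VLX)           Base = std::string("VMOV") + AU + "Z256";
    else if (F.AVX512) { Base = std::string("VMOV") + AU + "Z256"; NoVLXPseudo = true; }
    else                 Base = std::string("VMOV") + AU + "Y";
    break;
  default: // 64 bytes requires AVX-512
    Base = std::string("VMOV") + AU + "Z";
    break;
  }
  return Base + (Load ? "rm" : "mr") + (NoVLXPseudo ? "_NOVLX" : "");
}

int main() {
  Features F{true, true, false}; // AVX-512 without VLX
  std::printf("%s\n", spillOpcode(16, /*Aligned=*/true, /*Load=*/true, F).c_str());
  // prints VMOVAPSZ128rm_NOVLX
}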
Def.getReverse() /* points to MI */ : CmpInstr.getParent()->rend(); MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { @@ -5411,9 +6478,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (Movr0Inst) { // Look backwards until we find a def that doesn't use the current EFLAGS. Def = Sub; - MachineBasicBlock::reverse_iterator - InsertI = MachineBasicBlock::reverse_iterator(++Def), - InsertE = Sub->getParent()->rend(); + MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(), + InsertE = Sub->getParent()->rend(); for (; InsertI != InsertE; ++InsertI) { MachineInstr *Instr = &*InsertI; if (!Instr->readsRegister(X86::EFLAGS, TRI) && @@ -5455,14 +6521,6 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { - if (FoldAsLoadDefReg == 0) - return nullptr; - // To be conservative, if there exists another load, clear the load candidate. - if (MI.mayLoad()) { - FoldAsLoadDefReg = 0; - return nullptr; - } - // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); @@ -5471,27 +6529,24 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, return nullptr; // Collect information about virtual register operands of MI. - unsigned SrcOperandId = 0; - bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { + SmallVector<unsigned, 1> SrcOperandIds; + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; - // Do not fold if we have a subreg use or a def or multiple uses. - if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) + // Do not fold if we have a subreg use or a def. + if (MO.getSubReg() || MO.isDef()) return nullptr; - - SrcOperandId = i; - FoundSrcOperand = true; + SrcOperandIds.push_back(i); } - if (!FoundSrcOperand) + if (SrcOperandIds.empty()) return nullptr; // Check whether we can fold the def into SrcOperandId. - if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -5553,7 +6608,9 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, return true; } -bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { +static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, + const TargetInstrInfo &TII, + const X86Subtarget &Subtarget) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); int64_t Imm = MIB->getOperand(1).getImm(); @@ -5570,23 +6627,23 @@ bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { X86MachineFunctionInfo *X86FI = MBB.getParent()->getInfo<X86MachineFunctionInfo>(); if (X86FI->getUsesRedZone()) { - MIB->setDesc(get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri - : X86::MOV64ri)); + MIB->setDesc(TII.get(MIB->getOpcode() == + X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri)); return true; } // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and // widen the register if necessary. 
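
Annotation: the replacement of MachineBasicBlock::reverse_iterator(I) with ++I.getReverse(), and of reverse_iterator(++Def) with Def.getReverse(), implies that getReverse() refers to the same instruction, whereas the old implicit conversion followed std::reverse_iterator's off-by-one convention. A self-contained illustration of that convention, with a std::vector standing in for the basic block:

// Standalone illustration of the off-by-one the explicit ++ above compensates for.
#include <cassert>
#include <iterator>
#include <vector>

int main() {
  std::vector<int> BB = {10, 20, 30, 40};
  auto I = BB.begin() + 2;                         // refers to 30

  // std::make_reverse_iterator(I) dereferences to the *previous* element...
  auto R = std::make_reverse_iterator(I);
  assert(*R == 20);

  // ...so a reverse iterator that still refers to *I is built one past it,
  // which is what a "same element" conversion gives directly.
  auto SameElt = std::make_reverse_iterator(std::next(I));
  assert(*SameElt == 30);

  // Walking backwards from *I visits 30, 20, 10.
  int Expect[] = {30, 20, 10};
  int Idx = 0;
  for (auto It = SameElt; It != BB.rend(); ++It)
    assert(*It == Expect[Idx++]);
  return 0;
}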
StackAdjustment = 8; - BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); - MIB->setDesc(get(X86::POP64r)); + BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm); + MIB->setDesc(TII.get(X86::POP64r)); MIB->getOperand(0) .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); } else { assert(MIB->getOpcode() == X86::MOV32ImmSExti8); StackAdjustment = 4; - BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); - MIB->setDesc(get(X86::POP32r)); + BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm); + MIB->setDesc(TII.get(X86::POP32r)); } // Build CFI if necessary. @@ -5616,7 +6673,9 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, unsigned Reg = MIB->getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); - auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + auto Flags = MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); @@ -5629,6 +6688,53 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); } +// This is used to handle spills for 128/256-bit registers when we have AVX512, +// but not VLX. If it uses an extended register we need to use an instruction +// that loads the lower 128/256-bit, but is available with only AVX512F. +static bool expandNOVLXLoad(MachineInstrBuilder &MIB, + const TargetRegisterInfo *TRI, + const MCInstrDesc &LoadDesc, + const MCInstrDesc &BroadcastDesc, + unsigned SubIdx) { + unsigned DestReg = MIB->getOperand(0).getReg(); + // Check if DestReg is XMM16-31 or YMM16-31. + if (TRI->getEncodingValue(DestReg) < 16) { + // We can use a normal VEX encoded load. + MIB->setDesc(LoadDesc); + } else { + // Use a 128/256-bit VBROADCAST instruction. + MIB->setDesc(BroadcastDesc); + // Change the destination to a 512-bit register. + DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass); + MIB->getOperand(0).setReg(DestReg); + } + return true; +} + +// This is used to handle spills for 128/256-bit registers when we have AVX512, +// but not VLX. If it uses an extended register we need to use an instruction +// that stores the lower 128/256-bit, but is available with only AVX512F. +static bool expandNOVLXStore(MachineInstrBuilder &MIB, + const TargetRegisterInfo *TRI, + const MCInstrDesc &StoreDesc, + const MCInstrDesc &ExtractDesc, + unsigned SubIdx) { + unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); + // Check if DestReg is XMM16-31 or YMM16-31. + if (TRI->getEncodingValue(SrcReg) < 16) { + // We can use a normal VEX encoded store. + MIB->setDesc(StoreDesc); + } else { + // Use a VEXTRACTF instruction. + MIB->setDesc(ExtractDesc); + // Change the destination to a 512-bit register. + SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass); + MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg); + MIB.addImm(0x0); // Append immediate to extract from the lower bits. 
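
Annotation: the two expandNOVLX* helpers above handle 128/256-bit spills when AVX-512F is present but VLX is not. The key test is the register encoding: xmm0-15/ymm0-15 can still be addressed with a VEX move, while xmm16-31/ymm16-31 require EVEX, so the access goes through the 512-bit super-register using a broadcast load or a low-lane extract store (immediate 0). A minimal standalone sketch of that decision, with hypothetical names and registers modeled by their encoding number only:

// Standalone sketch of the NOVLX expansion decision (not the in-place MI rewrite).
#include <cstdio>
#include <string>

struct Expansion {
  std::string Opcode;  // instruction to emit
  bool WidenToZMM;     // rewrite the XMM/YMM operand to its ZMM super-register
  bool AppendImmZero;  // extract stores take an extra immediate of 0
};

Expansion expandNoVLXSpill(unsigned RegEncoding, bool Is256Bit, bool IsLoad,
                           bool Aligned) {
  if (RegEncoding < 16) {
    // xmm0-15/ymm0-15: a plain VEX-encoded move is legal without VLX.
    std::string Op = std::string("VMOV") + (Aligned ? "APS" : "UPS") +
                     (Is256Bit ? "Y" : "") + (IsLoad ? "rm" : "mr");
    return {Op, false, false};
  }
  // xmm16-31/ymm16-31: only EVEX can encode the register, so go through the
  // 512-bit super-register with a broadcast load or a low-lane extract store.
  if (IsLoad)
    return {Is256Bit ? "VBROADCASTF64X4rm" : "VBROADCASTF32X4rm", true, false};
  return {Is256Bit ? "VEXTRACTF64x4Zmr" : "VEXTRACTF32x4Zmr", true, true};
}

int main() {
  Expansion E = expandNoVLXSpill(/*RegEncoding=*/17, /*Is256Bit=*/false,
                                 /*IsLoad=*/false, /*Aligned=*/true);
  std::printf("%s widen=%d imm0=%d\n", E.Opcode.c_str(), E.WidenToZMM,
              E.AppendImmZero); // VEXTRACTF32x4Zmr widen=1 imm0=1
}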
+ } + + return true; +} bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); @@ -5641,7 +6747,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); case X86::MOV32ImmSExti8: case X86::MOV64ImmSExti8: - return ExpandMOVImmSExti8(MIB); + return ExpandMOVImmSExti8(MIB, *this, Subtarget); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -5663,6 +6769,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr)); case X86::AVX512_512_SET0: return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); + case X86::AVX512_FsFLD0SS: + case X86::AVX512_FsFLD0SD: + return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: @@ -5676,6 +6785,45 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(Reg, RegState::Undef).addImm(0xff); return true; } + case X86::AVX512_512_SEXT_MASK_32: + case X86::AVX512_512_SEXT_MASK_64: { + unsigned Reg = MIB->getOperand(0).getReg(); + unsigned MaskReg = MIB->getOperand(1).getReg(); + unsigned MaskState = getRegState(MIB->getOperand(1)); + unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? + X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; + MI.RemoveOperand(1); + MIB->setDesc(get(Opc)); + // VPTERNLOG needs 3 register inputs and an immediate. + // 0xff will return 1s for any input. + MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff); + return true; + } + case X86::VMOVAPSZ128rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm), + get(X86::VBROADCASTF32X4rm), X86::sub_xmm); + case X86::VMOVUPSZ128rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm), + get(X86::VBROADCASTF32X4rm), X86::sub_xmm); + case X86::VMOVAPSZ256rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm), + get(X86::VBROADCASTF64X4rm), X86::sub_ymm); + case X86::VMOVUPSZ256rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm), + get(X86::VBROADCASTF64X4rm), X86::sub_ymm); + case X86::VMOVAPSZ128mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr), + get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); + case X86::VMOVUPSZ128mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr), + get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); + case X86::VMOVAPSZ256mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr), + get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); + case X86::VMOVUPSZ256mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), + get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::TEST8ri_NOREX: MI.setDesc(get(X86::TEST8ri)); return true; @@ -5801,6 +6949,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( switch (MI.getOpcode()) { case X86::INSERTPSrr: case X86::VINSERTPSrr: + case X86::VINSERTPSZrr: // Attempt to convert the load of inserted vector into a fold load // of a single float. 
if (OpNum == 2) { @@ -5814,8 +6963,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = - (MI.getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm - : X86::INSERTPSrm); + (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm : + (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm : + X86::INSERTPSrm; MachineInstr *NewMI = FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); @@ -5825,6 +6975,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( break; case X86::MOVHLPSrr: case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: // Move the upper 64-bits of the second operand to the lower 64-bits. // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. @@ -5832,8 +6983,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); if (Size <= RCSize && 8 <= Align) { unsigned NewOpCode = - (MI.getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm - : X86::MOVLPSrm); + (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : + (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm : + X86::MOVLPSrm; MachineInstr *NewMI = FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8); return NewMI; @@ -6042,12 +7194,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::CVTSI2SD64rm: case X86::CVTSD2SSrr: case X86::CVTSD2SSrm: - case X86::Int_CVTSD2SSrr: - case X86::Int_CVTSD2SSrm: case X86::CVTSS2SDrr: case X86::CVTSS2SDrm: - case X86::Int_CVTSS2SDrr: - case X86::Int_CVTSS2SDrm: case X86::MOVHPDrm: case X86::MOVHPSrm: case X86::MOVLPDrm: @@ -6058,10 +7206,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::RCPSSm_Int: case X86::ROUNDSDr: case X86::ROUNDSDm: - case X86::ROUNDSDr_Int: case X86::ROUNDSSr: case X86::ROUNDSSm: - case X86::ROUNDSSr_Int: case X86::RSQRTSSr: case X86::RSQRTSSm: case X86::RSQRTSSr_Int: @@ -6134,28 +7280,95 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::Int_VCVTSS2SDrr: case X86::Int_VCVTSS2SDrm: case X86::VRCPSSr: + case X86::VRCPSSr_Int: case X86::VRCPSSm: case X86::VRCPSSm_Int: case X86::VROUNDSDr: case X86::VROUNDSDm: case X86::VROUNDSDr_Int: + case X86::VROUNDSDm_Int: case X86::VROUNDSSr: case X86::VROUNDSSm: case X86::VROUNDSSr_Int: + case X86::VROUNDSSm_Int: case X86::VRSQRTSSr: + case X86::VRSQRTSSr_Int: case X86::VRSQRTSSm: case X86::VRSQRTSSm_Int: case X86::VSQRTSSr: + case X86::VSQRTSSr_Int: case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSDr: + case X86::VSQRTSDr_Int: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: - // AVX-512 + // AVX-512 + case X86::VCVTSI2SSZrr: + case X86::VCVTSI2SSZrm: + case X86::VCVTSI2SSZrr_Int: + case X86::VCVTSI2SSZrrb_Int: + case X86::VCVTSI2SSZrm_Int: + case X86::VCVTSI642SSZrr: + case X86::VCVTSI642SSZrm: + case X86::VCVTSI642SSZrr_Int: + case X86::VCVTSI642SSZrrb_Int: + case X86::VCVTSI642SSZrm_Int: + case X86::VCVTSI2SDZrr: + case X86::VCVTSI2SDZrm: + case X86::VCVTSI2SDZrr_Int: + case X86::VCVTSI2SDZrrb_Int: + case X86::VCVTSI2SDZrm_Int: + case X86::VCVTSI642SDZrr: + case X86::VCVTSI642SDZrm: + case X86::VCVTSI642SDZrr_Int: + case X86::VCVTSI642SDZrrb_Int: + case X86::VCVTSI642SDZrm_Int: + case X86::VCVTUSI2SSZrr: + case X86::VCVTUSI2SSZrm: + case X86::VCVTUSI2SSZrr_Int: + case X86::VCVTUSI2SSZrrb_Int: + case X86::VCVTUSI2SSZrm_Int: + case X86::VCVTUSI642SSZrr: + 
case X86::VCVTUSI642SSZrm: + case X86::VCVTUSI642SSZrr_Int: + case X86::VCVTUSI642SSZrrb_Int: + case X86::VCVTUSI642SSZrm_Int: + case X86::VCVTUSI2SDZrr: + case X86::VCVTUSI2SDZrm: + case X86::VCVTUSI2SDZrr_Int: + case X86::VCVTUSI2SDZrm_Int: + case X86::VCVTUSI642SDZrr: + case X86::VCVTUSI642SDZrm: + case X86::VCVTUSI642SDZrr_Int: + case X86::VCVTUSI642SDZrrb_Int: + case X86::VCVTUSI642SDZrm_Int: case X86::VCVTSD2SSZrr: + case X86::VCVTSD2SSZrrb: case X86::VCVTSD2SSZrm: case X86::VCVTSS2SDZrr: + case X86::VCVTSS2SDZrrb: case X86::VCVTSS2SDZrm: + case X86::VRNDSCALESDr: + case X86::VRNDSCALESDrb: + case X86::VRNDSCALESDm: + case X86::VRNDSCALESSr: + case X86::VRNDSCALESSrb: + case X86::VRNDSCALESSm: + case X86::VRCP14SSrr: + case X86::VRCP14SSrm: + case X86::VRSQRT14SSrr: + case X86::VRSQRT14SSrm: + case X86::VSQRTSSZr: + case X86::VSQRTSSZr_Int: + case X86::VSQRTSSZrb_Int: + case X86::VSQRTSSZm: + case X86::VSQRTSSZm_Int: + case X86::VSQRTSDZr: + case X86::VSQRTSDZr_Int: + case X86::VSQRTSDZrb_Int: + case X86::VSQRTSDZm: + case X86::VSQRTSDZm_Int: return true; } @@ -6233,9 +7446,17 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) return nullptr; - const MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned Size = MFI->getObjectSize(FrameIndex); - unsigned Alignment = MFI->getObjectAlignment(FrameIndex); + // Don't fold subreg spills, or reloads that use a high subreg. + for (auto Op : Ops) { + MachineOperand &MO = MI.getOperand(Op); + auto SubReg = MO.getSubReg(); + if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi)) + return nullptr; + } + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + unsigned Size = MFI.getObjectSize(FrameIndex); + unsigned Alignment = MFI.getObjectAlignment(FrameIndex); // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) @@ -6295,15 +7516,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // instruction isn't scalar (SS). 
switch (UserOpc) { case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int: + case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int: case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int: + case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int: + case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: - case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: - case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: - case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: - case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: - case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: - case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: + case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int: + case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int: + case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int: + case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int: + case X86::VFMADD231SSr_Int: case X86::VFNMADD231SSr_Int: + case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int: + case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int: + case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int: + case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int: + case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int: + case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int: + case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int: + case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int: + case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int: return false; default: return true; @@ -6317,15 +7549,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // instruction isn't scalar (SD). 
switch (UserOpc) { case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int: + case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int: case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int: + case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int: + case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: - case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: - case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: - case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: - case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: - case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: - case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: + case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int: + case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int: + case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int: + case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int: + case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int: + case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int: + case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int: + case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int: + case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int: + case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int: + case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int: + case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int: + case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int: + case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int: return false; default: return true; @@ -6339,6 +7582,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, LiveIntervals *LIS) const { + + // TODO: Support the case where LoadMI loads a wide register, but MI + // only uses a subreg. + for (auto Op : Ops) { + if (MI.getOperand(Op).getSubReg()) + return nullptr; + } + // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI.getDesc().getNumOperands(); int FrameIndex; @@ -6376,9 +7627,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Alignment = 16; break; case X86::FsFLD0SD: + case X86::AVX512_FsFLD0SD: Alignment = 8; break; case X86::FsFLD0SS: + case X86::AVX512_FsFLD0SS: Alignment = 4; break; default: @@ -6415,7 +7668,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: case X86::FsFLD0SD: - case X86::FsFLD0SS: { + case X86::AVX512_FsFLD0SD: + case X86::FsFLD0SS: + case X86::AVX512_FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. 
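
Annotation: the two switches above whitelist the scalar _Int operations into which a MOVSS/MOVSD-style load may still be folded. Such a load fills only 4 or 8 bytes of a wider virtual register, so folding it into a user that reads the whole vector would change what the user observes; only users that read just the low element of that operand are allowed. A tiny standalone model of the guard, with hypothetical names and a shortened whitelist:

// Minimal model of the partial-load fold guard above.
#include <cstdio>
#include <initializer_list>
#include <string>

static bool readsOnlyLowElement(const std::string &UserOpc) {
  // Stand-in for the two whitelists above (SS shown; SD is analogous).
  for (const char *Op : {"ADDSSrr_Int", "SUBSSrr_Int", "MULSSrr_Int",
                         "DIVSSrr_Int", "MINSSrr_Int", "MAXSSrr_Int"})
    if (UserOpc == Op)
      return true;
  return false;
}

// Returns true when the fold must be rejected.
static bool isNonFoldablePartialLoad(unsigned LoadBytes, unsigned RegBytes,
                                     const std::string &UserOpc) {
  if (LoadBytes >= RegBytes)
    return false;                        // the load fills the whole register
  return !readsOnlyLowElement(UserOpc);  // partial load: scalar users only
}

int main() {
  std::printf("%d\n", isNonFoldablePartialLoad(4, 16, "ADDPSrr"));     // 1: reject
  std::printf("%d\n", isNonFoldablePartialLoad(4, 16, "ADDSSrr_Int")); // 0: fold ok
}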
@@ -6441,9 +7696,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI.getOpcode(); - if (Opc == X86::FsFLD0SS) + if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); - else if (Opc == X86::FsFLD0SD) + else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); @@ -6649,7 +7904,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, @@ -6694,7 +7949,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; SDNode *Store = @@ -6746,8 +8001,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVSDrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: - case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: @@ -6757,8 +8010,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSDrm: - case X86::FsVMOVAPSrm: - case X86::FsVMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -6776,6 +8027,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVSDZrm: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVAPDZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVDQU8Z128rm: @@ -6786,6 +8039,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVDQU64Z128rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPSZ256rm: + case X86::VMOVAPSZ256rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVDQU8Z256rm: @@ -6823,8 +8078,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVSDrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: - case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: @@ -6834,8 +8087,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSDrm: - case X86::FsVMOVAPSrm: - case X86::FsVMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -6853,6 +8104,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVSDZrm: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVAPDZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVDQU8Z128rm: @@ -6863,6 +8116,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode 
*Load1, SDNode *Load2, case X86::VMOVDQU64Z128rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPSZ256rm: + case X86::VMOVAPSZ256rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVDQU8Z256rm: @@ -6960,8 +8215,8 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } -bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First, - MachineInstr &Second) const { +bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First, + const MachineInstr &Second) const { // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -7120,7 +8375,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First, } bool X86InstrInfo:: -ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { +reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); Cond[0].setImm(GetOppositeBranchCondition(CC)); @@ -7168,7 +8423,10 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, - { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, + { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, + { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr }, + { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm }, + { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, @@ -7184,7 +8442,10 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, - { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, + { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, + { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr }, + { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm }, + { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm }, { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, @@ -7200,7 +8461,26 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, - { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }, + // AVX512 support + { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr }, + { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, + { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, + { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr }, + { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr }, + { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr }, + { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm }, + { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm }, + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r }, + { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m }, + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r 
}, + { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m }, + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr }, + { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r }, + { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr }, + { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm }, }; static const uint16_t ReplaceableInstrsAVX2[][3] = { @@ -7224,22 +8504,257 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, - { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm} + { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}, + { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 }, +}; + +static const uint16_t ReplaceableInstrsAVX512[][4] = { + // Two integer columns for 64-bit and 32-bit elements. + //PackedSingle PackedDouble PackedInt PackedInt + { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr }, + { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm }, + { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr }, + { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr }, + { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm }, + { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr }, + { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm }, + { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr }, + { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr }, + { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm }, + { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr }, + { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm }, + { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr }, + { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr }, + { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm }, +}; + +static const uint16_t ReplaceableInstrsAVX512DQ[][4] = { + // Two integer columns for 64-bit and 32-bit elements. 
+ //PackedSingle PackedDouble PackedInt PackedInt + { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm }, + { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr }, + { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm }, + { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr }, + { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm }, + { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr }, + { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm }, + { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr }, + { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm }, + { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr }, + { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm }, + { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr }, + { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm }, + { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr }, + { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm }, + { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr }, + { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm }, + { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr }, + { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm }, + { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr }, + { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm }, + { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr }, + { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm }, + { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr }, +}; + +static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = { + // Two integer columns for 64-bit and 32-bit elements. 
+ //PackedSingle PackedDouble + //PackedInt PackedInt + { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk, + X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk }, + { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz, + X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz }, + { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk, + X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk }, + { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz, + X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz }, + { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk, + X86::VPANDQZ128rmk, X86::VPANDDZ128rmk }, + { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz, + X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz }, + { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk, + X86::VPANDQZ128rrk, X86::VPANDDZ128rrk }, + { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz, + X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz }, + { X86::VORPSZ128rmk, X86::VORPDZ128rmk, + X86::VPORQZ128rmk, X86::VPORDZ128rmk }, + { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz, + X86::VPORQZ128rmkz, X86::VPORDZ128rmkz }, + { X86::VORPSZ128rrk, X86::VORPDZ128rrk, + X86::VPORQZ128rrk, X86::VPORDZ128rrk }, + { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz, + X86::VPORQZ128rrkz, X86::VPORDZ128rrkz }, + { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk, + X86::VPXORQZ128rmk, X86::VPXORDZ128rmk }, + { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz, + X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz }, + { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk, + X86::VPXORQZ128rrk, X86::VPXORDZ128rrk }, + { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz, + X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz }, + { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk, + X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk }, + { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz, + X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz }, + { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk, + X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk }, + { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz, + X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz }, + { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk, + X86::VPANDQZ256rmk, X86::VPANDDZ256rmk }, + { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz, + X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz }, + { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk, + X86::VPANDQZ256rrk, X86::VPANDDZ256rrk }, + { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz, + X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz }, + { X86::VORPSZ256rmk, X86::VORPDZ256rmk, + X86::VPORQZ256rmk, X86::VPORDZ256rmk }, + { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz, + X86::VPORQZ256rmkz, X86::VPORDZ256rmkz }, + { X86::VORPSZ256rrk, X86::VORPDZ256rrk, + X86::VPORQZ256rrk, X86::VPORDZ256rrk }, + { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz, + X86::VPORQZ256rrkz, X86::VPORDZ256rrkz }, + { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk, + X86::VPXORQZ256rmk, X86::VPXORDZ256rmk }, + { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz, + X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz }, + { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk, + X86::VPXORQZ256rrk, X86::VPXORDZ256rrk }, + { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz, + X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz }, + { X86::VANDNPSZrmk, X86::VANDNPDZrmk, + X86::VPANDNQZrmk, X86::VPANDNDZrmk }, + { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz, + X86::VPANDNQZrmkz, X86::VPANDNDZrmkz }, + { X86::VANDNPSZrrk, X86::VANDNPDZrrk, + X86::VPANDNQZrrk, X86::VPANDNDZrrk }, + { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz, + X86::VPANDNQZrrkz, X86::VPANDNDZrrkz }, + { X86::VANDPSZrmk, X86::VANDPDZrmk, + X86::VPANDQZrmk, X86::VPANDDZrmk }, + { X86::VANDPSZrmkz, X86::VANDPDZrmkz, + X86::VPANDQZrmkz, X86::VPANDDZrmkz }, + { X86::VANDPSZrrk, X86::VANDPDZrrk, + X86::VPANDQZrrk, X86::VPANDDZrrk }, + { X86::VANDPSZrrkz, X86::VANDPDZrrkz, + X86::VPANDQZrrkz, 
X86::VPANDDZrrkz }, + { X86::VORPSZrmk, X86::VORPDZrmk, + X86::VPORQZrmk, X86::VPORDZrmk }, + { X86::VORPSZrmkz, X86::VORPDZrmkz, + X86::VPORQZrmkz, X86::VPORDZrmkz }, + { X86::VORPSZrrk, X86::VORPDZrrk, + X86::VPORQZrrk, X86::VPORDZrrk }, + { X86::VORPSZrrkz, X86::VORPDZrrkz, + X86::VPORQZrrkz, X86::VPORDZrrkz }, + { X86::VXORPSZrmk, X86::VXORPDZrmk, + X86::VPXORQZrmk, X86::VPXORDZrmk }, + { X86::VXORPSZrmkz, X86::VXORPDZrmkz, + X86::VPXORQZrmkz, X86::VPXORDZrmkz }, + { X86::VXORPSZrrk, X86::VXORPDZrrk, + X86::VPXORQZrrk, X86::VPXORDZrrk }, + { X86::VXORPSZrrkz, X86::VXORPDZrrkz, + X86::VPXORQZrrkz, X86::VPXORDZrrkz }, + // Broadcast loads can be handled the same as masked operations to avoid + // changing element size. + { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb, + X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb }, + { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb, + X86::VPANDQZ128rmb, X86::VPANDDZ128rmb }, + { X86::VORPSZ128rmb, X86::VORPDZ128rmb, + X86::VPORQZ128rmb, X86::VPORDZ128rmb }, + { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb, + X86::VPXORQZ128rmb, X86::VPXORDZ128rmb }, + { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb, + X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb }, + { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb, + X86::VPANDQZ256rmb, X86::VPANDDZ256rmb }, + { X86::VORPSZ256rmb, X86::VORPDZ256rmb, + X86::VPORQZ256rmb, X86::VPORDZ256rmb }, + { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb, + X86::VPXORQZ256rmb, X86::VPXORDZ256rmb }, + { X86::VANDNPSZrmb, X86::VANDNPDZrmb, + X86::VPANDNQZrmb, X86::VPANDNDZrmb }, + { X86::VANDPSZrmb, X86::VANDPDZrmb, + X86::VPANDQZrmb, X86::VPANDDZrmb }, + { X86::VANDPSZrmb, X86::VANDPDZrmb, + X86::VPANDQZrmb, X86::VPANDDZrmb }, + { X86::VORPSZrmb, X86::VORPDZrmb, + X86::VPORQZrmb, X86::VPORDZrmb }, + { X86::VXORPSZrmb, X86::VXORPDZrmb, + X86::VPXORQZrmb, X86::VPXORDZrmb }, + { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk, + X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk }, + { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk, + X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk }, + { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk, + X86::VPORQZ128rmbk, X86::VPORDZ128rmbk }, + { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk, + X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk }, + { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk, + X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk }, + { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk, + X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk }, + { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk, + X86::VPORQZ256rmbk, X86::VPORDZ256rmbk }, + { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk, + X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk }, + { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk, + X86::VPANDNQZrmbk, X86::VPANDNDZrmbk }, + { X86::VANDPSZrmbk, X86::VANDPDZrmbk, + X86::VPANDQZrmbk, X86::VPANDDZrmbk }, + { X86::VANDPSZrmbk, X86::VANDPDZrmbk, + X86::VPANDQZrmbk, X86::VPANDDZrmbk }, + { X86::VORPSZrmbk, X86::VORPDZrmbk, + X86::VPORQZrmbk, X86::VPORDZrmbk }, + { X86::VXORPSZrmbk, X86::VXORPDZrmbk, + X86::VPXORQZrmbk, X86::VPXORDZrmbk }, + { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz, + X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz}, + { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz, + X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz }, + { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz, + X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz }, + { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz, + X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz }, + { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz, + X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz}, + { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz, + X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz }, + { X86::VORPSZ256rmbkz, 
X86::VORPDZ256rmbkz, + X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz }, + { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz, + X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz }, + { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz, + X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz }, + { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz, + X86::VPANDQZrmbkz, X86::VPANDDZrmbkz }, + { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz, + X86::VPANDQZrmbkz, X86::VPANDDZrmbkz }, + { X86::VORPSZrmbkz, X86::VORPDZrmbkz, + X86::VPORQZrmbkz, X86::VPORDZrmbkz }, + { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz, + X86::VPXORQZrmbkz, X86::VPXORDZrmbkz }, }; // FIXME: Some shuffle and unpack instructions have equivalents in different // domains, but they require a bit more work than just switching opcodes. -static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (const uint16_t (&Row)[3] : ReplaceableInstrs) +static const uint16_t *lookup(unsigned opcode, unsigned domain, + ArrayRef<uint16_t[3]> Table) { + for (const uint16_t (&Row)[3] : Table) if (Row[domain-1] == opcode) return Row; return nullptr; } -static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) - if (Row[domain-1] == opcode) +static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain, + ArrayRef<uint16_t[4]> Table) { + // If this is the integer domain make sure to check both integer columns. + for (const uint16_t (&Row)[4] : Table) + if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode)) return Row; return nullptr; } @@ -7247,12 +8762,25 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { std::pair<uint16_t, uint16_t> X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; - bool hasAVX2 = Subtarget.hasAVX2(); + unsigned opcode = MI.getOpcode(); uint16_t validDomains = 0; - if (domain && lookup(MI.getOpcode(), domain)) - validDomains = 0xe; - else if (domain && lookupAVX2(MI.getOpcode(), domain)) - validDomains = hasAVX2 ? 0xe : 0x6; + if (domain) { + if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) { + validDomains = 0xe; + } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { + validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; + } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) { + validDomains = 0xe; + } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) { + validDomains = Subtarget.hasDQI() ? 0xe : 0x8; + } else if (const uint16_t *table = lookupAVX512(opcode, domain, + ReplaceableInstrsAVX512DQMasked)) { + if (domain == 1 || (domain == 3 && table[3] == opcode)) + validDomains = Subtarget.hasDQI() ? 0xa : 0x8; + else + validDomains = Subtarget.hasDQI() ? 
0xc : 0x8; + } + } return std::make_pair(domain, validDomains); } @@ -7260,11 +8788,32 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); - const uint16_t *table = lookup(MI.getOpcode(), dom); + const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs); if (!table) { // try the other table assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); - table = lookupAVX2(MI.getOpcode(), dom); + table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); + } + if (!table) { // try the AVX512 table + assert(Subtarget.hasAVX512() && "Requires AVX-512"); + table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512); + // Don't change integer Q instructions to D instructions. + if (table && Domain == 3 && table[3] == MI.getOpcode()) + Domain = 4; + } + if (!table) { // try the AVX512DQ table + assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); + table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ); + // Don't change integer Q instructions to D instructions and + // use D intructions if we started with a PS instruction. + if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) + Domain = 4; + } + if (!table) { // try the AVX512DQMasked table + assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); + table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked); + if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) + Domain = 4; } assert(table && "Cannot change domain"); MI.setDesc(get(table[Domain - 1])); @@ -7275,32 +8824,6 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } -// This code must remain in sync with getJumpInstrTableEntryBound in this class! -// In particular, getJumpInstrTableEntryBound must always return an upper bound -// on the encoding lengths of the instructions generated by -// getUnconditionalBranch and getTrap. -void X86InstrInfo::getUnconditionalBranch( - MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { - Branch.setOpcode(X86::JMP_1); - Branch.addOperand(MCOperand::createExpr(BranchTarget)); -} - -// This code must remain in sync with getJumpInstrTableEntryBound in this class! -// In particular, getJumpInstrTableEntryBound must always return an upper bound -// on the encoding lengths of the instructions generated by -// getUnconditionalBranch and getTrap. -void X86InstrInfo::getTrap(MCInst &MI) const { - MI.setOpcode(X86::TRAP); -} - -// See getTrap and getUnconditionalBranch for conditions on the value returned -// by this function. -unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { - // 5 bytes suffice: JMP_4 Symbol@PLT is uses 1 byte (E9) for the JMP_4 and 4 - // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B). 
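
Annotation: the execution-domain changes above move from 3-column tables (PackedSingle, PackedDouble, PackedInt) to 4-column AVX-512 tables with separate 64-bit and 32-bit integer columns, and encode the reachable domains as a bitmask (bit N set means domain N is available: 0xe = all three, 0x8 = integer only, 0xa = PS plus 32-bit integer, 0xc = PD plus 64-bit integer). The sketch below is a simplified standalone model of the DQ tables only: opcode numbers are made up, the plain and masked DQ tables are collapsed into one Masked flag, and the base AVX-512 table (which always yields 0xe) is not modeled.

// Standalone model of the 4-column lookup and validDomains mask above.
#include <cstdint>
#include <cstdio>

static const uint16_t Table[][4] = {
    // PS      PD      Int(64) Int(32)  -- hypothetical opcode numbers
    {0x100, 0x101, 0x102, 0x103},
    {0x200, 0x201, 0x202, 0x203},
};

static const uint16_t *lookup(unsigned Opcode, unsigned Domain) {
  for (const uint16_t (&Row)[4] : Table)
    // For the integer domain (3) an opcode may sit in either integer column.
    if (Row[Domain - 1] == Opcode || (Domain == 3 && Row[3] == Opcode))
      return Row;
  return nullptr;
}

static uint16_t validDomains(unsigned Opcode, unsigned Domain, bool HasDQI,
                             bool Masked) {
  const uint16_t *Row = lookup(Opcode, Domain);
  if (!Row)
    return 0;
  if (!HasDQI)
    return 0x8; // FP forms need DQI here, so only the integer domain remains.
  if (!Masked)
    return 0xe; // PS, PD and integer are all interchangeable.
  // Masked and broadcast forms must keep their element size: PS pairs with the
  // 32-bit integer column (0xa), PD with the 64-bit column (0xc).
  bool Is32Bit = (Domain == 1) || (Domain == 3 && Row[3] == Opcode);
  return Is32Bit ? 0xa : 0xc;
}

int main() {
  std::printf("0x%x\n", validDomains(0x103, 3, /*HasDQI=*/true, /*Masked=*/true)); // 0xa
  std::printf("0x%x\n", validDomains(0x202, 3, /*HasDQI=*/true, /*Masked=*/true)); // 0xc
}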
- return 5; -} - bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; @@ -7934,6 +9457,28 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } +bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case X86::TCRETURNdi: + case X86::TCRETURNmi: + case X86::TCRETURNri: + case X86::TCRETURNdi64: + case X86::TCRETURNmi64: + case X86::TCRETURNri64: + case X86::TAILJMPd: + case X86::TAILJMPm: + case X86::TAILJMPr: + case X86::TAILJMPd64: + case X86::TAILJMPm64: + case X86::TAILJMPr64: + case X86::TAILJMPm64_REX: + case X86::TAILJMPr64_REX: + return true; + default: + return false; + } +} + namespace { /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. @@ -7991,7 +9536,7 @@ namespace { return true; } - const char *getPassName() const override { + StringRef getPassName() const override { return "X86 PIC Global Base Reg Initialization"; } @@ -8105,7 +9650,7 @@ namespace { return Copy; } - const char *getPassName() const override { + StringRef getPassName() const override { return "Local Dynamic TLS Access Clean-up"; } |