Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86InstrInfo.cpp')
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrInfo.cpp | 3727
1 file changed, 2636 insertions, 1091 deletions
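
For context, the bulk of this diff edits X86InstrInfo's register-to-memory folding tables: each row maps a register-form opcode to its memory-form equivalent plus flags such as TB_FOLDED_STORE, TB_ALIGN_16/32/64, TB_ALIGN_NONE, and TB_NO_REVERSE (which marks entries whose memory form should not be converted back to the register form, typically because the memory access is narrower than the full register, as with the PMOVZX/PMOVSX and scalar _Int entries below). The following is a minimal, self-contained C++ sketch of that table shape and its lookup; the names FoldTableEntry, lookupFold, and the flag values are illustrative stand-ins for this sketch only, not LLVM's actual identifiers or encoding.

// Minimal sketch of a register->memory folding table, modeled on the
// { RegOpcode, MemOpcode, Flags } rows in this diff. Names are illustrative.
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Assumed flag values for the sketch only; LLVM defines its own TB_* encoding.
enum : uint16_t {
  TB_ALIGN_16     = 1 << 0, // memory operand must be 16-byte aligned
  TB_FOLDED_STORE = 1 << 1, // folding produces a store rather than a load
  TB_NO_REVERSE   = 1 << 2, // memory form must not be unfolded back to registers
};

struct FoldTableEntry {
  uint16_t RegOpcode; // stand-in for an X86:: register-form opcode
  uint16_t MemOpcode; // stand-in for the matching memory-form opcode
  uint16_t Flags;
};

// Given a register-form opcode, find the memory-form opcode it can fold to.
std::optional<FoldTableEntry>
lookupFold(const std::vector<FoldTableEntry> &Table, uint16_t RegOpcode) {
  for (const FoldTableEntry &E : Table)
    if (E.RegOpcode == RegOpcode)
      return E;
  return std::nullopt;
}

int main() {
  // Hypothetical opcode numbers standing in for the real X86:: enum values.
  const uint16_t FAKE_ADDSS_RR_INT = 100, FAKE_ADDSS_RM_INT = 101;
  std::vector<FoldTableEntry> Table = {
      {FAKE_ADDSS_RR_INT, FAKE_ADDSS_RM_INT, TB_NO_REVERSE},
  };
  if (auto E = lookupFold(Table, FAKE_ADDSS_RR_INT))
    std::cout << "fold " << E->RegOpcode << " -> " << E->MemOpcode
              << ((E->Flags & TB_NO_REVERSE) ? " (no reverse)" : "") << "\n";
}
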
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5f0aab9..627b612 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -68,7 +68,7 @@ static cl::opt<unsigned>
UndefRegClearance("undef-reg-clearance",
cl::desc("How many idle instructions we would like before "
"certain undef register reads"),
- cl::init(64), cl::Hidden);
+ cl::init(128), cl::Hidden);
enum {
// Select which memory operand is being unfolded.
@@ -228,12 +228,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SBB64ri32, X86::SBB64mi32, 0 },
{ X86::SBB64ri8, X86::SBB64mi8, 0 },
{ X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SHL16r1, X86::SHL16m1, 0 },
{ X86::SHL16rCL, X86::SHL16mCL, 0 },
{ X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32r1, X86::SHL32m1, 0 },
{ X86::SHL32rCL, X86::SHL32mCL, 0 },
{ X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64r1, X86::SHL64m1, 0 },
{ X86::SHL64rCL, X86::SHL64mCL, 0 },
{ X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8r1, X86::SHL8m1, 0 },
{ X86::SHL8rCL, X86::SHL8mCL, 0 },
{ X86::SHL8ri, X86::SHL8mi, 0 },
{ X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
@@ -335,6 +339,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
{ X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
{ X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
@@ -380,6 +385,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
{ X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
{ X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
@@ -394,10 +400,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions
+ { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -409,8 +425,27 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (256-bit versions)
+ { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -421,6 +456,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -471,26 +515,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
{ X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
- { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
- { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
- { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
- { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
- { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
- { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
- { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
{ X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
{ X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
- { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
- { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
- { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
- { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
- { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
{ X86::MOV16rr, X86::MOV16rm, 0 },
{ X86::MOV32rr, X86::MOV32rm, 0 },
{ X86::MOV64rr, X86::MOV64rm, 0 },
@@ -499,10 +543,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
@@ -511,51 +556,53 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
{ X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
- { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
- { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
- { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
+ { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
+ { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
+ { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
{ X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
{ X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
{ X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
- { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 },
- { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 },
- { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 },
- { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 },
- { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 },
- { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 },
- { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 },
- { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 },
- { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 },
- { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 },
- { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 },
- { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
{ X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
{ X86::RCPSSr, X86::RCPSSm, 0 },
- { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
- { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
{ X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
{ X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
{ X86::SQRTSDr, X86::SQRTSDm, 0 },
- { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
- { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
{ X86::TEST16rr, X86::TEST16rm, 0 },
{ X86::TEST32rr, X86::TEST32rm, 0 },
{ X86::TEST64rr, X86::TEST64rm, 0 },
@@ -586,46 +633,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PSWAPDrr, X86::PSWAPDrm, 0 },
// AVX 128-bit versions of foldable instructions
- { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
- { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
- { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
- { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
+ { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
+ { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
{ X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE },
{ X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
{ X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 },
- { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
- { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
- { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
- { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
- { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 },
+ { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
- { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 },
- { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 },
+ { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
+ { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
{ X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
- { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
- { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
{ X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
{ X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
- { X86::VPABSBrr128, X86::VPABSBrm128, 0 },
- { X86::VPABSDrr128, X86::VPABSDrm128, 0 },
- { X86::VPABSWrr128, X86::VPABSWrm128, 0 },
+ { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VPABSBrr, X86::VPABSBrm, 0 },
+ { X86::VPABSDrr, X86::VPABSDrm, 0 },
+ { X86::VPABSWrr, X86::VPABSWrm, 0 },
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
{ X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
@@ -633,18 +681,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
- { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 },
- { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 },
- { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 },
- { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 },
- { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 },
- { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 },
- { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 },
- { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 },
- { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 },
- { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 },
- { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 },
- { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
{ X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
{ X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
@@ -661,18 +709,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
// AVX 256-bit foldable instructions
- { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
{ X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
{ X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
- { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE },
{ X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
{ X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
{ X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
{ X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
{ X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
@@ -699,31 +748,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- { X86::VPABSBrr256, X86::VPABSBrm256, 0 },
- { X86::VPABSDrr256, X86::VPABSDrm256, 0 },
- { X86::VPABSWrr256, X86::VPABSWrm256, 0 },
- { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 },
- { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 },
- { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 },
- { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 },
- { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 },
- { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 },
- { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 },
- { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 },
+ { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
+ { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
+ { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
+ { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
{ X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
{ X86::VPERMQYri, X86::VPERMQYmi, 0 },
- { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 },
- { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 },
+ { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
{ X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
{ X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
- { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 },
- { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 },
- { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 },
+ { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
{ X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
{ X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
{ X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
- { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 },
+ { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
{ X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
{ X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
{ X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
@@ -817,7 +866,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::TZMSK64rr, X86::TZMSK64rm, 0 },
// AVX-512 foldable instructions
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
{ X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
{ X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
@@ -831,12 +885,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
+ { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
+ { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
+ { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
+ { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
+ { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
+ { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
+ { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
+ { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
+ { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
+ { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+ { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
+ { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
+ { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
// AVX-512 foldable instructions (256-bit versions)
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
@@ -847,12 +920,29 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
+ { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
+ { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
+ { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
+ { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
+ { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
+ { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
+ { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
+ { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
+ { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
// AVX-512 foldable instructions (128-bit versions)
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
@@ -863,8 +953,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
+ { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
+ { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+
// F16C foldable instructions
{ X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
{ X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
@@ -896,9 +1002,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
{ X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
{ X86::ADDSDrr, X86::ADDSDrm, 0 },
- { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
{ X86::ADDSSrr, X86::ADDSSrm, 0 },
- { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
{ X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
{ X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
{ X86::AND16rr, X86::AND16rm, 0 },
@@ -970,24 +1076,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
{ X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
{ X86::DIVSDrr, X86::DIVSDrm, 0 },
- { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
{ X86::DIVSSrr, X86::DIVSSrm, 0 },
- { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
-
- // Do not fold Fs* scalar logical op loads because there are no scalar
- // load variants for these instructions. When folded, the load is required
- // to be 128-bits, so the load size would not match.
-
- { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
- { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
- { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 },
- { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 },
- { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 },
- { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 },
- { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 },
- { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 },
{ X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
{ X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
{ X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
@@ -995,34 +1088,42 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::IMUL16rr, X86::IMUL16rm, 0 },
{ X86::IMUL32rr, X86::IMUL32rm, 0 },
{ X86::IMUL64rr, X86::IMUL64rm, 0 },
- { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
- { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
- { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE },
{ X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
{ X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
{ X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
{ X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
- { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE },
{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
{ X86::MAXSDrr, X86::MAXSDrm, 0 },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
+ { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
{ X86::MAXSSrr, X86::MAXSSrm, 0 },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
+ { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
{ X86::MINSDrr, X86::MINSDrm, 0 },
- { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
+ { X86::MINCSDrr, X86::MINCSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
{ X86::MINSSrr, X86::MINSSrm, 0 },
- { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
+ { X86::MINCSSrr, X86::MINCSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
{ X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
{ X86::MULSDrr, X86::MULSDrm, 0 },
- { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
{ X86::MULSSrr, X86::MULSSrm, 0 },
- { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
{ X86::OR16rr, X86::OR16rm, 0 },
{ X86::OR32rr, X86::OR32rm, 0 },
{ X86::OR64rr, X86::OR64rm, 0 },
@@ -1067,7 +1168,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PINSRDrr, X86::PINSRDrm, 0 },
{ X86::PINSRQrr, X86::PINSRQrm, 0 },
{ X86::PINSRWrri, X86::PINSRWrmi, 0 },
- { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
+ { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
@@ -1082,7 +1183,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
{ X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
- { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 },
+ { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
{ X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
{ X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
{ X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
@@ -1119,8 +1220,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
- { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
- { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
+ { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
{ X86::SBB32rr, X86::SBB32rm, 0 },
{ X86::SBB64rr, X86::SBB64rm, 0 },
{ X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
@@ -1132,9 +1233,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
{ X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
{ X86::SUBSDrr, X86::SUBSDrm, 0 },
- { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
{ X86::SUBSSrr, X86::SUBSSrm, 0 },
- { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
// FIXME: TEST*rr -> swapped operand of TEST*mr.
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
@@ -1240,7 +1341,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX 128-bit versions of foldable instructions
{ X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
- { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 },
+ { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE },
{ X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
{ X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
@@ -1250,21 +1351,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
- { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
- { X86::VRCPSSr, X86::VRCPSSm, 0 },
- { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 },
- { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
- { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 },
- { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
- { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 },
- { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
- { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 },
+ { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
- { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
{ X86::VADDSSrr, X86::VADDSSrm, 0 },
- { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
{ X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
{ X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
{ X86::VANDNPDrr, X86::VANDNPDrm, 0 },
@@ -1282,48 +1375,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
{ X86::VDIVPSrr, X86::VDIVPSrm, 0 },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
- { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
- { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
{ X86::VDPPDrri, X86::VDPPDrmi, 0 },
{ X86::VDPPSrri, X86::VDPPSrmi, 0 },
- // Do not fold VFs* loads because there are no scalar load variants for
- // these instructions. When folded, the load is required to be 128-bits, so
- // the load size would not match.
- { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 },
- { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 },
- { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 },
- { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 },
- { X86::VFvORPDrr, X86::VFvORPDrm, 0 },
- { X86::VFvORPSrr, X86::VFvORPSrm, 0 },
- { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 },
- { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 },
{ X86::VHADDPDrr, X86::VHADDPDrm, 0 },
{ X86::VHADDPSrr, X86::VHADDPSrm, 0 },
{ X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
{ X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
- { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 },
- { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 },
+ { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE },
+ { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE },
+ { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
+ { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
+ { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
+ { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
{ X86::VMAXPDrr, X86::VMAXPDrm, 0 },
{ X86::VMAXPSrr, X86::VMAXPSrm, 0 },
{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
- { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
- { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
+ { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
+ { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
+ { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
{ X86::VMINPDrr, X86::VMINPDrm, 0 },
{ X86::VMINPSrr, X86::VMINPSrm, 0 },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },
- { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
- { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
{ X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },
{ X86::VMULPSrr, X86::VMULPSrm, 0 },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },
- { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },
- { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
{ X86::VORPDrr, X86::VORPDrm, 0 },
{ X86::VORPSrr, X86::VORPSrm, 0 },
{ X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
@@ -1366,7 +1456,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPINSRDrr, X86::VPINSRDrm, 0 },
{ X86::VPINSRQrr, X86::VPINSRQrm, 0 },
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
- { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
+ { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
@@ -1381,7 +1471,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
{ X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMULDQrr, X86::VPMULDQrm, 0 },
- { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 },
+ { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
{ X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
{ X86::VPMULHWrr, X86::VPMULHWrm, 0 },
{ X86::VPMULLDrr, X86::VPMULLDrm, 0 },
@@ -1418,16 +1508,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
{ X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
{ X86::VROUNDSSr, X86::VROUNDSSm, 0 },
+ { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
{ X86::VSUBPSrr, X86::VSUBPSrm, 0 },
{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
- { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
- { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
{ X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
{ X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
{ X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
@@ -1458,8 +1558,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
{ X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
{ X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
+ { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
{ X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
{ X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
+ { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
{ X86::VMINPDYrr, X86::VMINPDYrm, 0 },
{ X86::VMINPSYrr, X86::VMINPSYrm, 0 },
{ X86::VMULPDYrr, X86::VMULPDYrm, 0 },
@@ -1520,7 +1624,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
{ X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
{ X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
- { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 },
+ { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
{ X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
{ X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
{ X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
@@ -1536,7 +1640,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
{ X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
- { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 },
+ { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
{ X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
{ X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
{ X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
@@ -1559,8 +1663,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
{ X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
{ X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
- { X86::VPSRAVD_Intrr, X86::VPSRAVD_Intrm, 0 },
- { X86::VPSRAVD_IntYrr, X86::VPSRAVD_IntYrm, 0 },
{ X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
{ X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
{ X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
@@ -1588,37 +1690,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
{ X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
@@ -1678,38 +1788,107 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::ADOX64rr, X86::ADOX64rm, 0 },
// AVX-512 foldable instructions
- { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
- { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
- { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, 0 },
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDSDZrr, X86::VADDSDZrm, 0 },
- { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, 0 },
- { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
- { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
- { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
- { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, 0 },
- { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
- { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, 0 },
- { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
- { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
- { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
- { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, 0 },
- { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
- { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, 0 },
- { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
+ { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
+ { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
+ { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
+ { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
+ { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
+ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
+ { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
{ X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
- { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
- { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
{ X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
- { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, 0 },
- { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
- { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
- { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
+ { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
+ { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
+ { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
+ { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
+ { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
+ { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
+ { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
{ X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
+ { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
+ { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
+ { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
+ { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
+ { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
+ { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
+ { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
+ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
+ { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
+ { X86::VORPDZrr, X86::VORPDZrm, 0 },
+ { X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
{ X86::VPADDDZrr, X86::VPADDDZrm, 0 },
{ X86::VPADDQZrr, X86::VPADDQZrm, 0 },
- { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
+ { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
+ { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
+ { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
+ { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
+ { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
+ { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
+ { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
+ { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
+ { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
+ { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
+ { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
+ { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
+ { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
+ { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
+ { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
+ { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
+ { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
+ { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
+ { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
+ { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
+ { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
+ { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
+ { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
@@ -1719,31 +1898,297 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
+ { X86::VPORDZrr, X86::VPORDZrm, 0 },
+ { X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
{ X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
+ { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
+ { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
+ { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
+ { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
+ { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
+ { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
+ { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
+ { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
+ { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
+ { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
+ { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
+ { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
+ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
+ { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
+ { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
- { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
- { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
-
- // AVX-512{F,VL} foldable instructions
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
+ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
+ { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
+ { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
+ { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
// AVX-512{F,VL} foldable instructions
{ X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
{ X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
{ X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
{ X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
+ { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
+ { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
+ { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
+ { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
+ { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
+ { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
+ { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
+ { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
+ { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
+ { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
+ { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
+ { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
+ { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
+ { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
+ { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
+ { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
+ { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
+ { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
+ { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
+ { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
+ { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
+ { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
+ { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
+ { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
+ { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
+ { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
+ { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
+ { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
+ { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
+ { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
+ { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
+ { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
+ { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
+ { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
+ { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
+ { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
+ { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
+ { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
+ { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
+ { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
+ { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
+ { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
+ { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
+ { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
+ { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
+ { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
+ { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
+ { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
+ { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
+ { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
+ { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
+ { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
+ { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
+ { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
+ { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
+ { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
+ { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
+ { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
+ { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
+ { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
+ { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
+ { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
+ { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
+ { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
+ { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
+ { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
+ { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
+ { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
+ { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
+ { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
+ { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
+ { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
+ { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
+ { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
+ { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
+ { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
+ { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
+ { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
+ { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
+ { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
+ { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
+ { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
+ { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
+ { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
+ { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
+ { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
+ { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
+ { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
+ { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
+ { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
+ { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
+ { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
+ { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
+ { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
+ { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
+ { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
+ { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
+ { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
+ { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
+ { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
+ { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
+ { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
+
+ // AVX-512 masked foldable instructions
+ { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
+ { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
+ { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
+ { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
+ { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
+ { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
+ { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
+ { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
+ { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
+ { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
+ { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
+ { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
+ { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+ { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
+ { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
+ { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+
+ // AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
+ { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
+ { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
+ { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
+ { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
+ { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
+ { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
+ { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
+ { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
+ { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
+ { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
+ { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+
+ // AVX-512VL 128-bit masked foldable instructions
+ { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
+ { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
+ { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
+ { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
// AES foldable instructions
{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
@@ -1773,170 +2218,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
}
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
- // FMA foldable instructions
- { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
- { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
- { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
- { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
- { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
- { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },
- { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
- { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
- { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE },
- { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE },
- { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE },
- { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE },
- { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE },
- { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE },
- { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE },
- { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE },
- { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE },
- { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
- { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
- { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
- { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
- { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
- { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },
- { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
- { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
- { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE },
- { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE },
- { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE },
- { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE },
- { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE },
- { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE },
- { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE },
- { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE },
- { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE },
- { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
- { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
- { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
- { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
- { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
- { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },
- { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
- { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
- { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE },
- { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE },
- { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE },
- { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE },
- { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE },
- { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE },
-
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
{ X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
@@ -1947,11 +2269,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
{ X86::VPPERMrrr, X86::VPPERMrrm, 0 },
- // AVX-512 VPERMI instructions with 3 source operands.
- { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
- { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
- { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
- { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ // AVX-512 instructions with 3 source operands.
{ X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
{ X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
{ X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
@@ -1961,45 +2279,349 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
- // AVX-512 arithmetic instructions
- { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
+ { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
+ { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
+ { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
+ { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
+ { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
+ { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
+
+ // AVX-512VL 256-bit instructions with 3 source operands.
+ { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
+ { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
+ { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
+ { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
+ { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
+ { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
+ { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
+ { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
+ { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
+ { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
+ { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
+ { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
+
+ // AVX-512VL 128-bit instructions with 3 source operands.
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
+ { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
+ { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
+ { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
+ { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
+ { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
+ { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
+ { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
+ { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
+ { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
+ { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
+ { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
+
+ // AVX-512 masked instructions
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
- { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
- { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
- { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
- { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
- { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
+ { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
+ { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
+ { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
+ { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
+ { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
- { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
- { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
- { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
+ { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
+ { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
+ { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
+ { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
+ { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
+ { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
+ { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
+ { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
+ { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
- // AVX-512{F,VL} arithmetic instructions 256-bit
- { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
+ { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
+ { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
+ { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
+ { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
+ { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
+ { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
+ { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
+ { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
+ { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
+ { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
+ { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
+ { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
+ { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
+ { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
+ { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
+ { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
+ { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
+ { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
+ { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
+ { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
+ { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
+ { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
+ { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
+ { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
+ { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
+ { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
+ { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
+ { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
+ { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
+ { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
+ { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
+ { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
+ { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
+ { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
+ { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
+ { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
+ { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
+ { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
+ { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
+ { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
+ { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
+ { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
+ { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
+ { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
+ { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
+ { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
+ { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
+ { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
+
+ // AVX-512{F,VL} masked arithmetic instructions 256-bit
{ X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
- { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
- { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
- { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
- { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
- { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
+ { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
+ { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
+ { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
+ { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
{ X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
- { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
- { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
- { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
+ { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
+ { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
+ { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
+ { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
+ { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
{ X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
- // AVX-512{F,VL} arithmetic instructions 128-bit
- { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
+ { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
+ { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
+ { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
+ { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
+ { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
+ { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
+ { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+ { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+ { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+ { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+ { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+ { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+ { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+ { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
+ { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
+ { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
+ { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
+ { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
+ { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
+ { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
+ { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
+ { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
+ { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
+ { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
+ { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
+ { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
+ { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
+ { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
+ { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+ { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+ { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+ { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+ { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+ { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+ { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+ { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+ { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
+ { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
+ { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
+ { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
+ { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
+ { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
+ { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
+ { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
+ { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
+ { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
+ { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
+ { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+ { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
+ { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
+ { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
+ { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
+ { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
+ { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
+
+ // AVX-512{F,VL} masked arithmetic instructions 128-bit
{ X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
- { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
- { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
- { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
- { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
- { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
+ { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
+ { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
+ { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
+ { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
{ X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
- { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
- { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
+ { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
+ { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
{ X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
- { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }
+ { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
+ { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
+ { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
+ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
+ { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
+ { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
+ { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+ { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+ { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+ { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+ { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+ { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+ { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+ { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
+ { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
+ { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
+ { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
+ { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
+ { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
+ { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
+ { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
+ { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
+ { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
+ { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
+ { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+ { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+ { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+ { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+ { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+ { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+ { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+ { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+ { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
+ { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
+ { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
+ { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
+ { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
+ { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
+ { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
+ { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
+ { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
+ { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
+ { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
+ { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+ { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
+ { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
+ { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
+ { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
+ { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
+ { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
+
+ // AVX-512 masked foldable instructions
+ { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
+ { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
+ { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
+ { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
+ { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
+ { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
+ { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
+ { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
+ { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
+ { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
+ { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
+ { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
+ { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+ { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
+ { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
+ { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+
+ // AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
+ { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
+ { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
+ { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
+ { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
+ { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
+ { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
+ { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
+ { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
+ { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
+ { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
+ { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+
+ // AVX-512VL 128-bit masked foldable instructions
+ { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
+ { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
+ { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
+ { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
@@ -2008,47 +2630,348 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// Index 3, folded load
Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
}
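+  // Descriptive note (not part of the upstream commit text): the loop below
+  // walks the FMA3 reg/mem opcode pairs from X86InstrFMA3Info and registers
+  // the non-k-masked groups in the index-3 fold table (TB_INDEX_3 |
+  // TB_FOLDED_LOAD), adding TB_NO_REVERSE for the intrinsic forms; the
+  // k-masked groups are registered with the index-4 table further down.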
+ auto I = X86InstrFMA3Info::rm_begin();
+ auto E = X86InstrFMA3Info::rm_end();
+ for (; I != E; ++I) {
+ if (!I.getGroup()->isKMasked()) {
+ // Intrinsic forms need to pass TB_NO_REVERSE.
+ if (I.getGroup()->isIntrinsic()) {
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE);
+ } else {
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
+ }
+ }
+ }
static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
- // AVX-512 foldable instructions
- { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ // AVX-512 foldable masked instructions
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
- { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
- { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
- { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
- { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
- { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
+ { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
+ { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
+ { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
+ { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
+ { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
- { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
- { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
- { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
+ { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
+ { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
+ { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
+ { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
+ { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
+ { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
+ { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
+ { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
+ { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
- // AVX-512{F,VL} foldable instructions 256-bit
- { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
+ { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
+ { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
+ { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
+ { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
+ { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
+ { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
+ { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
+ { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
+ { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
+ { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
+ { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
+ { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
+ { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
+ { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
+ { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
+ { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
+ { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
+ { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
+ { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
+ { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
+ { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
+ { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
+ { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
+ { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
+ { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
+ { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
+ { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
+ { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
+ { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
+ { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
+ { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
+ { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
+ { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
+ { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
+ { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
+ { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
+ { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
+ { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
+ { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
+ { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
+ { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
+ { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
+ { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
+ { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
+ { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
+ { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
+ { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
+ { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
+ { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
+ { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
+ { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
+ { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
+ { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
+ { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
+ { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
+ { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
+ { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
+ { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
+ { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
+ { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
+
+ // AVX-512{F,VL} foldable masked instructions 256-bit
{ X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
- { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
- { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
- { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
- { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
- { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
+ { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
+ { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
+ { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
+ { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
+ { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
{ X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
- { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
- { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
- { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 },
+ { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 },
+ { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 },
+ { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 },
+ { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
+ { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
{ X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
+ { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
+ { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
+ { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
+ { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
+ { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
+ { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
+ { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+ { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+ { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+ { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+ { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+ { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+ { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+ { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
+ { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
+ { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
+ { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
+ { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
+ { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
+ { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
+ { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
+ { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
+ { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
+ { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
+ { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
+ { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
+ { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
+ { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
+ { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
+ { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
+ { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
+ { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
+ { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
+ { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
+ { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
+ { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
+ { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
+ { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
+ { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
+ { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+ { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+ { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+ { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+ { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+ { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+ { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+ { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+ { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
+ { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
+ { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
+ { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
+ { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
+ { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
+ { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
+ { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
+ { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
+ { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
+ { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
+ { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
+ { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
+ { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+ { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
+ { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
+ { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
+ { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
+ { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
+ { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
+
// AVX-512{F,VL} foldable instructions 128-bit
- { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
{ X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
- { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
- { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
- { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
- { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
- { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
+ { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
+ { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
+ { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
+ { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
+ { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
+ { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
{ X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
- { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
- { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
+ { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
+ { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
{ X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
- { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
+ { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
+ { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
+ { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
+ { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
+ { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
+ { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
+ { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+ { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+ { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+ { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+ { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+ { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+ { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+ { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
+ { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
+ { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
+ { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
+ { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
+ { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
+ { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
+ { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
+ { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
+ { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
+ { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
+ { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
+ { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
+ { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
+ { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
+ { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
+ { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
+ { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
+ { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
+ { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
+ { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
+ { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
+ { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+ { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+ { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+ { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+ { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+ { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+ { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+ { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+ { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
+ { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
+ { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
+ { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
+ { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
+ { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
+ { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
+ { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
+ { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
+ { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
+ { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
+ { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
+ { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
+ { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+ { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
+ { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
+ { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
+ { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
+ { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
+ { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
+
+ // 512-bit three source instructions with zero masking.
+ { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
+ { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
+ { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
+ { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
+ { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
+ { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
+ { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
+ { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
+ { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
+ { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
+ { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
+ { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
+ { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
+ { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
+
+ // 256-bit three source instructions with zero masking.
+ { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
+ { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
+ { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
+ { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
+ { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
+ { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
+ { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
+ { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
+ { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
+ { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
+ { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
+ { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
+ { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
+ { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
+
+ // 128-bit three source instructions with zero masking.
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
+ { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
+ { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
+ { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
+ { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
+ { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
+ { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
+ { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
+ { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
+ { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
+ { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
+ { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
+ { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
+ { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
@@ -2057,21 +2980,35 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// Index 4, folded load
Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
}
+ for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
+ if (I.getGroup()->isKMasked()) {
+ // Intrinsics need to pass TB_NO_REVERSE.
+ if (I.getGroup()->isIntrinsic()) {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
+ } else {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
+ }
+ }
+ }
}
void
X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
MemOp2RegOpTableType &M2RTable,
uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
- if ((Flags & TB_NO_FORWARD) == 0) {
- assert(!R2MTable.count(RegOp) && "Duplicate entry!");
- R2MTable[RegOp] = std::make_pair(MemOp, Flags);
- }
- if ((Flags & TB_NO_REVERSE) == 0) {
- assert(!M2RTable.count(MemOp) &&
- "Duplicated entries in unfolding maps?");
- M2RTable[MemOp] = std::make_pair(RegOp, Flags);
- }
+ if ((Flags & TB_NO_FORWARD) == 0) {
+ assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+ R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+ }
+ if ((Flags & TB_NO_REVERSE) == 0) {
+ assert(!M2RTable.count(MemOp) &&
+ "Duplicated entries in unfolding maps?");
+ M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+ }
}
bool
@@ -2235,9 +3172,13 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::VMOVAPSZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZrm:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
@@ -2305,9 +3246,13 @@ static bool isFrameStoreOpcode(int Opcode) {
case X86::VMOVUPSZmr:
case X86::VMOVUPSZ128mr:
case X86::VMOVUPSZ256mr:
+ case X86::VMOVUPSZ128mr_NOVLX:
+ case X86::VMOVUPSZ256mr_NOVLX:
case X86::VMOVAPSZmr:
case X86::VMOVAPSZ128mr:
case X86::VMOVAPSZ256mr:
+ case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVAPSZ256mr_NOVLX:
case X86::VMOVUPDZmr:
case X86::VMOVUPDZ128mr:
case X86::VMOVUPDZ256mr:
@@ -2409,6 +3354,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
switch (MI.getOpcode()) {
default: break;
case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
@@ -2418,6 +3364,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVSSrm:
@@ -2425,25 +3372,27 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
- case X86::FsVMOVAPSrm:
- case X86::FsVMOVAPDrm:
- case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm:
// AVX-512
+ case X86::VMOVSSZrm:
+ case X86::VMOVSDZrm:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVAPDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVAPSZrm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA32Z256rm:
@@ -2463,15 +3412,20 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU8Zrm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVUPDZrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- MI.isInvariantLoad(AA)) {
+ MI.isDereferenceableInvariantLoad(AA)) {
unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
@@ -2694,24 +3648,8 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
ImplicitOp.setImplicit();
NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
- MachineBasicBlock::LivenessQueryResult LQR =
- MI.getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
-
- switch (LQR) {
- case MachineBasicBlock::LQR_Unknown:
- // We can't give sane liveness flags to the instruction, abandon LEA
- // formation.
- return false;
- case MachineBasicBlock::LQR_Live:
- isKill = MI.killsRegister(SrcReg);
- isUndef = false;
- break;
- default:
- // The physreg itself is dead, so we have to use it as an <undef>.
- isKill = false;
- isUndef = true;
- break;
- }
+ isKill = Src.isKill();
+ isUndef = Src.isUndef();
} else {
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
@@ -3079,7 +4017,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
.addOperand(Dest)
.addOperand(Src),
- MI.getOperand(2).getImm());
+ MI.getOperand(2));
break;
case X86::ADD32ri:
case X86::ADD32ri8:
@@ -3102,7 +4040,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
- NewMI = addOffset(MIB, MI.getOperand(2).getImm());
+ NewMI = addOffset(MIB, MI.getOperand(2));
break;
}
case X86::ADD16ri:
@@ -3116,7 +4054,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest)
.addOperand(Src),
- MI.getOperand(2).getImm());
+ MI.getOperand(2));
break;
}
@@ -3133,156 +4071,236 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return NewMI;
}
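
As a side note on the ADD-to-LEA rewrites above: the LEA form computes the same value but allows a different destination register and does not clobber EFLAGS, which is what makes the three-address form attractive. A minimal standalone sketch of the equivalence (the helper names here are illustrative, not LLVM API):

    #include <cassert>
    #include <cstdint>

    // ADD64ri: two-address, Reg += Imm (and writes EFLAGS).
    static uint64_t add64ri(uint64_t Reg, int32_t Imm) { return Reg + Imm; }

    // LEA64r: three-address, Dst = Base + Index * Scale + Disp (no EFLAGS).
    static uint64_t lea64r(uint64_t Base, uint64_t Index, unsigned Scale,
                           int32_t Disp) {
      return Base + Index * Scale + Disp;
    }

    int main() {
      // "add rax, 8" computes the same value as "lea rcx, [rax + 8]",
      // but the LEA form may write a different destination register.
      assert(add64ri(40, 8) == lea64r(40, /*Index*/ 0, /*Scale*/ 1, /*Disp*/ 8));
    }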
-/// Returns true if the given instruction opcode is FMA3.
-/// Otherwise, returns false.
-/// The second parameter is optional and is used as the second return from
-/// the function. It is set to true if the given instruction has FMA3 opcode
-/// that is used for lowering of scalar FMA intrinsics, and it is set to false
-/// otherwise.
-static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) {
- if (IsIntrinsic)
- *IsIntrinsic = false;
+/// This determines which of the three possible cases of a three source
+/// commute the source indexes correspond to, taking into account any mask
+/// operands. It also prevents commuting a passthru operand. Returns -1 if
+/// the commute isn't possible.
+/// Case 0 - Possible to commute the first and second operands.
+/// Case 1 - Possible to commute the first and third operands.
+/// Case 2 - Possible to commute the second and third operands.
+static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
+  // Put the lowest index in SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
- switch (Opcode) {
- case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
- case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
- case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
- case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
- case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
- case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
- case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
- case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:
-
- case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
- case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
- case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
- case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
- case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
- case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
- case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
- case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:
-
- case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
- case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
- case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
- case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
- case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
- case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
- case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
- case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:
-
- case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
- case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
- case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
- case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
- case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
- case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
- case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
- case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:
-
- case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
- case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
- case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
- case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
- case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
- case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
- case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
- case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
- case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
- case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
- case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
- case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
- case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
- case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
- case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
- case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:
-
- case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
- case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
- case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
- case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
- case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
- case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
- case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
- case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:
-
- case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
- case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
- case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
- case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
- case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
- case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
- case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
- case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
- case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
- case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
- case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
- case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
- case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
- case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
- case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
- case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:
-
- case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
- case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
- case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
- case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
- case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
- case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
- case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
- case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:
-
- case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
- case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
- case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
- case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
- case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
- case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
- case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
- case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
- case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
- case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
- case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
- case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
- case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
- case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
- case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
- case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
- return true;
+ unsigned Op1 = 1, Op2 = 2, Op3 = 3;
+ if (X86II::isKMasked(TSFlags)) {
+ // The k-mask operand cannot be commuted.
+ if (SrcOpIdx1 == 2)
+ return -1;
+
+ // For k-zero-masked operations it is Ok to commute the first vector
+ // operand.
+    // For regular k-masked operations a conservative choice is made: the
+    // first vector operand cannot be commuted, because the elements of that
+    // operand for which the corresponding bit in the k-mask is 0 are copied
+    // unchanged to the result of the instruction.
+ // TODO/FIXME: The commute still may be legal if it is known that the
+ // k-mask operand is set to either all ones or all zeroes.
+ // It is also Ok to commute the 1st operand if all users of MI use only
+ // the elements enabled by the k-mask operand. For example,
+ // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+ // : v1[i];
+ // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+ // // Ok, to commute v1 in FMADD213PSZrk.
+ if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
+ return -1;
+ Op2++;
+ Op3++;
+ }
+
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
+ return 0;
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
+ return 1;
+ if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
+ return 2;
+ return -1;
+}
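
To make the index shifting above concrete, here is a standalone mirror of the case selection (a sketch for illustration only, not the in-tree helper). It assumes the operand layout described in the comments: dst at index 0, first vector source at 1, k-mask at 2, remaining sources following.

    #include <algorithm>
    #include <cstdio>

    static int threeSrcCommuteCase(bool KMasked, bool MergeMasked,
                                   unsigned Idx1, unsigned Idx2) {
      if (Idx1 > Idx2)
        std::swap(Idx1, Idx2);
      unsigned Op1 = 1, Op2 = 2, Op3 = 3;
      if (KMasked) {
        if (Idx1 == 2)
          return -1;                 // the k-mask itself never commutes
        if (MergeMasked && Idx1 == Op1)
          return -1;                 // merge-masked passthru operand is fixed
        ++Op2;                       // vector sources shift right past
        ++Op3;                       // the mask operand
      }
      if (Idx1 == Op1 && Idx2 == Op2) return 0;
      if (Idx1 == Op1 && Idx2 == Op3) return 1;
      if (Idx1 == Op2 && Idx2 == Op3) return 2;
      return -1;
    }

    int main() {
      // Zero-masked op (dst, src1, k, src2, src3): commuting 1 and 4 is case 1.
      std::printf("%d\n", threeSrcCommuteCase(true, false, 1, 4));  // 1
      // Merge-masked op: operand 1 is the passthru and cannot be commuted.
      std::printf("%d\n", threeSrcCommuteCase(true, true, 1, 3));   // -1
    }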
- case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int:
- case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int:
- case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int:
- case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int:
- case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int:
- case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int:
- case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int:
- case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int:
-
- case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int:
- case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int:
- case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int:
- case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int:
- case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int:
- case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int:
- case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int:
- case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int:
-
- case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int:
- case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int:
- case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int:
- case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int:
- case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int:
- case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int:
- case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int:
- case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int:
- if (IsIntrinsic)
- *IsIntrinsic = true;
- return true;
- default:
- return false;
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+ const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const {
+
+ unsigned Opc = MI.getOpcode();
+
+  // Put the lowest index in SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
+
+ // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+ // analysis. The commute optimization is legal only if all users of FMA*_Int
+  // use only the lowest element of the FMA*_Int instruction. Such analysis is
+  // not implemented yet, so just return 0 in that case.
+  // When such analysis becomes available, this will be the right place to
+  // call it.
+ if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
+ return 0;
+
+ // Determine which case this commute is or if it can't be done.
+ int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
+ if (Case < 0)
+ return 0;
+
+ // Define the FMA forms mapping array that helps to map input FMA form
+ // to output FMA form to preserve the operation semantics after
+ // commuting the operands.
+ const unsigned Form132Index = 0;
+ const unsigned Form213Index = 1;
+ const unsigned Form231Index = 2;
+ static const unsigned FormMapping[][3] = {
+ // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+ // FMA132 A, C, b; ==> FMA231 C, A, b;
+ // FMA213 B, A, c; ==> FMA213 A, B, c;
+ // FMA231 C, A, b; ==> FMA132 A, C, b;
+ { Form231Index, Form213Index, Form132Index },
+ // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+ // FMA132 A, c, B; ==> FMA132 B, c, A;
+ // FMA213 B, a, C; ==> FMA231 C, a, B;
+ // FMA231 C, a, B; ==> FMA213 B, a, C;
+ { Form132Index, Form231Index, Form213Index },
+ // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+ // FMA132 a, C, B; ==> FMA213 a, B, C;
+ // FMA213 b, A, C; ==> FMA132 b, C, A;
+ // FMA231 c, A, B; ==> FMA231 c, B, A;
+ { Form213Index, Form132Index, Form231Index }
+ };
+
+ unsigned FMAForms[3];
+ if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
+ FMAForms[0] = FMA3Group.getReg132Opcode();
+ FMAForms[1] = FMA3Group.getReg213Opcode();
+ FMAForms[2] = FMA3Group.getReg231Opcode();
+ } else {
+ FMAForms[0] = FMA3Group.getMem132Opcode();
+ FMAForms[1] = FMA3Group.getMem213Opcode();
+ FMAForms[2] = FMA3Group.getMem231Opcode();
+ }
+ unsigned FormIndex;
+ for (FormIndex = 0; FormIndex < 3; FormIndex++)
+ if (Opc == FMAForms[FormIndex])
+ break;
+
+ // Everything is ready, just adjust the FMA opcode and return it.
+ FormIndex = FormMapping[Case][FormIndex];
+ return FMAForms[FormIndex];
+}
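
A scalar spot-check of one row of FormMapping (hedged illustration; the fma132/fma213 helpers below only model the operand order, they are not LLVM functions): commuting source operands 2 and 3 of a 132-form FMA and switching to the 213 form yields the same value, which is case 2 in the table above.

    #include <cassert>

    static double fma132(double Dst, double Src2, double Src3) {
      return Dst * Src3 + Src2;   // 132 form: dst = dst*src3 + src2
    }
    static double fma213(double Dst, double Src2, double Src3) {
      return Src2 * Dst + Src3;   // 213 form: dst = src2*dst + src3
    }

    int main() {
      double A = 2.0, B = 3.0, C = 5.0;
      // Original:  FMA132 A, C, B   ==>  A*B + C
      // Commuted:  FMA213 A, B, C   ==>  B*A + C, the same value.
      assert(fma132(A, C, B) == fma213(A, B, C));
    }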
+
+static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
+ // Determine which case this commute is or if it can't be done.
+ int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
+ if (Case < 0)
+ return false;
+
+ // For each case we need to swap two pairs of bits in the final immediate.
+ static const uint8_t SwapMasks[3][4] = {
+ { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
+ { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
+ { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+ };
+
+ uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
+ // Clear out the bits we are swapping.
+ uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
+ SwapMasks[Case][2] | SwapMasks[Case][3]);
+ // If the immediate had a bit of the pair set, then set the opposite bit.
+ if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
+ if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
+ if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
+ if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
+ MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
+
+ return true;
+}
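
The SwapMasks table can be derived from first principles: bit i of the immediate is the truth-table entry for the index (src1<<2)|(src2<<1)|src3, so commuting two sources simply permutes the eight table entries. A standalone sketch of that derivation (illustration only, assumed names):

    #include <cassert>
    #include <cstdint>
    #include <utility>

    static uint8_t commuteTernlogImm(uint8_t Imm, int OpA, int OpB) {
      // OpA/OpB name the ternlog sources (1, 2 or 3) being swapped.
      uint8_t New = 0;
      for (int Idx = 0; Idx < 8; ++Idx) {
        int Bit[3] = { (Idx >> 2) & 1, (Idx >> 1) & 1, Idx & 1 }; // src1,src2,src3
        std::swap(Bit[OpA - 1], Bit[OpB - 1]);
        int NewIdx = (Bit[0] << 2) | (Bit[1] << 1) | Bit[2];
        if (Imm & (1 << Idx))
          New |= 1 << NewIdx;
      }
      return New;
    }

    int main() {
      // 0xCA is "src1 ? src2 : src3"; swapping src2/src3 (case 2) must give
      // "src1 ? src3 : src2", which is 0xAC -- the same answer the
      // SwapMasks[2] bit shuffling above produces.
      assert(commuteTernlogImm(0xCA, 2, 3) == 0xAC);
    }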
+
+// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
+// commuted.
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
+#define VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
+ case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
+ case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
+ case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
+ case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
+ case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
+ case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
+ case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
+ case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
+ case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
+ case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
+ case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
+
+#define VPERM_CASES_BROADCAST(Suffix) \
+ VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
+ case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
+ case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
+ case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
+ case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
+ case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
+
+ switch (Opcode) {
+ default: return false;
+ VPERM_CASES(B)
+ VPERM_CASES_BROADCAST(D)
+ VPERM_CASES_BROADCAST(PD)
+ VPERM_CASES_BROADCAST(PS)
+ VPERM_CASES_BROADCAST(Q)
+ VPERM_CASES(W)
+ return true;
}
- llvm_unreachable("Opcode not handled by the switch");
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
+// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
+// from the I opcode to the T opcode and vice versa.
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
+#define VPERM_CASES(Orig, New) \
+ case X86::Orig##128rr: return X86::New##128rr; \
+ case X86::Orig##128rrkz: return X86::New##128rrkz; \
+ case X86::Orig##128rm: return X86::New##128rm; \
+ case X86::Orig##128rmkz: return X86::New##128rmkz; \
+ case X86::Orig##256rr: return X86::New##256rr; \
+ case X86::Orig##256rrkz: return X86::New##256rrkz; \
+ case X86::Orig##256rm: return X86::New##256rm; \
+ case X86::Orig##256rmkz: return X86::New##256rmkz; \
+ case X86::Orig##rr: return X86::New##rr; \
+ case X86::Orig##rrkz: return X86::New##rrkz; \
+ case X86::Orig##rm: return X86::New##rm; \
+ case X86::Orig##rmkz: return X86::New##rmkz;
+
+#define VPERM_CASES_BROADCAST(Orig, New) \
+ VPERM_CASES(Orig, New) \
+ case X86::Orig##128rmb: return X86::New##128rmb; \
+ case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
+ case X86::Orig##256rmb: return X86::New##256rmb; \
+ case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
+ case X86::Orig##rmb: return X86::New##rmb; \
+ case X86::Orig##rmbkz: return X86::New##rmbkz;
+
+ switch (Opcode) {
+ VPERM_CASES(VPERMI2B, VPERMT2B)
+ VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
+ VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
+ VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
+ VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
+ VPERM_CASES(VPERMI2W, VPERMT2W)
+ VPERM_CASES(VPERMT2B, VPERMI2B)
+ VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
+ VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
+ VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
+ VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
+ VPERM_CASES(VPERMT2W, VPERMI2W)
+ }
+
+ llvm_unreachable("Unreachable!");
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
}
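
Why switching I<->T is the whole commute: VPERMI2 keeps the indexes in the tied first operand while VPERMT2 keeps them in the second operand, so swapping those two operands while flipping the opcode leaves the result unchanged. A scalar model as an illustration (assumed simplified semantics, four lanes per table):

    #include <array>
    #include <cassert>
    #include <cstdint>

    using V4 = std::array<uint32_t, 4>;

    // Model: the low 3 index bits pick one of the 8 lanes across both tables.
    static V4 vpermi2(V4 Idx, V4 TblA, V4 TblB) {
      V4 R{};
      for (int I = 0; I < 4; ++I) {
        unsigned S = Idx[I] & 7;
        R[I] = S < 4 ? TblA[S] : TblB[S - 4];
      }
      return R;
    }
    // VPERMT2 has the same behaviour with the index vector in operand 2.
    static V4 vpermt2(V4 TblA, V4 Idx, V4 TblB) { return vpermi2(Idx, TblA, TblB); }

    int main() {
      V4 Idx = {1, 4, 7, 2}, A = {10, 11, 12, 13}, B = {20, 21, 22, 23};
      // Commuting operands 1 and 2 of the I form gives the T form (and back).
      assert(vpermi2(Idx, A, B) == vpermt2(A, Idx, B));
    }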
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -3352,6 +4370,39 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::MOVSDrr:
+ case X86::MOVSSrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr:{
+ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
+ if (!Subtarget.hasSSE41())
+ return nullptr;
+
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
+ case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
+ case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+ case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ }
+
+ // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy
+ // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS.
+ auto &MRI = MI.getParent()->getParent()->getRegInfo();
+ auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg());
+ unsigned VR128 = MRI.createVirtualRegister(VR128RC);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
+ VR128)
+ .addReg(MI.getOperand(2).getReg());
+
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.getOperand(2).setReg(VR128);
+ WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
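
For the blend masks chosen above, a scalar model may help (illustration only): MOVSD keeps the destination's high lane and takes the source's low lane, so with the operands swapped the blend must take lane 0 from its first operand and lane 1 from its second, i.e. immediate 0b10; the 0x0E mask is the same idea applied to the three upper MOVSS lanes.

    #include <array>
    #include <cassert>

    using V2 = std::array<double, 2>;

    static V2 movsd(V2 Dst, V2 Src) { return { Src[0], Dst[1] }; }
    static V2 blendpd(V2 A, V2 B, unsigned Imm) {
      return { (Imm & 1) ? B[0] : A[0], (Imm & 2) ? B[1] : A[1] };
    }

    int main() {
      V2 X = {1.0, 2.0}, Y = {3.0, 4.0};
      // MOVSD X, Y  ==  BLENDPD Y, X, 0x02  (operands commuted).
      assert(movsd(X, Y) == blendpd(Y, X, 0x02));
    }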
case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:{
// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
@@ -3364,12 +4415,24 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::CMPSDrr:
+ case X86::CMPSSrr:
case X86::CMPPDrri:
case X86::CMPPSrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr:
case X86::VCMPPDrri:
case X86::VCMPPSrri:
case X86::VCMPPDYrri:
- case X86::VCMPPSYrri: {
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri: {
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
@@ -3383,6 +4446,37 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return nullptr;
}
}
+ case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
+ case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
+ case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
+ case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPWZrri: case X86::VPCMPUWZrri: {
+ // Flip comparison mode immediate (if necessary).
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x01: Imm = 0x06; break; // LT -> NLE
+ case 0x02: Imm = 0x05; break; // LE -> NLT
+ case 0x05: Imm = 0x02; break; // NLT -> LE
+ case 0x06: Imm = 0x01; break; // NLE -> LT
+ case 0x00: // EQ
+ case 0x03: // FALSE
+ case 0x04: // NE
+ case 0x07: // TRUE
+ break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
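
A quick exhaustive check of the predicate flip (standalone sketch; the encodings are the ones listed in the switch above): swapping the compared operands turns LT into NLE and LE into NLT, while EQ/NE/FALSE/TRUE are symmetric and stay put.

    #include <cassert>

    static bool vpcmp(int A, int B, unsigned Pred) {
      switch (Pred & 0x7) {
      case 0: return A == B;     // EQ
      case 1: return A <  B;     // LT
      case 2: return A <= B;     // LE
      case 3: return false;      // FALSE
      case 4: return A != B;     // NE
      case 5: return !(A < B);   // NLT
      case 6: return !(A <= B);  // NLE
      default: return true;      // TRUE
      }
    }

    int main() {
      for (int A = -2; A <= 2; ++A)
        for (int B = -2; B <= 2; ++B) {
          assert(vpcmp(A, B, 1) == vpcmp(B, A, 6)); // LT <-> NLE
          assert(vpcmp(A, B, 2) == vpcmp(B, A, 5)); // LE <-> NLT
        }
    }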
case X86::VPCOMBri: case X86::VPCOMUBri:
case X86::VPCOMDri: case X86::VPCOMUDri:
case X86::VPCOMQri: case X86::VPCOMUQri:
@@ -3390,6 +4484,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
case 0x00: Imm = 0x02; break; // LT -> GT
case 0x01: Imm = 0x03; break; // LE -> GE
case 0x02: Imm = 0x00; break; // GT -> LT
@@ -3398,7 +4493,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case 0x05: // NE
case 0x06: // FALSE
case 0x07: // TRUE
- default:
break;
}
auto &WorkingMI = cloneIfNew(MI);
@@ -3417,6 +4511,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::MOVHLPSrr:
+ case X86::UNPCKHPDrr: {
+ if (!Subtarget.hasSSE2())
+ return nullptr;
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+ case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
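
The opcode switch above works because the two instructions compute each other's result when their sources are swapped; a scalar model as a sanity check (illustration only):

    #include <array>
    #include <cassert>

    using V2 = std::array<double, 2>;

    // MOVHLPS:  dst = { src.high, dst.high }
    static V2 movhlps(V2 Dst, V2 Src)  { return { Src[1], Dst[1] }; }
    // UNPCKHPD: dst = { dst.high, src.high }
    static V2 unpckhpd(V2 Dst, V2 Src) { return { Dst[1], Src[1] }; }

    int main() {
      V2 X = {1.0, 2.0}, Y = {3.0, 4.0};
      assert(movhlps(X, Y) == unpckhpd(Y, X));
    }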
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
@@ -3490,9 +4600,44 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
- default:
- if (isFMA3(MI.getOpcode())) {
- unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+ auto &WorkingMI = cloneIfNew(MI);
+ if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
+ return nullptr;
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ default: {
+ if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
+ unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+
+ const X86InstrFMA3Group *FMA3Group =
+ X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ if (FMA3Group) {
+ unsigned Opc =
+ getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
if (Opc == 0)
return nullptr;
auto &WorkingMI = cloneIfNew(MI);
@@ -3503,22 +4648,54 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
+ }
}
-bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const {
+bool X86InstrInfo::findFMA3CommutedOpIndices(
+ const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const {
- unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+ if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2))
+ return false;
+
+ // Check if we can adjust the opcode to preserve the semantics when
+  // commuting the register operands.
+ return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
+}
+
+bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
+ unsigned FirstCommutableVecOp = 1;
+ unsigned LastCommutableVecOp = 3;
+ unsigned KMaskOp = 0;
+ if (X86II::isKMasked(TSFlags)) {
+ // The k-mask operand has index = 2 for masked and zero-masked operations.
+ KMaskOp = 2;
+
+ // The operand with index = 1 is used as a source for those elements for
+ // which the corresponding bit in the k-mask is set to 0.
+ if (X86II::isKMergeMasked(TSFlags))
+ FirstCommutableVecOp = 3;
+
+ LastCommutableVecOp++;
+ }
+
+ if (isMem(MI, LastCommutableVecOp))
+ LastCommutableVecOp--;
// Only the first RegOpsNum operands are commutable.
// Also, the value 'CommuteAnyOperandIndex' is valid here as it means
// that the operand is not specified/fixed.
if (SrcOpIdx1 != CommuteAnyOperandIndex &&
- (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
+ (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
+ SrcOpIdx1 == KMaskOp))
return false;
if (SrcOpIdx2 != CommuteAnyOperandIndex &&
- (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
+ (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
+ SrcOpIdx2 == KMaskOp))
return false;
// Look for two different register operands assumed to be commutable
@@ -3533,7 +4710,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
if (SrcOpIdx1 == SrcOpIdx2)
// Both of operands are not fixed. By default set one of commutable
// operands to the last register operand of the instruction.
- CommutableOpIdx2 = RegOpsNum;
+ CommutableOpIdx2 = LastCommutableVecOp;
else if (SrcOpIdx2 == CommuteAnyOperandIndex)
// Only one of operands is not fixed.
CommutableOpIdx2 = SrcOpIdx1;
@@ -3541,7 +4718,12 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
- for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
+ for (CommutableOpIdx1 = LastCommutableVecOp;
+ CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
+ // Just ignore and skip the k-mask operand.
+ if (CommutableOpIdx1 == KMaskOp)
+ continue;
+
// The commuted operands must have different registers.
// Otherwise, the commute transformation does not change anything and
// is useless then.
@@ -3550,7 +4732,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
}
// No appropriate commutable operands were found.
- if (CommutableOpIdx1 == 0)
+ if (CommutableOpIdx1 < FirstCommutableVecOp)
return false;
// Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
@@ -3560,208 +4742,34 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
return false;
}
- // Check if we can adjust the opcode to preserve the semantics when
- // commute the register operands.
- return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
-}
-
-unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
- MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2) const {
- unsigned Opc = MI.getOpcode();
-
- // Define the array that holds FMA opcodes in groups
- // of 3 opcodes(132, 213, 231) in each group.
- static const uint16_t RegularOpcodeGroups[][3] = {
- { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r },
- { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r },
- { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r },
- { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r },
- { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY },
- { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY },
- { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m },
- { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m },
- { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m },
- { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m },
- { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY },
- { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY },
-
- { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r },
- { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r },
- { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r },
- { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r },
- { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY },
- { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY },
- { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m },
- { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m },
- { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m },
- { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m },
- { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY },
- { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY },
-
- { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r },
- { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r },
- { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r },
- { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r },
- { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY },
- { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY },
- { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m },
- { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m },
- { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m },
- { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m },
- { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY },
- { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY },
-
- { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r },
- { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r },
- { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r },
- { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r },
- { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY },
- { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY },
- { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m },
- { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m },
- { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m },
- { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m },
- { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY },
- { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY },
-
- { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r },
- { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r },
- { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY },
- { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY },
- { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m },
- { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m },
- { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY },
- { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY },
-
- { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r },
- { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r },
- { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY },
- { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY },
- { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m },
- { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m },
- { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY },
- { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY }
- };
-
- // Define the array that holds FMA*_Int opcodes in groups
- // of 3 opcodes(132, 213, 231) in each group.
- static const uint16_t IntrinOpcodeGroups[][3] = {
- { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int },
- { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int },
- { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int },
- { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int },
-
- { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int },
- { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int },
- { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int },
- { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int },
-
- { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int },
- { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int },
- { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int },
- { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int },
-
- { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int },
- { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int },
- { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int },
- { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int },
- };
-
- const unsigned Form132Index = 0;
- const unsigned Form213Index = 1;
- const unsigned Form231Index = 2;
- const unsigned FormsNum = 3;
-
- bool IsIntrinOpcode;
- isFMA3(Opc, &IsIntrinOpcode);
-
- size_t GroupsNum;
- const uint16_t (*OpcodeGroups)[3];
- if (IsIntrinOpcode) {
- GroupsNum = array_lengthof(IntrinOpcodeGroups);
- OpcodeGroups = IntrinOpcodeGroups;
- } else {
- GroupsNum = array_lengthof(RegularOpcodeGroups);
- OpcodeGroups = RegularOpcodeGroups;
- }
-
- const uint16_t *FoundOpcodesGroup = nullptr;
- size_t FormIndex;
-
- // Look for the input opcode in the corresponding opcodes table.
- for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
- ++GroupIndex) {
- for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
- if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
- FoundOpcodesGroup = OpcodeGroups[GroupIndex];
- break;
- }
- }
- }
-
- // The input opcode does not match with any of the opcodes from the tables.
- // The unsupported FMA opcode must be added to one of the two opcode groups
- // defined above.
- assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
-
- // Put the lowest index to SrcOpIdx1 to simplify the checks below.
- if (SrcOpIdx1 > SrcOpIdx2)
- std::swap(SrcOpIdx1, SrcOpIdx2);
-
- // TODO: Commuting the 1st operand of FMA*_Int requires some additional
- // analysis. The commute optimization is legal only if all users of FMA*_Int
- // use only the lowest element of the FMA*_Int instruction. Such analysis are
- // not implemented yet. So, just return 0 in that case.
- // When such analysis are available this place will be the right place for
- // calling it.
- if (IsIntrinOpcode && SrcOpIdx1 == 1)
- return 0;
-
- unsigned Case;
- if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
- Case = 0;
- else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
- Case = 1;
- else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
- Case = 2;
- else
- return 0;
-
- // Define the FMA forms mapping array that helps to map input FMA form
- // to output FMA form to preserve the operation semantics after
- // commuting the operands.
- static const unsigned FormMapping[][3] = {
- // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
- // FMA132 A, C, b; ==> FMA231 C, A, b;
- // FMA213 B, A, c; ==> FMA213 A, B, c;
- // FMA231 C, A, b; ==> FMA132 A, C, b;
- { Form231Index, Form213Index, Form132Index },
- // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
- // FMA132 A, c, B; ==> FMA132 B, c, A;
- // FMA213 B, a, C; ==> FMA231 C, a, B;
- // FMA231 C, a, B; ==> FMA213 B, a, C;
- { Form132Index, Form231Index, Form213Index },
- // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
- // FMA132 a, C, B; ==> FMA213 a, B, C;
- // FMA213 b, A, C; ==> FMA132 b, C, A;
- // FMA231 c, A, B; ==> FMA231 c, B, A;
- { Form213Index, Form132Index, Form231Index }
- };
-
- // Everything is ready, just adjust the FMA opcode and return it.
- FormIndex = FormMapping[Case][FormIndex];
- return FoundOpcodesGroup[FormIndex];
+ return true;
}
bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
+ const MCInstrDesc &Desc = MI.getDesc();
+ if (!Desc.isCommutable())
+ return false;
+
switch (MI.getOpcode()) {
+ case X86::CMPSDrr:
+ case X86::CMPSSrr:
case X86::CMPPDrri:
case X86::CMPPSrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr:
case X86::VCMPPDrri:
case X86::VCMPPSrri:
case X86::VCMPPDYrri:
- case X86::VCMPPSYrri: {
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri: {
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
@@ -3776,9 +4784,73 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
}
return false;
}
+ case X86::MOVSDrr:
+ case X86::MOVSSrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (Subtarget.hasSSE41())
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
+ }
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
default:
- if (isFMA3(MI.getOpcode()))
- return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ const X86InstrFMA3Group *FMA3Group =
+ X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ if (FMA3Group)
+ return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
+
+    // Handle masked instructions since we need to skip over the mask input
+ // and the preserved input.
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // First assume that the first input is the mask operand and skip past it.
+ unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
+ unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
+ // Check if the first input is tied. If there isn't one then we only
+ // need to skip the mask operand which we did above.
+ if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
+ MCOI::TIED_TO) != -1)) {
+        // If this is a zero-masking instruction with a tied operand, we need to
+ // move the first index back to the first input since this must
+ // be a 3 input instruction and we want the first two non-mask inputs.
+ // Otherwise this is a 2 input instruction with a preserved input and
+ // mask, so we need to move the indices to skip one more input.
+ if (Desc.TSFlags & X86II::EVEX_Z)
+ --CommutableOpIdx1;
+ else {
+ ++CommutableOpIdx1;
+ ++CommutableOpIdx2;
+ }
+ }
+
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() ||
+ !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+ }
+
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}
return false;
@@ -4296,7 +5368,10 @@ bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
return true;
}
-unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled");
+
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
@@ -4316,15 +5391,17 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return Count;
}
-unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL) const {
+ const DebugLoc &DL,
+ int *BytesAdded) const {
// Shouldn't be a fall through.
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
"X86 branch conditions have one component!");
+ assert(!BytesAdded && "code size not handled");
if (Cond.empty()) {
// Unconditional branch?
@@ -4430,16 +5507,63 @@ static bool isHReg(unsigned Reg) {
}
// Try and copy between VR128/VR64 and GR64 registers.
-static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
const X86Subtarget &Subtarget) {
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+
+ // SrcReg(MaskReg) -> DestReg(GR64)
+ // SrcReg(MaskReg) -> DestReg(GR32)
+ // SrcReg(MaskReg) -> DestReg(GR16)
+ // SrcReg(MaskReg) -> DestReg(GR8)
+
+  // All KMASK RegClasses hold the same k registers, so any of them can be tested.
+ if (X86::VK16RegClass.contains(SrcReg)) {
+ if (X86::GR64RegClass.contains(DestReg)) {
+ assert(Subtarget.hasBWI());
+ return X86::KMOVQrk;
+ }
+ if (X86::GR32RegClass.contains(DestReg))
+ return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
+ if (X86::GR16RegClass.contains(DestReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
+ return X86::KMOVWrk;
+ }
+ if (X86::GR8RegClass.contains(DestReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
+ return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
+ }
+ }
+
+ // SrcReg(GR64) -> DestReg(MaskReg)
+ // SrcReg(GR32) -> DestReg(MaskReg)
+ // SrcReg(GR16) -> DestReg(MaskReg)
+ // SrcReg(GR8) -> DestReg(MaskReg)
+
+  // All KMASK RegClasses hold the same k registers, so any of them can be tested.
+ if (X86::VK16RegClass.contains(DestReg)) {
+ if (X86::GR64RegClass.contains(SrcReg)) {
+ assert(Subtarget.hasBWI());
+ return X86::KMOVQkr;
+ }
+ if (X86::GR32RegClass.contains(SrcReg))
+ return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
+ if (X86::GR16RegClass.contains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
+ return X86::KMOVWkr;
+ }
+ if (X86::GR8RegClass.contains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
+ return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr;
+ }
+ }
+
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
// SrcReg(GR64) -> DestReg(VR128)
// SrcReg(GR64) -> DestReg(VR64)
- bool HasAVX = Subtarget.hasAVX();
- bool HasAVX512 = Subtarget.hasAVX512();
if (X86::GR64RegClass.contains(DestReg)) {
if (X86::VR128XRegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
@@ -4479,96 +5603,13 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
return 0;
}
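
A compact sketch of the width selection in the GR -> mask direction above (assumed standalone helper, not the in-tree code): GR8/GR16 sources are first widened to their GR32 super-register because the narrow KMOV forms only accept 32-bit GPRs, and the 64-bit and byte/doubleword forms additionally require BWI/DQI.

    #include <cstdio>

    static const char *kmovForGprToMask(unsigned GprBits, bool HasBWI, bool HasDQI) {
      switch (GprBits) {
      case 64: return HasBWI ? "KMOVQkr" : "<needs BWI>";
      case 32: return HasBWI ? "KMOVDkr" : "KMOVWkr";
      case 16: return "KMOVWkr";                       // via the GR32 super-reg
      case 8:  return HasDQI ? "KMOVBkr" : "KMOVWkr";  // via the GR32 super-reg
      default: return "<invalid>";
      }
    }

    int main() {
      std::printf("%s\n", kmovForGprToMask(8, /*BWI*/ false, /*DQI*/ true)); // KMOVBkr
    }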
-static bool isMaskRegClass(const TargetRegisterClass *RC) {
- // All KMASK RegClasses hold the same k registers, can be tested against anyone.
- return X86::VK16RegClass.hasSubClassEq(RC);
-}
-
-static bool MaskRegClassContains(unsigned Reg) {
- // All KMASK RegClasses hold the same k registers, can be tested against anyone.
- return X86::VK16RegClass.contains(Reg);
-}
-
-static bool GRRegClassContains(unsigned Reg) {
- return X86::GR64RegClass.contains(Reg) ||
- X86::GR32RegClass.contains(Reg) ||
- X86::GR16RegClass.contains(Reg) ||
- X86::GR8RegClass.contains(Reg);
-}
-static
-unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) {
- if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return X86::KMOVBrk;
- }
- if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return X86::KMOVBkr;
- }
- return 0;
-}
-
-static
-unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) {
- if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg))
- return X86::KMOVQkk;
- if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg))
- return X86::KMOVDrk;
- if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg))
- return X86::KMOVQrk;
- if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg))
- return X86::KMOVDkr;
- if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg))
- return X86::KMOVQkr;
- return 0;
-}
-
-static
-unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg,
- const X86Subtarget &Subtarget)
-{
- if (Subtarget.hasDQI())
- if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg))
- return Opc;
- if (Subtarget.hasBWI())
- if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
- return Opc;
- if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
- if (Subtarget.hasVLX())
- return X86::VMOVAPSZ128rr;
- DestReg = get512BitSuperRegister(DestReg);
- SrcReg = get512BitSuperRegister(SrcReg);
- return X86::VMOVAPSZrr;
- }
- if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
- if (Subtarget.hasVLX())
- return X86::VMOVAPSZ256rr;
- DestReg = get512BitSuperRegister(DestReg);
- SrcReg = get512BitSuperRegister(SrcReg);
- return X86::VMOVAPSZrr;
- }
- if (X86::VR512RegClass.contains(DestReg, SrcReg))
- return X86::VMOVAPSZrr;
- if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
- return X86::KMOVWkk;
- if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return X86::KMOVWkr;
- }
- if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return X86::KMOVWrk;
- }
- return 0;
-}
-
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = Subtarget.hasAVX();
- bool HasAVX512 = Subtarget.hasAVX512();
+ bool HasVLX = Subtarget.hasVLX();
unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
@@ -4590,12 +5631,41 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MMX_MOVQ64rr;
- else if (HasAVX512)
- Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget);
- else if (X86::VR128RegClass.contains(DestReg, SrcReg))
- Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
- else if (X86::VR256RegClass.contains(DestReg, SrcReg))
- Opc = X86::VMOVAPSYrr;
+ else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ128rr;
+ else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+ else {
+      // If this is an extended register and we don't have VLX we need to use a
+ // 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ256rr;
+ else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSYrr;
+ else {
+      // If this is an extended register and we don't have VLX we need to use a
+ // 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSZrr;
+  // All KMASK RegClasses hold the same k registers, so any one can be tested.
+ else if (X86::VK16RegClass.contains(DestReg, SrcReg))
+ Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
if (!Opc)
Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
@@ -4708,37 +5778,15 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Cannot emit physreg copy instruction");
}
-static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC,
- bool load) {
- switch (RC->getSize()) {
- default:
- llvm_unreachable("Unknown spill size");
- case 2:
- return load ? X86::KMOVWkm : X86::KMOVWmk;
- case 4:
- return load ? X86::KMOVDkm : X86::KMOVDmk;
- case 8:
- return load ? X86::KMOVQkm : X86::KMOVQmk;
- }
-}
-
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
const X86Subtarget &STI,
bool load) {
- if (STI.hasAVX512()) {
- if (isMaskRegClass(RC))
- return getLoadStoreMaskRegOpcode(RC, load);
- if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
- return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
- if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
- return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
- if (X86::VR512RegClass.hasSubClassEq(RC))
- return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
- }
-
bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
switch (RC->getSize()) {
default:
llvm_unreachable("Unknown spill size");
@@ -4751,69 +5799,85 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
return load ? X86::MOV8rm : X86::MOV8mr;
case 2:
+ if (X86::VK16RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVWkm : X86::KMOVWmk;
assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
return load ? X86::MOV16rm : X86::MOV16mr;
case 4:
if (X86::GR32RegClass.hasSubClassEq(RC))
return load ? X86::MOV32rm : X86::MOV32mr;
- if (X86::FR32RegClass.hasSubClassEq(RC))
+ if (X86::FR32XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
- (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+ (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+ if (X86::VK32RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVDkm : X86::KMOVDmk;
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
return load ? X86::MOV64rm : X86::MOV64mr;
- if (X86::FR64RegClass.hasSubClassEq(RC))
+ if (X86::FR64XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
- (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+ (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
if (X86::VR64RegClass.hasSubClassEq(RC))
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+ if (X86::VK64RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVQkm : X86::KMOVQmk;
llvm_unreachable("Unknown 8-byte regclass");
case 10:
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
return load ? X86::LD_Fp80m : X86::ST_FpP80m;
case 16: {
- assert((X86::VR128RegClass.hasSubClassEq(RC) ||
- X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
+ assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
// If stack is realigned we can use aligned stores.
- if (X86::VR128RegClass.hasSubClassEq(RC)) {
- if (isStackAligned)
- return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm)
- : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
- else
- return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm)
- : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
- }
- assert(STI.hasVLX() && "Using extended register requires VLX");
if (isStackAligned)
- return load ? X86::VMOVAPSZ128rm : X86::VMOVAPSZ128mr;
+ return load ?
+ (HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVAPSrm :
+ X86::MOVAPSrm):
+ (HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVAPSmr :
+ X86::MOVAPSmr);
else
- return load ? X86::VMOVUPSZ128rm : X86::VMOVUPSZ128mr;
+ return load ?
+ (HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVUPSrm :
+ X86::MOVUPSrm):
+ (HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVUPSmr :
+ X86::MOVUPSmr);
}
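// Editorial aside, not part of the diff: the selection order in the 16-byte
// case above (the 32-byte case below follows the same pattern) is
// VLX > AVX512F > AVX > SSE. The *_NOVLX opcodes are pseudos: an AVX512F-only
// target has no 128/256-bit EVEX moves, yet it still has to spill
// XMM16-31/YMM16-31, so the pseudo defers the choice between a plain VEX move
// (XMM0-15) and a widened 512-bit access (XMM16-31) to expandPostRAPseudo
// further down. Worked example:
//   getStoreRegOpcode(X86::XMM16, &X86::VR128XRegClass,
//                     /*isStackAligned=*/true, STI)
//     -> X86::VMOVAPSZ128mr_NOVLX   when STI has AVX512F but not VLX.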
case 32:
- assert((X86::VR256RegClass.hasSubClassEq(RC) ||
- X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
+ assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
- if (X86::VR256RegClass.hasSubClassEq(RC)) {
- if (isStackAligned)
- return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
- else
- return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
- }
- assert(STI.hasVLX() && "Using extended register requires VLX");
if (isStackAligned)
- return load ? X86::VMOVAPSZ256rm : X86::VMOVAPSZ256mr;
+ return load ?
+ (HasVLX ? X86::VMOVAPSZ256rm :
+ HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
+ X86::VMOVAPSYrm) :
+ (HasVLX ? X86::VMOVAPSZ256mr :
+ HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
+ X86::VMOVAPSYmr);
else
- return load ? X86::VMOVUPSZ256rm : X86::VMOVUPSZ256mr;
+ return load ?
+ (HasVLX ? X86::VMOVUPSZ256rm :
+ HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
+ X86::VMOVUPSYrm) :
+ (HasVLX ? X86::VMOVUPSZ256mr :
+ HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
+ X86::VMOVUPSYmr);
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
- assert(STI.hasVLX() && "Using 512-bit register requires AVX512");
+ assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
if (isStackAligned)
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
@@ -4851,8 +5915,7 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
Offset = DispMO.getImm();
- return MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() ==
- X86::NoRegister;
+ return true;
}
static unsigned getStoreRegOpcode(unsigned SrcReg,
@@ -4876,7 +5939,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
- assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
+ assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned =
@@ -4954,6 +6017,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP8ri:
+ if (!MI.getOperand(1).isImm())
+ return false;
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
CmpMask = ~0;
@@ -4985,6 +6050,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
+ if (!MI.getOperand(2).isImm())
+ return false;
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
@@ -5263,9 +6330,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// If the definition is in this basic block, RE points to the definition;
// otherwise, RE is the rend of the basic block.
MachineBasicBlock::reverse_iterator
- RI = MachineBasicBlock::reverse_iterator(I),
+ RI = ++I.getReverse(),
RE = CmpInstr.getParent() == MI->getParent()
- ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */
+ ? Def.getReverse() /* points to MI */
: CmpInstr.getParent()->rend();
MachineInstr *Movr0Inst = nullptr;
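// Editorial aside, not part of the diff: the iterator rewrite above follows
// from the difference between the two conversions. A std::reverse_iterator
// built from a forward iterator dereferences to the element *before* it:
//   int V[3] = {1, 2, 3};
//   int *It = V + 1;                       // points at 2
//   std::reverse_iterator<int *> RIt(It);  // *RIt == 1
// getReverse() instead yields a reverse iterator that dereferences to the
// *same* instruction, which is why reverse_iterator(++Def) becomes
// Def.getReverse() and reverse_iterator(I) becomes ++I.getReverse().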
for (; RI != RE; ++RI) {
@@ -5411,9 +6478,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (Movr0Inst) {
// Look backwards until we find a def that doesn't use the current EFLAGS.
Def = Sub;
- MachineBasicBlock::reverse_iterator
- InsertI = MachineBasicBlock::reverse_iterator(++Def),
- InsertE = Sub->getParent()->rend();
+ MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
+ InsertE = Sub->getParent()->rend();
for (; InsertI != InsertE; ++InsertI) {
MachineInstr *Instr = &*InsertI;
if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
@@ -5455,14 +6521,6 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
- if (FoldAsLoadDefReg == 0)
- return nullptr;
- // To be conservative, if there exists another load, clear the load candidate.
- if (MI.mayLoad()) {
- FoldAsLoadDefReg = 0;
- return nullptr;
- }
-
// Check whether we can move DefMI here.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
@@ -5471,27 +6529,24 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
return nullptr;
// Collect information about virtual register operands of MI.
- unsigned SrcOperandId = 0;
- bool FoundSrcOperand = false;
- for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+ SmallVector<unsigned, 1> SrcOperandIds;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
- // Do not fold if we have a subreg use or a def or multiple uses.
- if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+ // Do not fold if we have a subreg use or a def.
+ if (MO.getSubReg() || MO.isDef())
return nullptr;
-
- SrcOperandId = i;
- FoundSrcOperand = true;
+ SrcOperandIds.push_back(i);
}
- if (!FoundSrcOperand)
+ if (SrcOperandIds.empty())
return nullptr;
// Check whether we can fold the def into SrcOperandId.
- if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) {
+ if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
FoldAsLoadDefReg = 0;
return FoldMI;
}
@@ -5553,7 +6608,9 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
return true;
}
-bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
+static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
+ const TargetInstrInfo &TII,
+ const X86Subtarget &Subtarget) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
int64_t Imm = MIB->getOperand(1).getImm();
@@ -5570,23 +6627,23 @@ bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
X86MachineFunctionInfo *X86FI =
MBB.getParent()->getInfo<X86MachineFunctionInfo>();
if (X86FI->getUsesRedZone()) {
- MIB->setDesc(get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri
- : X86::MOV64ri));
+ MIB->setDesc(TII.get(MIB->getOpcode() ==
+ X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
return true;
}
// 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
// widen the register if necessary.
StackAdjustment = 8;
- BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
- MIB->setDesc(get(X86::POP64r));
+ BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
+ MIB->setDesc(TII.get(X86::POP64r));
MIB->getOperand(0)
.setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
} else {
assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
StackAdjustment = 4;
- BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
- MIB->setDesc(get(X86::POP32r));
+ BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
+ MIB->setDesc(TII.get(X86::POP32r));
}
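// Editorial aside, not part of the diff: the code-size arithmetic behind this
// expansion. PUSH imm8 (6A ib) is 2 bytes and POP r32/r64 (58+rd) is 1 byte
// (2 with a REX prefix), so the pair costs 3-4 bytes, versus 5 bytes for
// MOV r32, imm32 (B8+rd id) and 7 bytes for the sign-extended MOV r64, imm32
// (REX.W C7 /0 id). The red-zone bailout earlier in this function exists
// because the PUSH writes to the memory just below RSP, i.e. into the red
// zone itself.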
// Build CFI if necessary.
@@ -5616,7 +6673,9 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
unsigned Reg = MIB->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
- auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ auto Flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
@@ -5629,6 +6688,53 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
}
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If it uses an extended register we need to use an instruction
+// that loads the lower 128/256 bits, but is available with only AVX512F.
+static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &LoadDesc,
+ const MCInstrDesc &BroadcastDesc,
+ unsigned SubIdx) {
+ unsigned DestReg = MIB->getOperand(0).getReg();
+ // Check if DestReg is XMM16-31 or YMM16-31.
+ if (TRI->getEncodingValue(DestReg) < 16) {
+ // We can use a normal VEX encoded load.
+ MIB->setDesc(LoadDesc);
+ } else {
+ // Use a 128/256-bit VBROADCAST instruction.
+ MIB->setDesc(BroadcastDesc);
+ // Change the destination to a 512-bit register.
+ DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(DestReg);
+ }
+ return true;
+}
+
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If it uses an extended register we need to use an instruction
+// that stores the lower 128/256 bits, but is available with only AVX512F.
+static bool expandNOVLXStore(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &StoreDesc,
+ const MCInstrDesc &ExtractDesc,
+ unsigned SubIdx) {
+ unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+  // Check if SrcReg is XMM16-31 or YMM16-31.
+ if (TRI->getEncodingValue(SrcReg) < 16) {
+ // We can use a normal VEX encoded store.
+ MIB->setDesc(StoreDesc);
+ } else {
+ // Use a VEXTRACTF instruction.
+ MIB->setDesc(ExtractDesc);
+    // Change the source to a 512-bit register.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
+ MIB.addImm(0x0); // Append immediate to extract from the lower bits.
+ }
+
+ return true;
+}
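// Editorial sketch, not part of the diff: the register test both expanders
// above hinge on, pulled out as a stand-alone predicate (the helper name is
// illustrative only). Encoding values 0-15 are XMM0-15/YMM0-15, which have
// VEX encodings; values 16-31 are XMM16-31/YMM16-31, which exist only in EVEX
// encodings and so, without AVX512VL, can only be reached through their ZMM
// super-register.
static bool needsZMMWidening(const TargetRegisterInfo *TRI, unsigned Reg) {
  return TRI->getEncodingValue(Reg) >= 16;
}
// For example, VMOVUPSZ128mr_NOVLX storing %xmm3 becomes a plain VMOVUPSmr,
// while the same pseudo storing %xmm17 becomes a VEXTRACTF32x4Zmr of %zmm17
// with immediate 0.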
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -5641,7 +6747,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
case X86::MOV32ImmSExti8:
case X86::MOV64ImmSExti8:
- return ExpandMOVImmSExti8(MIB);
+ return ExpandMOVImmSExti8(MIB, *this, Subtarget);
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
@@ -5663,6 +6769,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
case X86::AVX512_512_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0SD:
+ return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
@@ -5676,6 +6785,45 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
+ case X86::AVX512_512_SEXT_MASK_32:
+ case X86::AVX512_512_SEXT_MASK_64: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ unsigned MaskReg = MIB->getOperand(1).getReg();
+ unsigned MaskState = getRegState(MIB->getOperand(1));
+ unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
+ X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
+ MI.RemoveOperand(1);
+ MIB->setDesc(get(Opc));
+ // VPTERNLOG needs 3 register inputs and an immediate.
+ // 0xff will return 1s for any input.
+ MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
+ return true;
+ }
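// Editorial aside, not part of the diff: VPTERNLOG treats its immediate as a
// 3-input truth table. For each bit position the three source bits form an
// index, and the result bit is
//   (Imm >> ((Src1Bit << 2) | (Src2Bit << 1) | Src3Bit)) & 1
// so Imm = 0xff yields all-ones no matter what the (undef) register inputs
// hold. Combined with the zeroing-masking "kz" form, the expansion above
// writes ~0 into elements whose mask bit is set and 0 elsewhere, i.e. it
// sign-extends the k-register into a vector.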
+ case X86::VMOVAPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVUPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVAPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVUPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVAPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVUPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVAPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+ case X86::VMOVUPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::TEST8ri_NOREX:
MI.setDesc(get(X86::TEST8ri));
return true;
@@ -5801,6 +6949,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
switch (MI.getOpcode()) {
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
+ case X86::VINSERTPSZrr:
// Attempt to convert the load of inserted vector into a fold load
// of a single float.
if (OpNum == 2) {
@@ -5814,8 +6963,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
- (MI.getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
- : X86::INSERTPSrm);
+ (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
+ (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm :
+ X86::INSERTPSrm;
MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
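// Editorial aside, not part of the diff: the INSERTPS immediate layout this
// fold relies on is bits [7:6] = CountS (source element), bits [5:4] = CountD
// (destination slot), bits [3:0] = ZMask. The memory form only loads a single
// float, so the fold adds SrcIdx * 4 to the address and keeps just CountD and
// ZMask. Worked example for Imm = 0x9A:
//   SrcIdx = (0x9A >> 6) & 3 = 2   -> PtrOffset = 8
//   DstIdx = (0x9A >> 4) & 3 = 1,  ZMask = 0xA
//   NewImm = (1 << 4) | 0xA = 0x1A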
@@ -5825,6 +6975,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
break;
case X86::MOVHLPSrr:
case X86::VMOVHLPSrr:
+ case X86::VMOVHLPSZrr:
// Move the upper 64-bits of the second operand to the lower 64-bits.
// To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
// TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
@@ -5832,8 +6983,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
if (Size <= RCSize && 8 <= Align) {
unsigned NewOpCode =
- (MI.getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm
- : X86::MOVLPSrm);
+ (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
+ (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
+ X86::MOVLPSrm;
MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
return NewMI;
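// Editorial aside, not part of the diff: MOVHLPS reads the upper 64-bit half
// of its second operand, and in a 16-byte spill slot that half lives at byte
// offset 16 / 2 = 8, which is the fixed PtrOffset of 8 passed to FuseInst
// above.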
@@ -6042,12 +7194,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::CVTSI2SD64rm:
case X86::CVTSD2SSrr:
case X86::CVTSD2SSrm:
- case X86::Int_CVTSD2SSrr:
- case X86::Int_CVTSD2SSrm:
case X86::CVTSS2SDrr:
case X86::CVTSS2SDrm:
- case X86::Int_CVTSS2SDrr:
- case X86::Int_CVTSS2SDrm:
case X86::MOVHPDrm:
case X86::MOVHPSrm:
case X86::MOVLPDrm:
@@ -6058,10 +7206,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::RCPSSm_Int:
case X86::ROUNDSDr:
case X86::ROUNDSDm:
- case X86::ROUNDSDr_Int:
case X86::ROUNDSSr:
case X86::ROUNDSSm:
- case X86::ROUNDSSr_Int:
case X86::RSQRTSSr:
case X86::RSQRTSSm:
case X86::RSQRTSSr_Int:
@@ -6134,28 +7280,95 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::Int_VCVTSS2SDrr:
case X86::Int_VCVTSS2SDrm:
case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
case X86::VRCPSSm:
case X86::VRCPSSm_Int:
case X86::VROUNDSDr:
case X86::VROUNDSDm:
case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
case X86::VROUNDSSr:
case X86::VROUNDSSm:
case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
case X86::VRSQRTSSm:
case X86::VRSQRTSSm_Int:
case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
case X86::VSQRTSSm:
case X86::VSQRTSSm_Int:
case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
case X86::VSQRTSDm:
case X86::VSQRTSDm_Int:
- // AVX-512
+ // AVX-512
+ case X86::VCVTSI2SSZrr:
+ case X86::VCVTSI2SSZrm:
+ case X86::VCVTSI2SSZrr_Int:
+ case X86::VCVTSI2SSZrrb_Int:
+ case X86::VCVTSI2SSZrm_Int:
+ case X86::VCVTSI642SSZrr:
+ case X86::VCVTSI642SSZrm:
+ case X86::VCVTSI642SSZrr_Int:
+ case X86::VCVTSI642SSZrrb_Int:
+ case X86::VCVTSI642SSZrm_Int:
+ case X86::VCVTSI2SDZrr:
+ case X86::VCVTSI2SDZrm:
+ case X86::VCVTSI2SDZrr_Int:
+ case X86::VCVTSI2SDZrrb_Int:
+ case X86::VCVTSI2SDZrm_Int:
+ case X86::VCVTSI642SDZrr:
+ case X86::VCVTSI642SDZrm:
+ case X86::VCVTSI642SDZrr_Int:
+ case X86::VCVTSI642SDZrrb_Int:
+ case X86::VCVTSI642SDZrm_Int:
+ case X86::VCVTUSI2SSZrr:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI2SSZrr_Int:
+ case X86::VCVTUSI2SSZrrb_Int:
+ case X86::VCVTUSI2SSZrm_Int:
+ case X86::VCVTUSI642SSZrr:
+ case X86::VCVTUSI642SSZrm:
+ case X86::VCVTUSI642SSZrr_Int:
+ case X86::VCVTUSI642SSZrrb_Int:
+ case X86::VCVTUSI642SSZrm_Int:
+ case X86::VCVTUSI2SDZrr:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI2SDZrr_Int:
+ case X86::VCVTUSI2SDZrm_Int:
+ case X86::VCVTUSI642SDZrr:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI642SDZrr_Int:
+ case X86::VCVTUSI642SDZrrb_Int:
+ case X86::VCVTUSI642SDZrm_Int:
case X86::VCVTSD2SSZrr:
+ case X86::VCVTSD2SSZrrb:
case X86::VCVTSD2SSZrm:
case X86::VCVTSS2SDZrr:
+ case X86::VCVTSS2SDZrrb:
case X86::VCVTSS2SDZrm:
+ case X86::VRNDSCALESDr:
+ case X86::VRNDSCALESDrb:
+ case X86::VRNDSCALESDm:
+ case X86::VRNDSCALESSr:
+ case X86::VRNDSCALESSrb:
+ case X86::VRNDSCALESSm:
+ case X86::VRCP14SSrr:
+ case X86::VRCP14SSrm:
+ case X86::VRSQRT14SSrr:
+ case X86::VRSQRT14SSrm:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
return true;
}
@@ -6233,9 +7446,17 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned Size = MFI->getObjectSize(FrameIndex);
- unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ // Don't fold subreg spills, or reloads that use a high subreg.
+ for (auto Op : Ops) {
+ MachineOperand &MO = MI.getOperand(Op);
+ auto SubReg = MO.getSubReg();
+ if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
+ return nullptr;
+ }
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Size = MFI.getObjectSize(FrameIndex);
+ unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
if (!RI.needsStackRealignment(MF))
@@ -6295,15 +7516,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// instruction isn't scalar (SS).
switch (UserOpc) {
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
+ case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
+ case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
+ case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
- case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int:
- case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int:
- case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int:
- case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int:
- case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int:
- case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int:
+ case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
+ case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
+ case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
+ case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int:
+ case X86::VFMADD231SSr_Int: case X86::VFNMADD231SSr_Int:
+ case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int:
+ case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int:
+ case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int:
+ case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
+ case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
+ case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
+ case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
+ case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
+ case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
return false;
default:
return true;
@@ -6317,15 +7549,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// instruction isn't scalar (SD).
switch (UserOpc) {
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
+ case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
+ case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
+ case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
- case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int:
- case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int:
- case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int:
- case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int:
- case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int:
- case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int:
+ case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
+ case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
+ case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
+ case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int:
+ case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int:
+ case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int:
+ case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int:
+ case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int:
+ case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
+ case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
+ case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
+ case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
+ case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
+ case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
return false;
default:
return true;
@@ -6339,6 +7582,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
+
+ // TODO: Support the case where LoadMI loads a wide register, but MI
+ // only uses a subreg.
+ for (auto Op : Ops) {
+ if (MI.getOperand(Op).getSubReg())
+ return nullptr;
+ }
+
// If loading from a FrameIndex, fold directly from the FrameIndex.
unsigned NumOps = LoadMI.getDesc().getNumOperands();
int FrameIndex;
@@ -6376,9 +7627,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Alignment = 16;
break;
case X86::FsFLD0SD:
+ case X86::AVX512_FsFLD0SD:
Alignment = 8;
break;
case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS:
Alignment = 4;
break;
default:
@@ -6415,7 +7668,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SD:
- case X86::FsFLD0SS: {
+ case X86::AVX512_FsFLD0SD:
+ case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
// Create a constant-pool entry and operands to load from it.
@@ -6441,9 +7696,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineConstantPool &MCP = *MF.getConstantPool();
Type *Ty;
unsigned Opc = LoadMI.getOpcode();
- if (Opc == X86::FsFLD0SS)
+ if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
Ty = Type::getFloatTy(MF.getFunction()->getContext());
- else if (Opc == X86::FsFLD0SD)
+ else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
@@ -6649,7 +7904,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
@@ -6694,7 +7949,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
SDNode *Store =
@@ -6746,8 +8001,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVSDrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
- case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
@@ -6757,8 +8010,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
// AVX load instructions
case X86::VMOVSSrm:
case X86::VMOVSDrm:
- case X86::FsVMOVAPSrm:
- case X86::FsVMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -6776,6 +8027,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
@@ -6786,6 +8039,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
@@ -6823,8 +8078,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVSDrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
- case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
@@ -6834,8 +8087,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
// AVX load instructions
case X86::VMOVSSrm:
case X86::VMOVSDrm:
- case X86::FsVMOVAPSrm:
- case X86::FsVMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -6853,6 +8104,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
@@ -6863,6 +8116,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
@@ -6960,8 +8215,8 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
- MachineInstr &Second) const {
+bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
+ const MachineInstr &Second) const {
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
@@ -7120,7 +8375,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
}
bool X86InstrInfo::
-ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
Cond[0].setImm(GetOppositeBranchCondition(CC));
@@ -7168,7 +8423,10 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
{ X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
{ X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
- { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
+ { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
+ { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
{ X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
{ X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
{ X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
@@ -7184,7 +8442,10 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
{ X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
{ X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
- { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
+ { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
+ { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
{ X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
{ X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
{ X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
@@ -7200,7 +8461,26 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
{ X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
- { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
+ { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
+ // AVX512 support
+ { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
+ { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+ { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
+ { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
+ { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
+ { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
+ { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
+ { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
+ { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
+ { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
+ { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
+ { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
};
static const uint16_t ReplaceableInstrsAVX2[][3] = {
@@ -7224,22 +8504,257 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
{ X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
- { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
+ { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
+ { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
+};
+
+static const uint16_t ReplaceableInstrsAVX512[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr },
+ { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr },
+ { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr },
+ { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm },
+ { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr },
+ { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr },
+ { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr },
+ { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm },
+ { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr },
+ { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm },
+ { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr },
+ { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr },
+ { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+ { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+ { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+ { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+ { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+ { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+ { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+ { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+ { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+ { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+ { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+ { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+ { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+ { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+ { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+ { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+ { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm },
+ { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr },
+ { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm },
+ { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr },
+ { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm },
+ { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr },
+ { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm },
+ { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble
+ //PackedInt PackedInt
+ { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
+ X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
+ { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
+ X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
+ { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
+ X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
+ X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
+ { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
+ X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
+ { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
+ X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
+ { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
+ X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
+ { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
+ X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
+ { X86::VORPSZ128rmk, X86::VORPDZ128rmk,
+ X86::VPORQZ128rmk, X86::VPORDZ128rmk },
+ { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
+ X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
+ { X86::VORPSZ128rrk, X86::VORPDZ128rrk,
+ X86::VPORQZ128rrk, X86::VPORDZ128rrk },
+ { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
+ X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
+ { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
+ X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
+ { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
+ X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
+ { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
+ X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
+ { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
+ X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
+ { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
+ X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
+ { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
+ X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
+ { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
+ X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
+ X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
+ { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
+ X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
+ { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
+ X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
+ { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
+ X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
+ { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
+ X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
+ { X86::VORPSZ256rmk, X86::VORPDZ256rmk,
+ X86::VPORQZ256rmk, X86::VPORDZ256rmk },
+ { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
+ X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
+ { X86::VORPSZ256rrk, X86::VORPDZ256rrk,
+ X86::VPORQZ256rrk, X86::VPORDZ256rrk },
+ { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
+ X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
+ { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
+ X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
+ { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
+ X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
+ { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
+ X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
+ { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
+ X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
+ { X86::VANDNPSZrmk, X86::VANDNPDZrmk,
+ X86::VPANDNQZrmk, X86::VPANDNDZrmk },
+ { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
+ X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
+ { X86::VANDNPSZrrk, X86::VANDNPDZrrk,
+ X86::VPANDNQZrrk, X86::VPANDNDZrrk },
+ { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
+ X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
+ { X86::VANDPSZrmk, X86::VANDPDZrmk,
+ X86::VPANDQZrmk, X86::VPANDDZrmk },
+ { X86::VANDPSZrmkz, X86::VANDPDZrmkz,
+ X86::VPANDQZrmkz, X86::VPANDDZrmkz },
+ { X86::VANDPSZrrk, X86::VANDPDZrrk,
+ X86::VPANDQZrrk, X86::VPANDDZrrk },
+ { X86::VANDPSZrrkz, X86::VANDPDZrrkz,
+ X86::VPANDQZrrkz, X86::VPANDDZrrkz },
+ { X86::VORPSZrmk, X86::VORPDZrmk,
+ X86::VPORQZrmk, X86::VPORDZrmk },
+ { X86::VORPSZrmkz, X86::VORPDZrmkz,
+ X86::VPORQZrmkz, X86::VPORDZrmkz },
+ { X86::VORPSZrrk, X86::VORPDZrrk,
+ X86::VPORQZrrk, X86::VPORDZrrk },
+ { X86::VORPSZrrkz, X86::VORPDZrrkz,
+ X86::VPORQZrrkz, X86::VPORDZrrkz },
+ { X86::VXORPSZrmk, X86::VXORPDZrmk,
+ X86::VPXORQZrmk, X86::VPXORDZrmk },
+ { X86::VXORPSZrmkz, X86::VXORPDZrmkz,
+ X86::VPXORQZrmkz, X86::VPXORDZrmkz },
+ { X86::VXORPSZrrk, X86::VXORPDZrrk,
+ X86::VPXORQZrrk, X86::VPXORDZrrk },
+ { X86::VXORPSZrrkz, X86::VXORPDZrrkz,
+ X86::VPXORQZrrkz, X86::VPXORDZrrkz },
+ // Broadcast loads can be handled the same as masked operations to avoid
+ // changing element size.
+ { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb,
+ X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb },
+ { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb,
+ X86::VPANDQZ128rmb, X86::VPANDDZ128rmb },
+ { X86::VORPSZ128rmb, X86::VORPDZ128rmb,
+ X86::VPORQZ128rmb, X86::VPORDZ128rmb },
+ { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb,
+ X86::VPXORQZ128rmb, X86::VPXORDZ128rmb },
+ { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb,
+ X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb },
+ { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb,
+ X86::VPANDQZ256rmb, X86::VPANDDZ256rmb },
+ { X86::VORPSZ256rmb, X86::VORPDZ256rmb,
+ X86::VPORQZ256rmb, X86::VPORDZ256rmb },
+ { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb,
+ X86::VPXORQZ256rmb, X86::VPXORDZ256rmb },
+ { X86::VANDNPSZrmb, X86::VANDNPDZrmb,
+ X86::VPANDNQZrmb, X86::VPANDNDZrmb },
+ { X86::VANDPSZrmb, X86::VANDPDZrmb,
+ X86::VPANDQZrmb, X86::VPANDDZrmb },
+ { X86::VORPSZrmb, X86::VORPDZrmb,
+ X86::VPORQZrmb, X86::VPORDZrmb },
+ { X86::VXORPSZrmb, X86::VXORPDZrmb,
+ X86::VPXORQZrmb, X86::VPXORDZrmb },
+ { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
+ X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
+ { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk,
+ X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk },
+ { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk,
+ X86::VPORQZ128rmbk, X86::VPORDZ128rmbk },
+ { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk,
+ X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk },
+ { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
+ X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
+ { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk,
+ X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk },
+ { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk,
+ X86::VPORQZ256rmbk, X86::VPORDZ256rmbk },
+ { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk,
+ X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk },
+ { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk,
+ X86::VPANDNQZrmbk, X86::VPANDNDZrmbk },
+ { X86::VANDPSZrmbk, X86::VANDPDZrmbk,
+ X86::VPANDQZrmbk, X86::VPANDDZrmbk },
+ { X86::VORPSZrmbk, X86::VORPDZrmbk,
+ X86::VPORQZrmbk, X86::VPORDZrmbk },
+ { X86::VXORPSZrmbk, X86::VXORPDZrmbk,
+ X86::VPXORQZrmbk, X86::VPXORDZrmbk },
+ { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz,
+ X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz},
+ { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
+ X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
+ { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz,
+ X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz },
+ { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
+ X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
+ { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz,
+ X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz},
+ { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
+ X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
+ { X86::VORPSZ256rmbkz, X86::VORPDZ256rmbkz,
+ X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz },
+ { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
+ X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
+ { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz,
+ X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz },
+ { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
+ X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
+ { X86::VORPSZrmbkz, X86::VORPDZrmbkz,
+ X86::VPORQZrmbkz, X86::VPORDZrmbkz },
+ { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz,
+ X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
};
// FIXME: Some shuffle and unpack instructions have equivalents in different
// domains, but they require a bit more work than just switching opcodes.
-static const uint16_t *lookup(unsigned opcode, unsigned domain) {
- for (const uint16_t (&Row)[3] : ReplaceableInstrs)
+static const uint16_t *lookup(unsigned opcode, unsigned domain,
+ ArrayRef<uint16_t[3]> Table) {
+ for (const uint16_t (&Row)[3] : Table)
if (Row[domain-1] == opcode)
return Row;
return nullptr;
}
-static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
- for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2)
- if (Row[domain-1] == opcode)
+static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
+ ArrayRef<uint16_t[4]> Table) {
+ // If this is the integer domain make sure to check both integer columns.
+ for (const uint16_t (&Row)[4] : Table)
+ if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
return Row;
return nullptr;
}
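// Editorial sketch, not part of the diff (the helper name is illustrative
// only): how a row returned by lookup() is used. Column i holds the
// equivalent opcode for execution domain i + 1, so for a PackedSingle
// (domain 1) opcode the PackedInt replacement sits in column 2.
static unsigned toIntDomainOpcode(unsigned Opcode) {
  if (const uint16_t *Row = lookup(Opcode, /*domain=*/1, ReplaceableInstrs))
    return Row[2]; // e.g. X86::MOVAPSrr -> X86::MOVDQArr.
  return 0;
}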
@@ -7247,12 +8762,25 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
- bool hasAVX2 = Subtarget.hasAVX2();
+ unsigned opcode = MI.getOpcode();
uint16_t validDomains = 0;
- if (domain && lookup(MI.getOpcode(), domain))
- validDomains = 0xe;
- else if (domain && lookupAVX2(MI.getOpcode(), domain))
- validDomains = hasAVX2 ? 0xe : 0x6;
+ if (domain) {
+ if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
+ validDomains = 0xe;
+ } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
+ validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
+ validDomains = 0xe;
+ } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
+ validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
+ } else if (const uint16_t *table = lookupAVX512(opcode, domain,
+ ReplaceableInstrsAVX512DQMasked)) {
+ if (domain == 1 || (domain == 3 && table[3] == opcode))
+ validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
+ else
+ validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
+ }
+ }
return std::make_pair(domain, validDomains);
}
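// Editorial sketch, not part of the diff: what the validDomains masks above
// encode. Bit N set means execution domain N is a legal target, with
// 1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt:
//   0xe = 1110b -> {PS, PD, PInt}      0x6 = 0110b -> {PS, PD}
//   0xa = 1010b -> {PS, PInt}          0xc = 1100b -> {PD, PInt}
//   0x8 = 1000b -> {PInt} only
// (the helper name below is illustrative only).
static bool isDomainLegal(unsigned ValidDomains, unsigned Domain) {
  return (ValidDomains >> Domain) & 1; // Domain is 1, 2 or 3.
}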
@@ -7260,11 +8788,32 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert(Domain>0 && Domain<4 && "Invalid execution domain");
uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
assert(dom && "Not an SSE instruction");
- const uint16_t *table = lookup(MI.getOpcode(), dom);
+ const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
if (!table) { // try the other table
assert((Subtarget.hasAVX2() || Domain < 3) &&
"256-bit vector operations only available in AVX2");
- table = lookupAVX2(MI.getOpcode(), dom);
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
+ }
+ if (!table) { // try the AVX512 table
+ assert(Subtarget.hasAVX512() && "Requires AVX-512");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
+ // Don't change integer Q instructions to D instructions.
+ if (table && Domain == 3 && table[3] == MI.getOpcode())
+ Domain = 4;
+ }
+ if (!table) { // try the AVX512DQ table
+ assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
+ // Don't change integer Q instructions to D instructions and
+    // use D instructions if we started with a PS instruction.
+ if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
+ }
+ if (!table) { // try the AVX512DQMasked table
+ assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
+ if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
}
assert(table && "Cannot change domain");
MI.setDesc(get(table[Domain - 1]));
@@ -7275,32 +8824,6 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(X86::NOOP);
}
-// This code must remain in sync with getJumpInstrTableEntryBound in this class!
-// In particular, getJumpInstrTableEntryBound must always return an upper bound
-// on the encoding lengths of the instructions generated by
-// getUnconditionalBranch and getTrap.
-void X86InstrInfo::getUnconditionalBranch(
- MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
- Branch.setOpcode(X86::JMP_1);
- Branch.addOperand(MCOperand::createExpr(BranchTarget));
-}
-
-// This code must remain in sync with getJumpInstrTableEntryBound in this class!
-// In particular, getJumpInstrTableEntryBound must always return an upper bound
-// on the encoding lengths of the instructions generated by
-// getUnconditionalBranch and getTrap.
-void X86InstrInfo::getTrap(MCInst &MI) const {
- MI.setOpcode(X86::TRAP);
-}
-
-// See getTrap and getUnconditionalBranch for conditions on the value returned
-// by this function.
-unsigned X86InstrInfo::getJumpInstrTableEntryBound() const {
- // 5 bytes suffice: JMP_4 Symbol@PLT is uses 1 byte (E9) for the JMP_4 and 4
- // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B).
- return 5;
-}
-
bool X86InstrInfo::isHighLatencyDef(int opc) const {
switch (opc) {
default: return false;
@@ -7934,6 +9457,28 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
+bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ case X86::TCRETURNdi:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNmi64:
+ case X86::TCRETURNri64:
+ case X86::TAILJMPd:
+ case X86::TAILJMPm:
+ case X86::TAILJMPr:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64_REX:
+ case X86::TAILJMPr64_REX:
+ return true;
+ default:
+ return false;
+ }
+}
+
namespace {
/// Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32.
@@ -7991,7 +9536,7 @@ namespace {
return true;
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "X86 PIC Global Base Reg Initialization";
}
@@ -8105,7 +9650,7 @@ namespace {
return Copy;
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "Local Dynamic TLS Access Clean-up";
}