diff options
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86InstrSSE.td')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86InstrSSE.td | 1855 |
1 files changed, 1255 insertions, 600 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index a5debc0..2bb898e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -120,6 +120,11 @@ def SSE_DIV_ITINS_P : SizeItins< SSE_DIV_F32P, SSE_DIV_F64P >; +let Sched = WriteVecLogic in +def SSE_VEC_BIT_ITINS_P : OpndItins< + IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM +>; + def SSE_BIT_ITINS_P : OpndItins< IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM >; @@ -171,6 +176,7 @@ def SSE_INSERT_ITINS : OpndItins< IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM >; +let Sched = WriteMPSAD in def SSE_MPSADBW_ITINS : OpndItins< IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM >; @@ -179,6 +185,44 @@ def SSE_PMULLD_ITINS : OpndItins< IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM >; +// Definitions for backward compatibility. +// The instructions mapped on these definitions uses a different itinerary +// than the actual scheduling model. +let Sched = WriteShuffle in +def DEFAULT_ITINS_SHUFFLESCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteVecIMul in +def DEFAULT_ITINS_VECIMULSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteShuffle in +def SSE_INTALU_ITINS_SHUFF_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +let Sched = WriteMPSAD in +def DEFAULT_ITINS_MPSADSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteFBlend in +def DEFAULT_ITINS_FBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteBlend in +def DEFAULT_ITINS_BLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteFBlend in +def SSE_INTALU_ITINS_FBLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 Instructions Classes //===----------------------------------------------------------------------===// @@ -210,6 +254,7 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, Operand memopr, ComplexPattern mem_cpat, OpndItins itins, bit Is2Addr = 1> { +let isCodeGenOnly = 1 in { def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), @@ -227,6 +272,7 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, RC:$src1, mem_cpat:$src2))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +} /// sse12_fp_packed - SSE 1 & 2 packed instructions class multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -497,14 +543,14 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, !strconcat(base_opc, asm_opr), [(set VR128:$dst, (vt (OpNode VR128:$src1, (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>; + IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>; // For the disassembler - let isCodeGenOnly = 1, hasSideEffects = 0 in + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), !strconcat(base_opc, asm_opr), - [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>; + [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>; } multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, @@ -800,7 +846,7 @@ multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, let neverHasSideEffects = 1 in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, - Sched<[WriteMove]>; + Sched<[WriteFShuffle]>; let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), @@ -810,41 +856,41 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, - TB, VEX; + PS, VEX; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - TB, OpSize, VEX; + PD, VEX; defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, - TB, VEX; + PS, VEX; defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, - TB, OpSize, VEX; + PD, VEX; defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, - TB, VEX, VEX_L; + PS, VEX, VEX_L; defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - TB, OpSize, VEX, VEX_L; + PD, VEX, VEX_L; defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, - TB, VEX, VEX_L; + PS, VEX, VEX_L; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, - TB, OpSize, VEX, VEX_L; + PD, VEX, VEX_L; defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, - TB; + PS; defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - TB, OpSize; + PD; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, - TB; + PS; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, - TB, OpSize; + PD; let SchedRW = [WriteStore] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), @@ -882,7 +928,8 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), } // SchedRW // For disassembler -let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteFShuffle] in { def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], @@ -958,7 +1005,8 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), } // SchedRW // For disassembler -let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -1138,16 +1186,16 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, [(set VR128:$dst, (psnode VR128:$src1, (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], - itin, SSEPackedSingle>, TB, - Sched<[WriteShuffleLd, ReadAfterLd]>; + itin, SSEPackedSingle>, PS, + Sched<[WriteFShuffleLd, ReadAfterLd]>; def PDrm : PI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), !strconcat(base_opc, "d", asm_opr), [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))))], - itin, SSEPackedDouble>, TB, OpSize, - Sched<[WriteShuffleLd, ReadAfterLd]>; + itin, SSEPackedDouble>, PD, + Sched<[WriteFShuffleLd, ReadAfterLd]>; } @@ -1345,14 +1393,14 @@ let AddedComplexity = 20, Predicates = [UseAVX] in { [(set VR128:$dst, (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], IIC_SSE_MOV_LH>, - VEX_4V, Sched<[WriteShuffle]>; + VEX_4V, Sched<[WriteFShuffle]>; def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], IIC_SSE_MOV_LH>, - VEX_4V, Sched<[WriteShuffle]>; + VEX_4V, Sched<[WriteFShuffle]>; } let Constraints = "$src1 = $dst", AddedComplexity = 20 in { def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), @@ -1360,13 +1408,13 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { "movlhps\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], - IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movhlps\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], - IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; } let Predicates = [UseAVX] in { @@ -1513,9 +1561,9 @@ defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, let Predicates = [UseAVX] in { def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>; + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; + (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -1579,9 +1627,9 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", - (CVTSI2SSrm FR64:$dst, i32mem:$src)>; + (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>; def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", - (CVTSI2SDrm FR64:$dst, i32mem:$src)>; + (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>; // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). @@ -1632,40 +1680,43 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; -let Predicates = [UseAVX] in { -defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", - SSE_CVT_Scalar, 0>, XS, VEX_4V; -defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", - SSE_CVT_Scalar, 0>, XS, VEX_4V, - VEX_W; -defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", - SSE_CVT_Scalar, 0>, XD, VEX_4V; -defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", - SSE_CVT_Scalar, 0>, XD, - VEX_4V, VEX_W; -} -let Constraints = "$src1 = $dst" in { - defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, - "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; - defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse_cvtsi642ss, i64mem, loadi64, - "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; - defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, - "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; - defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, - "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; -} +let isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in { + defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", + SSE_CVT_Scalar, 0>, XS, VEX_4V; + defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", + SSE_CVT_Scalar, 0>, XS, VEX_4V, + VEX_W; + defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", + SSE_CVT_Scalar, 0>, XD, VEX_4V; + defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", + SSE_CVT_Scalar, 0>, XD, + VEX_4V, VEX_W; + } + let Constraints = "$src1 = $dst" in { + defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, + "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; + defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, + "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; + defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, + "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; + defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, + "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; + } +} // isCodeGenOnly = 1 /// SSE 1 Only // Aliases for intrinsics +let isCodeGenOnly = 1 in { let Predicates = [UseAVX] in { defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, ssmem, sse_load_f32, "cvttss2si", @@ -1694,6 +1745,7 @@ defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; +} // isCodeGenOnly = 1 let Predicates = [UseAVX] in { defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, @@ -1713,16 +1765,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - TB, VEX, Requires<[HasAVX]>; + PS, VEX, Requires<[HasAVX]>; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - TB, VEX, VEX_L, Requires<[HasAVX]>; + PS, VEX, VEX_L, Requires<[HasAVX]>; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - TB, Requires<[UseSSE2]>; + PS, Requires<[UseSSE2]>; let Predicates = [UseAVX] in { def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", @@ -1792,6 +1844,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), XD, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; +let isCodeGenOnly = 1 in { def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1823,6 +1876,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } +} // isCodeGenOnly = 1 // Convert scalar single to scalar double // SSE2 instructions with XS prefix @@ -1875,6 +1929,7 @@ def : Pat<(fextend (loadf32 addr:$src)), def : Pat<(extloadf32 addr:$src), (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; +let isCodeGenOnly = 1 in { def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1905,6 +1960,7 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } +} // isCodeGenOnly = 1 // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -1949,7 +2005,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTPD2DQrr VR128:$dst, VR128:$src)>; + (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -1968,7 +2024,7 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", - (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; + (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; } def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -2071,7 +2127,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQrr VR128:$dst, VR128:$src)>; + (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq @@ -2090,7 +2146,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; + (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; let Predicates = [HasAVX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), @@ -2116,32 +2172,32 @@ let Predicates = [HasAVX] in { def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>; + IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2_pd_256 VR128:$src))], - IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; + IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; } let Predicates = [UseSSE2] in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>; + IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>; } // Convert Packed DW Integers to Packed Double FP @@ -2196,7 +2252,7 @@ def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", - (VCVTPD2PSrr VR128:$dst, VR128:$src)>; + (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2psx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2215,7 +2271,7 @@ def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", - (VCVTPD2PSYrr VR128:$dst, VR256:$src)>; + (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", @@ -2287,7 +2343,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, Sched<[itins.Sched.Folded, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. - let neverHasSideEffects = 1 in { + let isAsmParserOnly = 1, hasSideEffects = 0 in { def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; @@ -2299,23 +2355,23 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, } } -defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32, +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSE_ALU_F32S>, XS, VEX_4V, VEX_LIG; -defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64, +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSE_ALU_F32S>, // same latency as 32 bit compare XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { - defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32, + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>, XS; - defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64, + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F64S>, @@ -2338,23 +2394,25 @@ multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, Sched<[itins.Sched.Folded, ReadAfterLd]>; } -// Aliases to match intrinsics which expect XMM operand(s). -defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S>, - XS, VEX_4V; -defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S>, // same latency as f32 - XD, VEX_4V; -let Constraints = "$src1 = $dst" in { - defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $dst|$dst, $src}", - SSE_ALU_F32S>, XS; - defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $dst|$dst, $src}", - SSE_ALU_F64S>, - XD; +let isCodeGenOnly = 1 in { + // Aliases to match intrinsics which expect XMM operand(s). + defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S>, + XS, VEX_4V; + defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S>, // same latency as f32 + XD, VEX_4V; + let Constraints = "$src1 = $dst" in { + defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}", + SSE_ALU_F32S>, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}", + SSE_ALU_F64S>, + XD; +} } @@ -2377,46 +2435,50 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss">, TB, VEX, VEX_LIG; + "ucomiss">, PS, VEX, VEX_LIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd">, TB, OpSize, VEX, VEX_LIG; + "ucomisd">, PD, VEX, VEX_LIG; let Pattern = []<dag> in { defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss">, TB, VEX, VEX_LIG; + "comiss">, PS, VEX, VEX_LIG; defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd">, TB, OpSize, VEX, VEX_LIG; + "comisd">, PD, VEX, VEX_LIG; } - defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss">, TB, VEX; - defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd">, TB, OpSize, VEX; + let isCodeGenOnly = 1 in { + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD, VEX; - defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss">, TB, VEX; - defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd">, TB, OpSize, VEX; + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss">, PS, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd">, PD, VEX; + } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss">, TB; + "ucomiss">, PS; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd">, TB, OpSize; + "ucomisd">, PD; let Pattern = []<dag> in { defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss">, TB; + "comiss">, PS; defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd">, TB, OpSize; + "comisd">, PD; } - defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss">, TB; - defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd">, TB, OpSize; + let isCodeGenOnly = 1 in { + defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS; + defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD; - defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, - "comiss">, TB; - defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, - "comisd">, TB, OpSize; + defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, + "comiss">, PS; + defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, + "comisd">, PD; + } } // Defs = [EFLAGS] // sse12_cmp_packed - sse 1 & 2 compare packed instructions @@ -2436,7 +2498,7 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Sched<[WriteFAddLd, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. - let neverHasSideEffects = 1 in { + let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>; @@ -2450,28 +2512,28 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle>, TB, VEX_4V; + SSEPackedSingle>, PS, VEX_4V; defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble>, TB, OpSize, VEX_4V; + SSEPackedDouble>, PD, VEX_4V; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle>, TB, VEX_4V, VEX_L; + SSEPackedSingle>, PS, VEX_4V, VEX_L; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; + SSEPackedDouble>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedSingle, SSE_ALU_F32P>, TB; + SSEPackedSingle, SSE_ALU_F32P>, PS; defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedDouble, SSE_ALU_F64P>, TB, OpSize; + SSEPackedDouble, SSE_ALU_F64P>, PD; } let Predicates = [HasAVX] in { @@ -2512,7 +2574,7 @@ def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), // SSE 1 & 2 - Shuffle Instructions //===----------------------------------------------------------------------===// -/// sse12_shuffle - sse 1 & 2 shuffle instructions +/// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, Domain d, bit IsConvertibleToThreeAddress = 0> { @@ -2520,37 +2582,35 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + Sched<[WriteFShuffleLd, ReadAfterLd]>; let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteShuffle]>; + Sched<[WriteFShuffle]>; } defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv4f32, SSEPackedSingle>, TB, VEX_4V; + loadv4f32, SSEPackedSingle>, PS, VEX_4V; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L; + loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; + loadv2f64, SSEPackedDouble>, PD, VEX_4V; defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; + loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, - TB; + memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, - TB, OpSize; + memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD; } let Predicates = [HasAVX] in { @@ -2598,10 +2658,10 @@ let Predicates = [UseSSE2] in { } //===----------------------------------------------------------------------===// -// SSE 1 & 2 - Unpack Instructions +// SSE 1 & 2 - Unpack FP Instructions //===----------------------------------------------------------------------===// -/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave +/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, PatFrag mem_frag, RegisterClass RC, X86MemOperand x86memop, string asm, @@ -2610,55 +2670,55 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, (outs RC:$dst), (ins RC:$src1, RC:$src2), asm, [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], - IIC_SSE_UNPCK, d>, Sched<[WriteShuffle]>; + IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>; def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), asm, [(set RC:$dst, (vt (OpNode RC:$src1, (mem_frag addr:$src2))))], IIC_SSE_UNPCK, d>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + Sched<[WriteFShuffleLd, ReadAfterLd]>; } defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; + SSEPackedSingle>, PS, VEX_4V; defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; + SSEPackedDouble>, PD, VEX_4V; defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; + SSEPackedSingle>, PS, VEX_4V; defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; + SSEPackedDouble>, PD, VEX_4V; defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V, VEX_L; + SSEPackedSingle>, PS, VEX_4V, VEX_L; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; + SSEPackedDouble>, PD, VEX_4V, VEX_L; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V, VEX_L; + SSEPackedSingle>, PS, VEX_4V, VEX_L; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; + SSEPackedDouble>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, TB; + SSEPackedSingle>, PS; defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", - SSEPackedDouble>, TB, OpSize; + SSEPackedDouble>, PD; defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, TB; + SSEPackedSingle>, PS; defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", - SSEPackedDouble>, TB, OpSize; + SSEPackedDouble>, PD; } // Constraints = "$src1 = $dst" let Predicates = [HasAVX1Only] in { @@ -2714,16 +2774,15 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, let Predicates = [HasAVX] in { defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, - "movmskps", SSEPackedSingle>, TB, VEX; + "movmskps", SSEPackedSingle>, PS, VEX; defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, - "movmskpd", SSEPackedDouble>, TB, - OpSize, VEX; + "movmskpd", SSEPackedDouble>, PD, VEX; defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, - "movmskps", SSEPackedSingle>, TB, + "movmskps", SSEPackedSingle>, PS, VEX, VEX_L; defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, - "movmskpd", SSEPackedDouble>, TB, - OpSize, VEX, VEX_L; + "movmskpd", SSEPackedDouble>, PD, + VEX, VEX_L; def : Pat<(i32 (X86fgetsign FR32:$src)), (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -2738,9 +2797,9 @@ let Predicates = [HasAVX] in { } defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", - SSEPackedSingle>, TB; + SSEPackedSingle>, PS; defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", - SSEPackedDouble>, TB, OpSize; + SSEPackedDouble>, PD; def : Pat<(i32 (X86fgetsign FR32:$src)), (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>, @@ -2807,11 +2866,14 @@ let Predicates = [HasAVX2] in // These are ordered here for pattern ordering requirements with the fp versions -defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; -defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; -defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; +defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1>; +defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1>; +defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1>; defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, - SSE_BIT_ITINS_P, 0>; + SSE_VEC_BIT_ITINS_P, 0>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions @@ -2823,20 +2885,20 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>, - TB, VEX_4V; + PS, VEX_4V; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>, - TB, OpSize, VEX_4V; + PD, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins>, - TB; + PS; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins>, - TB, OpSize; + PD; } } @@ -2862,7 +2924,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "ps"), f256mem, [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), - (loadv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L; + (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, @@ -2870,7 +2932,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, (bc_v4i64 (v4f64 VR256:$src2))))], [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), (loadv4i64 addr:$src2)))], 0>, - TB, OpSize, VEX_4V, VEX_L; + PD, VEX_4V, VEX_L; // In AVX no need to add a pattern for 128-bit logical rr ps, because they // are all promoted to v2i64, and the patterns are covered by the int @@ -2879,7 +2941,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, [], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (loadv2i64 addr:$src2)))], 0>, TB, VEX_4V; + (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, @@ -2887,21 +2949,21 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, (bc_v2i64 (v2f64 VR128:$src2))))], [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), (loadv2i64 addr:$src2)))], 0>, - TB, OpSize, VEX_4V; + PD, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (memopv2i64 addr:$src2)))]>, TB; + (memopv2i64 addr:$src2)))]>, PS; defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), (bc_v2i64 (v2f64 VR128:$src2))))], [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), - (memopv2i64 addr:$src2)))]>, TB, OpSize; + (memopv2i64 addr:$src2)))]>, PD; } } @@ -2911,6 +2973,19 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; let isCommutable = 0 in defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; +// AVX1 requires type coercions in order to fold loads directly into logical +// operations. +let Predicates = [HasAVX1Only] in { + def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), + (VANDNPSYrm VR256:$src1, addr:$src2)>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Arithmetic Instructions //===----------------------------------------------------------------------===// @@ -2932,25 +3007,25 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, - SSEPackedSingle, itins.s, 0>, TB, VEX_4V; + SSEPackedSingle, itins.s, 0>, PS, VEX_4V; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, loadv2f64, - SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V; + SSEPackedDouble, itins.d, 0>, PD, VEX_4V; defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, - SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L; + SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L; defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, - SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L; + SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, memopv4f32, SSEPackedSingle, - itins.s>, TB; + itins.s>, PS; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, memopv2f64, SSEPackedDouble, - itins.d>, TB, OpSize; + itins.d>, PD; } } @@ -3017,6 +3092,214 @@ let isCodeGenOnly = 1 in { basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; } +// Patterns used to select SSE scalar fp arithmetic instructions from +// a scalar fp operation followed by a blend. +// +// These patterns know, for example, how to select an ADDSS from a +// float add plus vector insert. +// +// The effect is that the backend no longer emits unnecessary vector +// insert instructions immediately after SSE scalar fp instructions +// like addss or mulss. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// A[0] += B[0]; +// return A; +// } +// +// previously we generated: +// addss %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// we now generate: +// addss %xmm1, %xmm0 + +let Predicates = [UseSSE1] in { + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; +} + +let Predicates = [UseSSE2] in { + // SSE2 patterns to select scalar double-precision fp arithmetic instructions + + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; +} + +let Predicates = [UseSSE41] in { + // If the subtarget has SSE4.1 but not AVX, the vector insert + // instruction is lowered into a X86insertps rather than a X86Movss. + // When selecting SSE scalar single-precision fp arithmetic instructions, + // make sure that we correctly match the X86insertps. + + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; +} + +let Predicates = [HasAVX] in { + // The following patterns select AVX Scalar single/double precision fp + // arithmetic instructions. + + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; +} + +// Patterns used to select SSE scalar fp arithmetic instructions from +// a vector packed single/double fp operation followed by a vector insert. +// +// The effect is that the backend converts the packed fp instruction +// followed by a vector insert into a single SSE scalar fp instruction. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// __m128 C = A + B; +// return (__m128) {c[0], a[1], a[2], a[3]}; +// } +// +// previously we generated: +// addps %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// we now generate: +// addss %xmm1, %xmm0 + +let Predicates = [UseSSE1] in { + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (MULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; +} + +let Predicates = [UseSSE2] in { + // SSE2 patterns to select scalar double-precision fp arithmetic instructions + // from a packed double-precision fp instruction plus movsd. + + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + +let Predicates = [HasAVX] in { + // The following patterns select AVX Scalar single/double precision fp + // arithmetic instructions from a packed single precision fp instruction + // plus movss/movsd. + + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the @@ -3069,6 +3352,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1 in def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), !strconcat("v", OpcodeStr, @@ -3089,6 +3373,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; +let isCodeGenOnly = 1 in { def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>, @@ -3098,6 +3383,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>, Sched<[itins.Sched.Folded]>; } +} /// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand. multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3115,6 +3401,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1 in def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), !strconcat("v", OpcodeStr, @@ -3135,7 +3422,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; - let Constraints = "$src1 = $dst" in { + let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), @@ -3188,6 +3475,7 @@ let Predicates = [HasAVX] in { multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, Intrinsic V4F32Int, Intrinsic V8F32Int, OpndItins itins> { +let isCodeGenOnly = 1 in { let Predicates = [HasAVX] in { def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, @@ -3220,6 +3508,7 @@ let Predicates = [HasAVX] in { !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], itins.rm>, Sched<[itins.Sched.Folded]>; +} // isCodeGenOnly = 1 } /// sse2_fp_unop_s - SSE2 unops in scalar form. @@ -3238,6 +3527,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1 in def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), !strconcat("v", OpcodeStr, @@ -3256,6 +3546,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD, Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>; +let isCodeGenOnly = 1 in { def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>, @@ -3265,6 +3556,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in { [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>, Sched<[itins.Sched.Folded]>; } +} /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, @@ -3455,19 +3747,14 @@ def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti{l}\t{$src, $dst|$dst, $src}", [(nontemporalstore (i32 GR32:$src), addr:$dst)], IIC_SSE_MOVNT>, - TB, Requires<[HasSSE2]>; + PS, Requires<[HasSSE2]>; def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movnti{q}\t{$src, $dst|$dst, $src}", [(nontemporalstore (i64 GR64:$src), addr:$dst)], IIC_SSE_MOVNT>, - TB, Requires<[HasSSE2]>; + PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStore] -def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), - (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>; - -def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), - (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>; } // AddedComplexity //===----------------------------------------------------------------------===// @@ -3490,17 +3777,23 @@ def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), IIC_SSE_PREFETCH>, TB; } -// FIXME: How should these memory instructions be modeled? +// FIXME: How should flush instruction be modeled? let SchedRW = [WriteLoad] in { // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>; +} +let SchedRW = [WriteNop] in { // Pause. This "instruction" is encoded as "rep; nop", so even though it // was introduced with SSE2, it's backward compatible. -def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP; +def PAUSE : I<0x90, RawFrm, (outs), (ins), + "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, + OBXS, Requires<[HasSSE2]>; +} +let SchedRW = [WriteFence] in { // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, @@ -3557,7 +3850,8 @@ def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), } // For Disassembler -let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove] in { def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, @@ -3621,7 +3915,7 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; // For Disassembler -let isCodeGenOnly = 1, hasSideEffects = 0 in { +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -3787,6 +4081,10 @@ defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, SSE_INTALUQ_ITINS_P, 1>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, SSE_INTMUL_ITINS_P, 1>; +defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1>; +defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1>; defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, SSE_INTALU_ITINS_P, 0>; defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, @@ -3821,10 +4119,6 @@ defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b, int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>; defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; -defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, - int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>; -defm PMULHW : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, - int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, @@ -4043,17 +4337,6 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, SSE_INTALU_ITINS_P, 0>; //===---------------------------------------------------------------------===// -// SSE2 - Packed Integer Pack Instructions -//===---------------------------------------------------------------------===// - -defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128, - int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>; -defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, - int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>; -defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128, - int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>; - -//===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// @@ -4111,12 +4394,12 @@ let Predicates = [UseSSE2] in { [(set VR128:$dst, (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, - Sched<[WriteShuffleLd]>; + Sched<[WriteShuffleLd, ReadAfterLd]>; } } } // ExeDomain = SSEPackedInt -defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize; +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD; defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS; defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD; @@ -4135,6 +4418,136 @@ let Predicates = [UseSSE2] in { } //===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (memopv2i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : SS48I<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : SS48I<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (memopv2i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : SS48I<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : SS48I<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, + bc_v8i16, 0>, VEX_4V; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, + bc_v4i32, 0>, VEX_4V; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, + bc_v8i16, 0>, VEX_4V; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, + bc_v4i32, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + bc_v8i32>, VEX_4V, VEX_L; + + defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + bc_v8i32>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, + bc_v8i16>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, + bc_v4i32>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, + bc_v8i16>; + + let Predicates = [HasSSE41] in + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, + bc_v4i32>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// // SSE2 - Packed Integer Unpack Instructions //===---------------------------------------------------------------------===// @@ -4269,7 +4682,7 @@ def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), - imm:$src2))]>, TB, OpSize, VEX, + imm:$src2))]>, PD, VEX, Sched<[WriteShuffle]>; def PEXTRWri : PDIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4280,10 +4693,10 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, // Insert let Predicates = [HasAVX] in -defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V; +defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in -defm PINSRW : sse2_pinsrw, TB, OpSize; +defm PINSRW : sse2_pinsrw, PD; } // ExeDomain = SSEPackedInt @@ -4320,7 +4733,7 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { -let Uses = [EDI], Predicates = [HasAVX,In32BitMode] in +let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", @@ -4333,7 +4746,7 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], IIC_SSE_MASKMOV>, VEX; -let Uses = [EDI], Predicates = [UseSSE2,In32BitMode] in +let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], @@ -4434,7 +4847,7 @@ def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, - VEX, Sched<[WriteLoad]>; + VEX, Sched<[WriteStore]>; def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -4444,7 +4857,7 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], - IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; @@ -4638,17 +5051,24 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), IIC_SSE_MOVDQ>; } // SchedRW +// For disassembler only +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteVecLogic] in { +def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX; +def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>; +} + //===---------------------------------------------------------------------===// // Store / copy lower 64-bits of a XMM register. // -def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), - "movq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX, - Sched<[WriteStore]>; -def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), - "movq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)], - IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +let Predicates = [UseAVX] in +def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), + (VMOVPQI2QImr addr:$dst, VR128:$src)>; +let Predicates = [UseSSE2] in +def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), + (MOVPQI2QImr addr:$dst, VR128:$src)>; let isCodeGenOnly = 1, AddedComplexity = 20 in { def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), @@ -4745,11 +5165,11 @@ multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (vt (OpNode RC:$src)))], - IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (OpNode (mem_frag addr:$src)))], - IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>; + IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; } let Predicates = [HasAVX] in { @@ -4805,13 +5225,13 @@ multiclass sse3_replicate_dfp<string OpcodeStr> { let neverHasSideEffects = 1 in def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; + [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (v2f64 (X86Movddup (scalar_to_vector (loadf64 addr:$src)))))], - IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>; + IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; } // FIXME: Merge with above classe when there're patterns for the ymm version @@ -4819,13 +5239,13 @@ multiclass sse3_replicate_dfp_y<string OpcodeStr> { def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, - Sched<[WriteShuffle]>; + Sched<[WriteFShuffle]>; def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (v4f64 (X86Movddup (scalar_to_vector (loadf64 addr:$src)))))]>, - Sched<[WriteShuffleLd]>; + Sched<[WriteLoad]>; } let Predicates = [HasAVX] in { @@ -4915,24 +5335,78 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V; + f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V; defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L; + f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; + f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V; defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L; + f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, - f128mem, SSE_ALU_F32P>, TB, XD; + f128mem, SSE_ALU_F32P>, XD; let ExeDomain = SSEPackedDouble in defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, - f128mem, SSE_ALU_F64P>, TB, OpSize; + f128mem, SSE_ALU_F64P>, PD; +} + +// Patterns used to select 'addsub' instructions. +let Predicates = [HasAVX] in { + // Constant 170 corresponds to the binary mask '10101010'. + // When used as a blend mask, it allows selecting eight elements from two + // input vectors as follow: + // - Even-numbered values in the destination are copied from + // the corresponding elements in the first input vector; + // - Odd-numbered values in the destination are copied from + // the corresponding elements in the second input vector. + + def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), + (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + + // Constant 10 corresponds to the binary mask '1010'. + // In the two pattens below, constant 10 is used as a blend mask to select + // - the 1st and 3rd element from the first input vector (the 'fsub' node); + // - the 2nd and 4th element from the second input vector (the 'fadd' node). + + def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), + (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), + (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), + (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), + (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), + (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; +} + +let Predicates = [UseSSE3] in { + // Constant 10 corresponds to the binary mask '1010'. + // In the pattern below, it is used as a blend mask to select: + // - the 1st and 3rd element from the first input vector (the fsub node); + // - the 2nd and 4th element from the second input vector (the fadd node). + + def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), + (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), + (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), + (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; } //===---------------------------------------------------------------------===// @@ -5019,7 +5493,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>, - OpSize, Sched<[WriteVecALU]>; + Sched<[WriteVecALU]>; def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -5027,7 +5501,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (IntId128 (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>, - OpSize, Sched<[WriteVecALULd]>; + Sched<[WriteVecALULd]>; } /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. @@ -5037,14 +5511,14 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 VR256:$src))]>, - OpSize, Sched<[WriteVecALU]>; + Sched<[WriteVecALU]>; def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 - (bitconvert (memopv4i64 addr:$src))))]>, OpSize, + (bitconvert (memopv4i64 addr:$src))))]>, Sched<[WriteVecALULd]>; } @@ -5164,7 +5638,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, - OpSize, Sched<[itins.Sched]>; + Sched<[itins.Sched]>; def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, @@ -5172,7 +5646,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize, + (bitconvert (memop_frag addr:$src2)))))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -5187,7 +5661,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize, Sched<[itins.Sched]>; + Sched<[itins.Sched]>; def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !if(Is2Addr, @@ -5195,24 +5669,25 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))]>, OpSize, + (bitconvert (memopv2i64 addr:$src2))))]>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId256> { + Intrinsic IntId256, + X86FoldableSchedWrite Sched> { let isCommutable = 1 in def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, - OpSize; + Sched<[Sched]>; def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (IntId256 VR256:$src1, - (bitconvert (loadv4i64 addr:$src2))))]>, OpSize; + (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, + Sched<[Sched.Folded, ReadAfterLd]>; } let ImmT = NoImm, Predicates = [HasAVX] in { @@ -5281,16 +5756,20 @@ let isCommutable = 0 in { SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256, loadv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + SSE_PSHUFB, 0>, VEX_4V, VEX_L; defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", - int_x86_avx2_phadd_sw>, VEX_4V, VEX_L; + int_x86_avx2_phadd_sw, + WriteVecALU>, VEX_4V, VEX_L; defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", - int_x86_avx2_phsub_sw>, VEX_4V, VEX_L; + int_x86_avx2_phsub_sw, + WriteVecALU>, VEX_4V, VEX_L; defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", - int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L; + int_x86_avx2_pmadd_ub_sw, + WriteVecIMul>, VEX_4V, VEX_L; } defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", - int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L; + int_x86_avx2_pmul_hr_sw, + WriteVecIMul>, VEX_4V, VEX_L; } // None of these have i8 immediate fields. @@ -5338,7 +5817,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNRR>, OpSize, Sched<[WriteShuffle]>; + [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>; let mayLoad = 1 in def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), @@ -5346,7 +5825,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNRM>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>; + [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>; } } @@ -5356,13 +5835,13 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { (ins VR256:$src1, VR256:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize, Sched<[WriteShuffle]>; + []>, Sched<[WriteShuffle]>; let mayLoad = 1 in def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>; + []>, Sched<[WriteShuffleLd, ReadAfterLd]>; } } @@ -5426,11 +5905,11 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", TB, Requires<[HasSSE3]>; } // SchedRW -def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>; +def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, - Requires<[In32BitMode]>; + Requires<[Not64BitMode]>; def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, Requires<[In64BitMode]>; @@ -5442,63 +5921,82 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId, OpndItins itins = DEFAULT_ITINS> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize; + [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, + Sched<[itins.Sched]>; def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], - itins.rm>, OpSize; + itins.rm>, Sched<[itins.Sched.Folded]>; } multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId> { + Intrinsic IntId, X86FoldableSchedWrite Sched> { def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId (load addr:$src)))]>, - OpSize; + Sched<[Sched.Folded]>; } let Predicates = [HasAVX] in { defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", - int_x86_sse41_pmovsxbw>, VEX; + int_x86_sse41_pmovsxbw, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", - int_x86_sse41_pmovsxwd>, VEX; + int_x86_sse41_pmovsxwd, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", - int_x86_sse41_pmovsxdq>, VEX; + int_x86_sse41_pmovsxdq, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", - int_x86_sse41_pmovzxbw>, VEX; + int_x86_sse41_pmovzxbw, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", - int_x86_sse41_pmovzxwd>, VEX; + int_x86_sse41_pmovzxwd, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", - int_x86_sse41_pmovzxdq>, VEX; + int_x86_sse41_pmovzxdq, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; } let Predicates = [HasAVX2] in { defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw", - int_x86_avx2_pmovsxbw>, VEX, VEX_L; + int_x86_avx2_pmovsxbw, + WriteShuffle>, VEX, VEX_L; defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd", - int_x86_avx2_pmovsxwd>, VEX, VEX_L; + int_x86_avx2_pmovsxwd, + WriteShuffle>, VEX, VEX_L; defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq", - int_x86_avx2_pmovsxdq>, VEX, VEX_L; + int_x86_avx2_pmovsxdq, + WriteShuffle>, VEX, VEX_L; defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw", - int_x86_avx2_pmovzxbw>, VEX, VEX_L; + int_x86_avx2_pmovzxbw, + WriteShuffle>, VEX, VEX_L; defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd", - int_x86_avx2_pmovzxwd>, VEX, VEX_L; + int_x86_avx2_pmovzxwd, + WriteShuffle>, VEX, VEX_L; defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", - int_x86_avx2_pmovzxdq>, VEX, VEX_L; -} - -defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, SSE_INTALU_ITINS_P>; -defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, SSE_INTALU_ITINS_P>; -defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, SSE_INTALU_ITINS_P>; -defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, SSE_INTALU_ITINS_P>; -defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, SSE_INTALU_ITINS_P>; -defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, SSE_INTALU_ITINS_P>; + int_x86_avx2_pmovzxdq, + WriteShuffle>, VEX, VEX_L; +} + +defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, + SSE_INTALU_ITINS_SHUFF_P>; +defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, + SSE_INTALU_ITINS_SHUFF_P>; +defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, + SSE_INTALU_ITINS_SHUFF_P>; +defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, + SSE_INTALU_ITINS_SHUFF_P>; +defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, + SSE_INTALU_ITINS_SHUFF_P>; +defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, + SSE_INTALU_ITINS_SHUFF_P>; let Predicates = [HasAVX] in { // Common patterns involving scalar load. @@ -5590,91 +6088,67 @@ let Predicates = [UseSSE41] in { (PMOVZXDQrm addr:$src)>; } -let Predicates = [HasAVX2] in { - let AddedComplexity = 15 in { - def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))), - (VPMOVZXDQYrr VR128:$src)>; - def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))), - (VPMOVZXWDYrr VR128:$src)>; - def : Pat<(v16i16 (X86vzmovly (v16i8 VR128:$src))), - (VPMOVZXBWYrr VR128:$src)>; - } - - def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; - def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; - def : Pat<(v16i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; -} - -let Predicates = [HasAVX] in { - def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; - def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; - def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>; -} - -let Predicates = [UseSSE41] in { - def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; - def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; - def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>; -} - - multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId, OpndItins itins = DEFAULT_ITINS> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize; + [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, + Sched<[itins.Sched]>; def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))], - itins.rm>, - OpSize; + itins.rm>, Sched<[itins.Sched.Folded]>; } multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId> { + Intrinsic IntId, X86FoldableSchedWrite Sched> { def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, - OpSize; + Sched<[Sched.Folded]>; } let Predicates = [HasAVX] in { -defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, - VEX; -defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, - VEX; -defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>, - VEX; -defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>, - VEX; +defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; +defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; +defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; +defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq, + DEFAULT_ITINS_SHUFFLESCHED>, VEX; } let Predicates = [HasAVX2] in { defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd", - int_x86_avx2_pmovsxbd>, VEX, VEX_L; + int_x86_avx2_pmovsxbd, WriteShuffle>, + VEX, VEX_L; defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq", - int_x86_avx2_pmovsxwq>, VEX, VEX_L; + int_x86_avx2_pmovsxwq, WriteShuffle>, + VEX, VEX_L; defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd", - int_x86_avx2_pmovzxbd>, VEX, VEX_L; + int_x86_avx2_pmovzxbd, WriteShuffle>, + VEX, VEX_L; defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", - int_x86_avx2_pmovzxwq>, VEX, VEX_L; + int_x86_avx2_pmovzxwq, WriteShuffle>, + VEX, VEX_L; } defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd, - SSE_INTALU_ITINS_P>; + SSE_INTALU_ITINS_SHUFF_P>; defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq, - SSE_INTALU_ITINS_P>; + SSE_INTALU_ITINS_SHUFF_P>; defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd, - SSE_INTALU_ITINS_P>; + SSE_INTALU_ITINS_SHUFF_P>; defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq, - SSE_INTALU_ITINS_P>; + SSE_INTALU_ITINS_SHUFF_P>; let Predicates = [HasAVX] in { // Common patterns involving scalar load @@ -5703,49 +6177,49 @@ let Predicates = [UseSSE41] in { } multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { + X86FoldableSchedWrite Sched> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; // Expecting a i16 load any extended to i32 value. def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>, - OpSize; + Sched<[Sched.Folded]>; } multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId> { + Intrinsic IntId, X86FoldableSchedWrite Sched> { def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; // Expecting a i16 load any extended to i32 value. def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, - OpSize; + Sched<[Sched.Folded]>; } let Predicates = [HasAVX] in { -defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, - VEX; -defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, - VEX; +defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq, + WriteShuffle>, VEX; +defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq, + WriteShuffle>, VEX; } let Predicates = [HasAVX2] in { -defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", - int_x86_avx2_pmovsxbq>, VEX, VEX_L; -defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", - int_x86_avx2_pmovzxbq>, VEX, VEX_L; +defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq, + WriteShuffle>, VEX, VEX_L; +defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq, + WriteShuffle>, VEX, VEX_L; } defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq, - SSE_INTALU_ITINS_P>; + WriteShuffle>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq, - SSE_INTALU_ITINS_P>; + WriteShuffle>; let Predicates = [HasAVX2] in { def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; @@ -5772,9 +6246,9 @@ let Predicates = [HasAVX2] in { def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))), (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))), + def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))), (VPMOVSXWDYrm addr:$src)>; - def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))), + def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))), (VPMOVSXDQYrm addr:$src)>; def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 @@ -6003,16 +6477,15 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, - OpSize; - let neverHasSideEffects = 1, mayStore = 1 in + Sched<[WriteShuffle]>; + let neverHasSideEffects = 1, mayStore = 1, + SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; -// FIXME: -// There's an AssertZext in the way of writing the store pattern -// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) + [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1), + imm:$src2)))), addr:$dst)]>; } let Predicates = [HasAVX] in @@ -6023,22 +6496,21 @@ defm PEXTRB : SS41I_extract8<0x14, "pextrb">; /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { - let isCodeGenOnly = 1, hasSideEffects = 0 in + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; + []>, Sched<[WriteShuffle]>; - let neverHasSideEffects = 1, mayStore = 1 in + let neverHasSideEffects = 1, mayStore = 1, + SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; -// FIXME: -// There's an AssertZext in the way of writing the store pattern -// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) + [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1), + imm:$src2)))), addr:$dst)]>; } let Predicates = [HasAVX] in @@ -6054,13 +6526,15 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32:$dst, - (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize; + (extractelt (v4i32 VR128:$src1), imm:$src2))]>, + Sched<[WriteShuffle]>; + let SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (v4i32 VR128:$src1), imm:$src2), - addr:$dst)]>, OpSize; + addr:$dst)]>; } let Predicates = [HasAVX] in @@ -6075,13 +6549,15 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR64:$dst, - (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W; + (extractelt (v2i64 VR128:$src1), imm:$src2))]>, + Sched<[WriteShuffle]>, REX_W; + let SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (v2i64 VR128:$src1), imm:$src2), - addr:$dst)]>, OpSize, REX_W; + addr:$dst)]>, REX_W; } let Predicates = [HasAVX] in @@ -6099,14 +6575,14 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))], - itins.rr>, - OpSize; + itins.rr>, Sched<[WriteFBlend]>; + let SchedRW = [WriteFBlendLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), - addr:$dst)], itins.rm>, OpSize; + addr:$dst)], itins.rm>; } let ExeDomain = SSEPackedSingle in { @@ -6139,7 +6615,8 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, OpSize; + (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, + Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6148,7 +6625,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), - imm:$src3))]>, OpSize; + imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in @@ -6165,7 +6642,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, - OpSize; + Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6174,7 +6651,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), - imm:$src3)))]>, OpSize; + imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in @@ -6191,7 +6668,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, - OpSize; + Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6200,7 +6677,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), - imm:$src3)))]>, OpSize; + imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in @@ -6221,8 +6698,8 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, - OpSize; + (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, + Sched<[WriteFShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3), !if(Is2Addr, @@ -6230,9 +6707,10 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86insrtps VR128:$src1, + (X86insertps VR128:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))], itins.rm>, OpSize; + imm:$src3))], itins.rm>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; } let ExeDomain = SSEPackedSingle in { @@ -6242,6 +6720,29 @@ let ExeDomain = SSEPackedSingle in { defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; } +let Predicates = [UseSSE41] in { + // If we're inserting an element from a load or a null pshuf of a load, + // fold the load into the insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32 + (scalar_to_vector (loadf32 addr:$src2))), (i8 0)), + imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd + (loadv4f32 addr:$src2), (i8 0)), imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + +let Predicates = [UseAVX] in { + // If we're inserting an element from a vbroadcast of a load, fold the + // load into the X86insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + //===----------------------------------------------------------------------===// // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// @@ -6258,8 +6759,7 @@ let ExeDomain = SSEPackedSingle in { !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], - IIC_SSE_ROUNDPS_REG>, - OpSize; + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; // Vector intrinsic operation, mem def PSm : SS4AIi8<opcps, MRMSrcMem, @@ -6268,8 +6768,7 @@ let ExeDomain = SSEPackedSingle in { "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V4F32Int (mem_frag32 addr:$src1),imm:$src2))], - IIC_SSE_ROUNDPS_MEM>, - OpSize; + IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>; } // ExeDomain = SSEPackedSingle let ExeDomain = SSEPackedDouble in { @@ -6279,8 +6778,7 @@ let ExeDomain = SSEPackedDouble in { !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], - IIC_SSE_ROUNDPS_REG>, - OpSize; + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; // Vector intrinsic operation, mem def PDm : SS4AIi8<opcpd, MRMSrcMem, @@ -6289,8 +6787,7 @@ let ExeDomain = SSEPackedDouble in { "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V2F64Int (mem_frag64 addr:$src1),imm:$src2))], - IIC_SSE_ROUNDPS_REG>, - OpSize; + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>; } // ExeDomain = SSEPackedDouble } @@ -6308,9 +6805,10 @@ let ExeDomain = GenericDomain in { "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - []>, OpSize; + []>, Sched<[WriteFAdd]>; // Intrinsic operation, reg. + let isCodeGenOnly = 1 in def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6319,7 +6817,7 @@ let ExeDomain = GenericDomain in { !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, - OpSize; + Sched<[WriteFAdd]>; // Intrinsic operation, mem. def SSm : SS4AIi8<opcss, MRMSrcMem, @@ -6331,7 +6829,7 @@ let ExeDomain = GenericDomain in { "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, - OpSize; + Sched<[WriteFAddLd, ReadAfterLd]>; // Operation, reg. let hasSideEffects = 0 in @@ -6342,9 +6840,10 @@ let ExeDomain = GenericDomain in { "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - []>, OpSize; + []>, Sched<[WriteFAdd]>; // Intrinsic operation, reg. + let isCodeGenOnly = 1 in def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6353,7 +6852,7 @@ let ExeDomain = GenericDomain in { !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, - OpSize; + Sched<[WriteFAdd]>; // Intrinsic operation, mem. def SDm : SS4AIi8<opcsd, MRMSrcMem, @@ -6365,7 +6864,7 @@ let ExeDomain = GenericDomain in { "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, - OpSize; + Sched<[WriteFAddLd, ReadAfterLd]>; } // ExeDomain = GenericDomain } @@ -6512,31 +7011,31 @@ let Defs = [EFLAGS], Predicates = [HasAVX] in { def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, - OpSize, VEX; + Sched<[WriteVecLogic]>, VEX; def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, - OpSize, VEX; + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, - OpSize, VEX, VEX_L; + Sched<[WriteVecLogic]>, VEX, VEX_L; def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, - OpSize, VEX, VEX_L; + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L; } let Defs = [EFLAGS] in { def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "ptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, - OpSize; + Sched<[WriteVecLogic]>; def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), "ptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, - OpSize; + Sched<[WriteVecLogicLd, ReadAfterLd]>; } // The bit test instructions below are AVX only @@ -6544,11 +7043,12 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> { def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), - [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX; + [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, + Sched<[WriteVecLogic]>, VEX; def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, - OpSize, VEX; + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; } let Defs = [EFLAGS], Predicates = [HasAVX] in { @@ -6572,56 +7072,65 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "popcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)], - IIC_SSE_POPCNT_RR>, - OpSize, XS; + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, + OpSize16, XS; def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "popcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctpop (loadi16 addr:$src))), - (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, OpSize, XS; + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, OpSize16, XS; def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "popcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)], - IIC_SSE_POPCNT_RR>, - XS; + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, + OpSize32, XS; + def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "popcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctpop (loadi32 addr:$src))), - (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS; + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, OpSize32, XS; def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "popcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)], - IIC_SSE_POPCNT_RR>, - XS; + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS; def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "popcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctpop (loadi64 addr:$src))), - (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS; + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, XS; } // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, - Intrinsic IntId128> { + Intrinsic IntId128, + X86FoldableSchedWrite Sched> { def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize; + [(set VR128:$dst, (IntId128 VR128:$src))]>, + Sched<[Sched]>; def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (IntId128 - (bitconvert (memopv2i64 addr:$src))))]>, OpSize; + (IntId128 (bitconvert (memopv2i64 addr:$src))))]>, + Sched<[Sched.Folded]>; } +// PHMIN has the same profile as PSAD, thus we use the same scheduling +// model, although the naming is misleading. let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", - int_x86_sse41_phminposuw>, VEX; + int_x86_sse41_phminposuw, + WriteVecIMul>, VEX; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", - int_x86_sse41_phminposuw>; + int_x86_sse41_phminposuw, + WriteVecIMul>; /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, @@ -6634,32 +7143,33 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], - itins.rr>, OpSize; + itins.rr>, Sched<[itins.Sched]>; def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))], - itins.rm>, OpSize; + (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } /// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId256> { + Intrinsic IntId256, + X86FoldableSchedWrite Sched> { let isCommutable = 1 in def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize; + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, + Sched<[Sched]>; def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (IntId256 VR256:$src1, - (bitconvert (loadv4i64 addr:$src2))))]>, OpSize; + (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, + Sched<[Sched.Folded, ReadAfterLd]>; } @@ -6667,75 +7177,114 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, X86MemOperand x86memop, bit Is2Addr = 1, - OpndItins itins = DEFAULT_ITINS> { + OpndItins itins = SSE_INTALU_ITINS_P> { let isCommutable = 1 in def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize; + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, + Sched<[itins.Sched]>; def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)))))]>, OpSize; + (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst +/// types. +multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, - 0>, VEX_4V; defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, - loadv2i64, i128mem, 0>, VEX_4V; - defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, - 0>, VEX_4V; + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, + VR128, loadv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; } let Predicates = [HasAVX2] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", - int_x86_avx2_packusdw>, VEX_4V, VEX_L; defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; - defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", - int_x86_avx2_pmul_dq>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, + VR256, loadv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { let isCommutable = 0 in - defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, @@ -6752,21 +7301,26 @@ let Constraints = "$src1 = $dst" in { memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, - 1, SSE_INTMUL_ITINS_P>; + defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1>; } let Predicates = [HasAVX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -6790,7 +7344,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, - OpSize; + Sched<[itins.Sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), !if(Is2Addr, @@ -6801,47 +7355,58 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>, - OpSize; + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, loadv4f32, f128mem, 0>, VEX_4V; + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_avx_blend_ps_256, VR256, loadv8f32, - f256mem, 0>, VEX_4V, VEX_L; + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, loadv2f64, f128mem, 0>, VEX_4V; + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_avx_blend_pd_256,VR256, loadv4f64, - f256mem, 0>, VEX_4V, VEX_L; + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, - VR128, loadv2i64, i128mem, 0>, VEX_4V; + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V; defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, loadv2i64, i128mem, 0>, VEX_4V; + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V; } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, loadv4f32, f128mem, 0>, VEX_4V; + VR128, loadv4f32, f128mem, 0, + SSE_DPPS_ITINS>, VEX_4V; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, loadv2f64, f128mem, 0>, VEX_4V; + VR128, loadv2f64, f128mem, 0, + SSE_DPPS_ITINS>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, loadv8f32, i256mem, 0>, VEX_4V, VEX_L; + VR256, loadv8f32, i256mem, 0, + SSE_DPPS_ITINS>, VEX_4V, VEX_L; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, - VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; } } @@ -6850,17 +7415,17 @@ let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, VR128, memopv4f32, f128mem, - 1, SSE_INTALU_ITINS_P>; + 1, SSE_INTALU_ITINS_FBLEND_P>; let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, VR128, memopv2f64, f128mem, - 1, SSE_INTALU_ITINS_P>; + 1, SSE_INTALU_ITINS_FBLEND_P>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_P>; + 1, SSE_INTALU_ITINS_FBLEND_P>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, VR128, memopv2i64, i128mem, - 1, SSE_INTMUL_ITINS_P>; + 1, SSE_MPSADBW_ITINS>; } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, @@ -6875,13 +7440,15 @@ let Constraints = "$src1 = $dst" in { /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, Intrinsic IntId> { + PatFrag mem_frag, Intrinsic IntId, + X86FoldableSchedWrite Sched> { def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], - NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + Sched<[Sched]>; def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), @@ -6890,29 +7457,36 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), RC:$src3))], - NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + Sched<[Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, - loadv2f64, int_x86_sse41_blendvpd>; + loadv2f64, int_x86_sse41_blendvpd, + WriteFVarBlend>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, - loadv4f64, int_x86_avx_blendv_pd_256>, VEX_L; + loadv4f64, int_x86_avx_blendv_pd_256, + WriteFVarBlend>, VEX_L; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, - loadv4f32, int_x86_sse41_blendvps>; + loadv4f32, int_x86_sse41_blendvps, + WriteFVarBlend>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, - loadv8f32, int_x86_avx_blendv_ps_256>, VEX_L; + loadv8f32, int_x86_avx_blendv_ps_256, + WriteFVarBlend>, VEX_L; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - loadv2i64, int_x86_sse41_pblendvb>; + loadv2i64, int_x86_sse41_pblendvb, + WriteVarBlend>; } let Predicates = [HasAVX2] in { defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - loadv4i64, int_x86_avx2_pblendvb>, VEX_L; + loadv4i64, int_x86_avx2_pblendvb, + WriteVarBlend>, VEX_L; } let Predicates = [HasAVX] in { @@ -6981,7 +7555,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], - itins.rr>, OpSize; + itins.rr>; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), @@ -6990,7 +7564,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (mem_frag addr:$src2)), XMM0))], - itins.rm>, OpSize; + itins.rm>; } } @@ -7046,20 +7620,21 @@ let Predicates = [UseSSE41] in { } +let SchedRW = [WriteLoad] in { let Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, - OpSize, VEX; + VEX; let Predicates = [HasAVX2] in def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, - OpSize, VEX, VEX_L; + VEX, VEX_L; def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movntdqa\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, - OpSize; + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; +} // SchedRW //===----------------------------------------------------------------------===// // SSE4.2 - Compare Instructions @@ -7074,15 +7649,14 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, - OpSize; + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>; def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize; + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>; } let Predicates = [HasAVX] in @@ -7122,12 +7696,12 @@ multiclass pcmpistrm_SS42AI<string asm> { def rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, OpSize; + []>, Sched<[WritePCmpIStrM]>; let mayLoad = 1 in def rm :SS42AI<0x62, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, OpSize; + []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>; } let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in { @@ -7157,12 +7731,12 @@ multiclass SS42AI_pcmpestrm<string asm> { def rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, OpSize; + []>, Sched<[WritePCmpEStrM]>; let mayLoad = 1 in def rm : SS42AI<0x60, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src3, i8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, OpSize; + []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>; } let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { @@ -7192,12 +7766,12 @@ multiclass SS42AI_pcmpistri<string asm> { def rr : SS42AI<0x63, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, OpSize; + []>, Sched<[WritePCmpIStrI]>; let mayLoad = 1 in def rm : SS42AI<0x63, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, OpSize; + []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>; } let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in { @@ -7228,12 +7802,12 @@ multiclass SS42AI_pcmpestri<string asm> { def rr : SS42AI<0x61, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, OpSize; + []>, Sched<[WritePCmpEStrI]>; let mayLoad = 1 in def rm : SS42AI<0x61, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src3, i8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, OpSize; + []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>; } let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { @@ -7255,14 +7829,15 @@ class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut, RegisterClass RCIn, SDPatternOperator Int> : SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2), !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), - [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>; + [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>, + Sched<[WriteFAdd]>; class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut, X86MemOperand x86memop, SDPatternOperator Int> : SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2), !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))], - IIC_CRC32_MEM>; + IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem, @@ -7270,13 +7845,13 @@ let Constraints = "$src1 = $dst" in { def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8, int_x86_sse42_crc32_32_8>; def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem, - int_x86_sse42_crc32_32_16>, OpSize; + int_x86_sse42_crc32_32_16>, OpSize16; def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16, - int_x86_sse42_crc32_32_16>, OpSize; + int_x86_sse42_crc32_32_16>, OpSize16; def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem, - int_x86_sse42_crc32_32_32>; + int_x86_sse42_crc32_32_32>, OpSize32; def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32, - int_x86_sse42_crc32_32_32>; + int_x86_sse42_crc32_32_32>, OpSize32; def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem, int_x86_sse42_crc32_64_64>, REX_W; def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64, @@ -7357,14 +7932,15 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize; + Sched<[WriteAESDecEnc]>; def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize; + (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, + Sched<[WriteAESDecEncLd, ReadAfterLd]>; } // Perform One Round of an AES Encryption/Decryption Flow @@ -7396,25 +7972,24 @@ let Predicates = [HasAVX, HasAES] in { (ins VR128:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", [(set VR128:$dst, - (int_x86_aesni_aesimc VR128:$src1))]>, - OpSize, VEX; + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, + VEX; def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, - OpSize, VEX; + Sched<[WriteAESIMCLd]>, VEX; } def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1), "aesimc\t{$src1, $dst|$dst, $src1}", [(set VR128:$dst, - (int_x86_aesni_aesimc VR128:$src1))]>, - OpSize; + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "aesimc\t{$src1, $dst|$dst, $src1}", [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, - OpSize; + Sched<[WriteAESIMCLd]>; // AES Round Key Generation Assist let Predicates = [HasAVX, HasAES] in { @@ -7423,26 +7998,26 @@ let Predicates = [HasAVX, HasAES] in { "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, - OpSize, VEX; + Sched<[WriteAESKeyGen]>, VEX; def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, - OpSize, VEX; + Sched<[WriteAESKeyGenLd]>, VEX; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, - OpSize; + Sched<[WriteAESKeyGen]>; def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, - OpSize; + Sched<[WriteAESKeyGenLd]>; //===----------------------------------------------------------------------===// // PCLMUL Instructions @@ -7453,13 +8028,15 @@ def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteCLMul]>; def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, - (loadv2i64 addr:$src2), imm:$src3))]>; + (loadv2i64 addr:$src2), imm:$src3))]>, + Sched<[WriteCLMulLd, ReadAfterLd]>; // Carry-less Multiplication instructions let Constraints = "$src1 = $dst" in { @@ -7468,31 +8045,34 @@ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], - IIC_SSE_PCLMULQDQ_RR>; + IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>; def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), imm:$src3))], - IIC_SSE_PCLMULQDQ_RM>; + IIC_SSE_PCLMULQDQ_RM>, + Sched<[WriteCLMulLd, ReadAfterLd]>; } // Constraints = "$src1 = $dst" multiclass pclmul_alias<string asm, int immop> { def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), - (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>; + (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>; def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), - (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>; + (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>; def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), - (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>; + (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop), + 0>; def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), - (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>; + (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop), + 0>; } defm : pclmul_alias<"hqhq", 0x11>; defm : pclmul_alias<"hqlq", 0x01>; @@ -7506,16 +8086,16 @@ defm : pclmul_alias<"lqlq", 0x00>; let Predicates = [HasSSE4A] in { let Constraints = "$src = $dst" in { -def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst), +def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, i8imm:$len, i8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, - imm:$idx))]>, TB, OpSize; + imm:$idx))]>, PD; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "extrq\t{$mask, $src|$src, $mask}", [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, - VR128:$mask))]>, TB, OpSize; + VR128:$mask))]>, PD; def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx), @@ -7547,43 +8127,59 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // destination operand // class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, Intrinsic Int> : + X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; + +class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag ld_frag, SchedWrite Sched> : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int addr:$src))]>, VEX; + [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[Sched]>, VEX { + let mayLoad = 1; +} // AVX2 adds register forms class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, - Intrinsic Int> : + Intrinsic Int, SchedWrite Sched> : AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int VR128:$src))]>, VEX; + [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, - int_x86_avx_vbroadcast_ss>; - def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcast_ss_256>, VEX_L; + def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, WriteLoad>; + def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, - int_x86_avx_vbroadcast_sd_256>, VEX_L; +def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, loadf64, WriteFShuffleLd>, VEX_L; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, - int_x86_avx_vbroadcastf128_pd_256>, VEX_L; + int_x86_avx_vbroadcastf128_pd_256, + WriteFShuffleLd>, VEX_L; let ExeDomain = SSEPackedSingle in { def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, - int_x86_avx2_vbroadcast_ss_ps>; + int_x86_avx2_vbroadcast_ss_ps, + WriteFShuffle>; def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L; + int_x86_avx2_vbroadcast_ss_ps_256, + WriteFShuffle256>, VEX_L; } let ExeDomain = SSEPackedDouble in def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L; + int_x86_avx2_vbroadcast_sd_pd_256, + WriteFShuffle256>, VEX_L; let Predicates = [HasAVX2] in def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, - int_x86_avx2_vbroadcasti128>, VEX_L; + int_x86_avx2_vbroadcasti128, WriteLoad>, + VEX_L; let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), @@ -7597,12 +8193,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V, VEX_L; + []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f128mem:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V, VEX_L; + []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } let Predicates = [HasAVX] in { @@ -7671,12 +8267,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), (ins VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX, VEX_L; + []>, Sched<[WriteFShuffle]>, VEX, VEX_L; let mayStore = 1 in def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), (ins f128mem:$dst, VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX, VEX_L; + []>, Sched<[WriteStore]>, VEX, VEX_L; } // AVX1 patterns @@ -7785,22 +8381,26 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V; + [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V, + Sched<[WriteFShuffle]>; def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop_i:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (IntVar RC:$src1, - (bitconvert (i_frag addr:$src2))))]>, VEX_4V; + (bitconvert (i_frag addr:$src2))))]>, VEX_4V, + Sched<[WriteFShuffleLd, ReadAfterLd]>; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX; + [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX, + Sched<[WriteFShuffle]>; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX; + (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX, + Sched<[WriteFShuffleLd]>; } let ExeDomain = SSEPackedSingle in { @@ -7841,12 +8441,14 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, VEX_4V, VEX_L; + (i8 imm:$src3))))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffle]>; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2), - (i8 imm:$src3)))]>, VEX_4V, VEX_L; + (i8 imm:$src3)))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { @@ -7888,11 +8490,11 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { // Zero All YMM registers def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", - [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>; + [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>; // Zero Upper bits of YMM registers def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", - [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>; + [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>; } //===----------------------------------------------------------------------===// @@ -7902,10 +8504,11 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (Int VR128:$src))]>, - T8, OpSize, VEX; + T8PD, VEX, Sched<[WriteCvtF2F]>; let neverHasSideEffects = 1, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, + Sched<[WriteCvtF2FLd]>; } multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { @@ -7913,12 +8516,13 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { (ins RC:$src1, i32i8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, - TA, OpSize, VEX; - let neverHasSideEffects = 1, mayStore = 1 in + TAPD, VEX, Sched<[WriteCvtF2F]>; + let neverHasSideEffects = 1, mayStore = 1, + SchedRW = [WriteCvtF2FLd, WriteRMW] in def mr : Ii8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src1, i32i8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TA, OpSize, VEX; + TAPD, VEX; } let Predicates = [HasF16C] in { @@ -7926,6 +8530,27 @@ let Predicates = [HasF16C] in { defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; + + // Pattern match vcvtph2ps of a scalar i64 load. + def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; +} + +// Patterns for matching conversions from float to half-float and vice versa. +let Predicates = [HasF16C] in { + def : Pat<(fp_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr + (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; } //===----------------------------------------------------------------------===// @@ -7942,7 +8567,7 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, - VEX_4V; + Sched<[WriteBlend]>, VEX_4V; def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), !strconcat(OpcodeStr, @@ -7950,7 +8575,7 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, - VEX_4V; + Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; } let isCommutable = 0 in { @@ -7976,19 +8601,22 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256> { def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int128 VR128:$src))]>, VEX; + [(set VR128:$dst, (Int128 VR128:$src))]>, + Sched<[WriteShuffle]>, VEX; def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; + (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, + Sched<[WriteLoad]>, VEX; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L; + [(set VR256:$dst, (Int256 VR128:$src))]>, + Sched<[WriteShuffle256]>, VEX, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, - VEX, VEX_L; + Sched<[WriteLoad]>, VEX, VEX_L; } defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, @@ -8063,6 +8691,31 @@ let Predicates = [HasAVX2] in { (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + + def : Pat<(v16i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VR128))>; + def : Pat<(v32i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBYrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VR128))>; + + def : Pat<(v8i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + VR128))>; + def : Pat<(v16i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWYrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + VR128))>; + + // The patterns for VPBROADCASTD are not needed because they would match + // the exact same thing as VBROADCASTSS patterns. + + def : Pat<(v2i64 (X86VBroadcast GR64:$src)), + (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + // The v4i64 pattern is not needed because VBROADCASTSDYrr already match. } } @@ -8077,13 +8730,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), } let Predicates = [HasAVX] in { -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. let AddedComplexity = 20 in { @@ -8124,7 +8770,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, - VEX_4V, VEX_L; + Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, @@ -8132,7 +8778,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, (bitconvert (mem_frag addr:$src2)))))]>, - VEX_4V, VEX_L; + Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>; @@ -8147,14 +8793,15 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, - VEX, VEX_L; + Sched<[WriteShuffle256]>, VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), - (i8 imm:$src2))))]>, VEX, VEX_L; + (i8 imm:$src2))))]>, + Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L; } defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W; @@ -8168,12 +8815,14 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, VEX_4V, VEX_L; + (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, + VEX_4V, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, i8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), - (i8 imm:$src3)))]>, VEX_4V, VEX_L; + (i8 imm:$src3)))]>, + Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; let Predicates = [HasAVX2] in { def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), @@ -8202,12 +8851,12 @@ let neverHasSideEffects = 1 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V, VEX_L; + []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i128mem:$src2, i8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V, VEX_L; + []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } let Predicates = [HasAVX2] in { @@ -8257,12 +8906,12 @@ def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, - VEX, VEX_L; + Sched<[WriteShuffle256]>, VEX, VEX_L; let neverHasSideEffects = 1, mayStore = 1 in def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), (ins i128mem:$dst, VR256:$src1, i8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX, VEX_L; + Sched<[WriteStore]>, VEX, VEX_L; let Predicates = [HasAVX2] in { def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), @@ -8347,27 +8996,27 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, - VEX_4V; + VEX_4V, Sched<[WriteVarVecShift]>; def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, - VEX_4V; + VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, Sched<[WriteVarVecShift]>; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; } defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; @@ -8395,12 +9044,18 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, let mayLoad = 1, Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { - defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; - defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; - defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; - defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W; defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W; defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>; defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; + } } |