summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Target/X86/X86InstrSSE.td
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86InstrSSE.td')
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSSE.td2029
1 files changed, 988 insertions, 1041 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
index f91764a..1812d01 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
InstrItinClass ri = arg_ri;
}
-
// scalar
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
@@ -259,26 +258,24 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
-multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
- string asm, string SSEVer, string FPSizeStr,
- Operand memopr, ComplexPattern mem_cpat,
- Domain d, OpndItins itins, bit Is2Addr = 1> {
-let isCodeGenOnly = 1 in {
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator Int, RegisterClass RC,
+ string asm, Operand memopr,
+ ComplexPattern mem_cpat, Domain d,
+ OpndItins itins, bit Is2Addr = 1> {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, RC:$src2))], itins.rr, d>,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
Sched<[itins.Sched]>;
+ let mayLoad = 1 in
def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
- SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+ [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -372,13 +369,9 @@ def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
(COPY_TO_REGCLASS FR32:$src, VR128)>;
-def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
- (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
(COPY_TO_REGCLASS FR64:$src, VR128)>;
-def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
- (COPY_TO_REGCLASS FR64:$src, VR128)>;
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
@@ -453,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
- [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
}
//===----------------------------------------------------------------------===//
@@ -512,6 +505,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string base_opc,
string asm_opr, Domain d = GenericDomain> {
+ let isCommutable = 1 in
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
!strconcat(base_opc, asm_opr),
@@ -590,6 +584,8 @@ let Predicates = [UseAVX] in {
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -609,6 +605,8 @@ let Predicates = [UseAVX] in {
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
@@ -697,6 +695,8 @@ let Predicates = [UseSSE1] in {
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
}
// Extract and store.
@@ -771,13 +771,12 @@ def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d,
- OpndItins itins,
- bit IsReMaterializable = 1> {
+ OpndItins itins> {
let hasSideEffects = 0 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
Sched<[WriteFShuffle]>;
-let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
@@ -795,7 +794,7 @@ defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
PS, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
PD, VEX;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
@@ -808,7 +807,7 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
PD, VEX, VEX_L;
}
@@ -825,7 +824,7 @@ defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
PD;
}
@@ -1028,7 +1027,7 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+let Predicates = [HasAVX, NoVLX] in {
// 128-bit load/store
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
@@ -1077,29 +1076,6 @@ let Predicates = [UseSSE1] in {
(MOVUPSmr addr:$dst, VR128:$src)>;
}
-// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
-// bits are disregarded. FIXME: Set encoding to pseudo!
-let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
-let isCodeGenOnly = 1 in {
- def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
- "movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
- IIC_SSE_MOVA_P_RM>, VEX;
- def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
- "movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
- IIC_SSE_MOVA_P_RM>, VEX;
- def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
- "movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
- IIC_SSE_MOVA_P_RM>;
- def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
- "movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
- IIC_SSE_MOVA_P_RM>;
-}
-}
-
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
@@ -1300,6 +1276,7 @@ let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
+
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1307,6 +1284,11 @@ let Predicates = [UseAVX] in {
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>;
+
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
@@ -1332,6 +1314,7 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
+
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1339,6 +1322,11 @@ let Predicates = [UseSSE2] in {
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>;
+
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
@@ -1371,6 +1359,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ let isCommutable = 1 in
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
@@ -1449,15 +1438,18 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
itins.rm>, Sched<[itins.Sched.Folded]>;
}
-multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm, Domain d,
- OpndItins itins> {
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
+ ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
+ string asm, Domain d, OpndItins itins> {
let hasSideEffects = 0 in {
- def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [], itins.rr, d>, Sched<[itins.Sched]>;
+ def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
+ [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
+ itins.rr, d>, Sched<[itins.Sched]>;
let mayLoad = 1 in
- def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
+ def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
+ [(set RC:$dst, (DstTy (sint_to_fp
+ (SrcTy (bitconvert (ld_frag addr:$src))))))],
+ itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}
@@ -1730,16 +1722,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_64>, XS, REX_W;
-defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, Requires<[HasAVX]>;
-defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
+ PS, VEX, Requires<[HasAVX, NoVLX]>;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, VEX_L, Requires<[HasAVX]>;
+ PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
PS, Requires<[UseSSE2]>;
@@ -1798,16 +1790,16 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
-def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fround FR64:$src))],
+ [(set FR32:$dst, (fpround FR64:$src))],
IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fround (loadf64 addr:$src)))],
+ [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
IIC_SSE_CVT_Scalar_RM>,
XD,
Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
@@ -1864,9 +1856,9 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
-def : Pat<(f64 (fextend FR32:$src)),
+def : Pat<(f64 (fpextend FR32:$src)),
(VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
-def : Pat<(fextend (loadf32 addr:$src)),
+def : Pat<(fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
def : Pat<(extloadf32 addr:$src),
@@ -1878,7 +1870,7 @@ def : Pat<(extloadf32 addr:$src),
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (fextend FR32:$src))],
+ [(set FR64:$dst, (fpextend FR32:$src))],
IIC_SSE_CVT_Scalar_RR>, XS,
Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
@@ -1887,12 +1879,12 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
IIC_SSE_CVT_Scalar_RM>, XS,
Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
-// extload f32 -> f64. This matches load+fextend because we have a hack in
+// extload f32 -> f64. This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
-// Since these loads aren't folded into the fextend, we have to match it
+// Since these loads aren't folded into the fpextend, we have to match it
// explicitly here.
-def : Pat<(fextend (loadf32 addr:$src)),
+def : Pat<(fpextend (loadf32 addr:$src)),
(CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
@@ -1930,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
}
} // isCodeGenOnly = 1
+// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
+// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
+// vmovs{s,d} instructions
+let Predicates = [UseAVX] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE2]
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE1]
+
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1962,134 +2027,98 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// Convert Packed Double FP to Packed DW Integers
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtF2I]>;
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
-def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
- Sched<[WriteCvtF2ILd]>;
+def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
+ Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
- Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
+ (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
-def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
}
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
+let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttps2dq VR128:$src))],
+ (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq
- (loadv4f32 addr:$src)))],
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
- (loadv8f32 addr:$src)))],
+ [(set VR256:$dst,
+ (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
+}
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ (v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
-let Predicates = [HasAVX] in {
- def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
- (VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
- (VCVTDQ2PSrm addr:$src)>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
- (VCVTDQ2PSrm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (VCVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2DQrm addr:$src)>;
-
- def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
- (VCVTDQ2PSYrr VR256:$src)>;
- def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
- (VCVTDQ2PSYrm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
- (VCVTTPS2DQYrr VR256:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2DQYrm addr:$src)>;
-}
-
-let Predicates = [UseSSE2] in {
- def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (CVTDQ2PSrr VR128:$src)>;
- def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (CVTDQ2PSrm addr:$src)>;
-
- def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
- (CVTDQ2PSrr VR128:$src)>;
- def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
- (CVTDQ2PSrm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (CVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
- (CVTTPS2DQrm addr:$src)>;
-}
-
+let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
@@ -2098,66 +2127,92 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
-def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
// YMM only
+let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+ (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+ (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
-def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+}
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
- (VCVTTPD2DQYrr VR256:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
- (VCVTTPD2DQYrm addr:$src)>;
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (VCVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (VCVTTPD2DQrr VR128:$src)>;
+ }
} // Predicates = [HasAVX]
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [UseSSE2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (CVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (CVTTPD2DQrr VR128:$src)>;
+ }
+} // Predicates = [UseSSE2]
// Convert packed single to packed double
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
@@ -2165,136 +2220,118 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
}
// Convert Packed DW Integers to Packed Double FP
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, Sched<[WriteCvtI2FLd]>;
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, Sched<[WriteCvtI2F]>;
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
+ [(set VR256:$dst,
+ (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, VEX_L, Sched<[WriteCvtI2F]>;
+ [(set VR256:$dst,
+ (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2F]>;
}
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
// AVX register conversion intrinsics
-let Predicates = [HasAVX] in {
- def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
- (VCVTDQ2PDrr VR128:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
- (VCVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTDQ2PDrm addr:$src)>;
-
- def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
- (VCVTDQ2PDYrr VR128:$src)>;
- def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
- (VCVTDQ2PDYrm addr:$src)>;
-} // Predicates = [HasAVX]
+} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
-let Predicates = [HasSSE2] in {
- def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
- (CVTDQ2PDrr VR128:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
- (CVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(CVTDQ2PDrm addr:$src)>;
-} // Predicates = [HasSSE2]
+} // Predicates = [UseSSE2]
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
+let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
(VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
-def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2psx\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
// YMM only
+let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (fpround VR256:$src))],
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
+ [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
-def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+}
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+ [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
-
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
-let Predicates = [HasAVX] in {
- def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
- (VCVTDQ2PSYrr VR256:$src)>;
- def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
- (VCVTDQ2PSYrm addr:$src)>;
-}
let Predicates = [HasAVX, NoVLX] in {
- // Match fround and fextend for 128/256-bit conversions
- def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ // Match fpround and fpextend for 128/256-bit conversions
+ let AddedComplexity = 15 in
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(VCVTPD2PSrr VR128:$src)>;
- def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
- (VCVTPD2PSXrm addr:$src)>;
- def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
- (VCVTPD2PSYrr VR256:$src)>;
- def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
- (VCVTPD2PSYrm addr:$src)>;
-
- def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
- (VCVTPS2PDrr VR128:$src)>;
- def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
- (VCVTPS2PDYrr VR128:$src)>;
- def : Pat<(v4f64 (extloadv4f32 addr:$src)),
- (VCVTPS2PDYrm addr:$src)>;
}
let Predicates = [UseSSE2] in {
- // Match fround and fextend for 128 conversions
- def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ // Match fpround and fpextend for 128 conversions
+ let AddedComplexity = 15 in
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(CVTPD2PSrr VR128:$src)>;
- def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
- (CVTPD2PSrm addr:$src)>;
-
- def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
- (CVTPS2PDrr VR128:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -2306,6 +2343,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
OpndItins itins, ImmLeaf immLeaf> {
+ let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
@@ -2351,9 +2389,9 @@ let Constraints = "$src1 = $dst" in {
SSE_ALU_F64S, i8immZExt3>, XD;
}
-multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
+multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
Intrinsic Int, string asm, OpndItins itins,
- ImmLeaf immLeaf> {
+ ImmLeaf immLeaf, ComplexPattern mem_cpat> {
def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
@@ -2361,30 +2399,30 @@ multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
itins.rr>,
Sched<[itins.Sched]>;
def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
+ (ins VR128:$src1, memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- (load addr:$src), immLeaf:$cc))],
+ mem_cpat:$src, immLeaf:$cc))],
itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let isCodeGenOnly = 1 in {
// Aliases to match intrinsics which expect XMM operand(s).
- defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
+ defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, i8immZExt5>,
+ SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
XS, VEX_4V;
- defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
+ defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, i8immZExt5>, // same latency as f32
+ SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
- defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
+ defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SSE_ALU_F32S, i8immZExt3>, XS;
- defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
+ SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+ defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SSE_ALU_F64S, i8immZExt3>,
+ SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
XD;
}
}
@@ -2407,6 +2445,23 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
+// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
+multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, Operand memop,
+ ComplexPattern mem_cpat, string OpcodeStr> {
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+ IIC_SSE_COMIS_RR>,
+ Sched<[WriteFAdd]>;
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ mem_cpat:$src2))],
+ IIC_SSE_COMIS_RM>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS, VEX, VEX_LIG;
@@ -2420,15 +2475,15 @@ let Defs = [EFLAGS] in {
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS, VEX;
- defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD, VEX;
-
- defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
- load, "comiss">, PS, VEX;
- defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
- load, "comisd">, PD, VEX;
+ defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS, VEX;
+ defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD, VEX;
+
+ defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS, VEX;
+ defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD, VEX;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS;
@@ -2443,15 +2498,15 @@ let Defs = [EFLAGS] in {
}
let isCodeGenOnly = 1 in {
- defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS;
- defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD;
-
- defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
- "comiss">, PS;
- defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
- "comisd">, PD;
+ defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS;
+ defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD;
+
+ defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS;
+ defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD;
}
} // Defs = [EFLAGS]
@@ -2641,7 +2696,8 @@ let Predicates = [UseSSE2] in {
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
PatFrag mem_frag, RegisterClass RC,
X86MemOperand x86memop, string asm,
- Domain d> {
+ Domain d, bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
def rr : PI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
@@ -2689,7 +2745,7 @@ let Constraints = "$src1 = $dst" in {
SSEPackedSingle>, PS;
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble>, PD;
+ SSEPackedDouble, 1>, PD;
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
SSEPackedSingle>, PS;
@@ -2810,84 +2866,6 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//
-// Multiclass for scalars using the X86 logical operation aliases for FP.
-multiclass sse12_fp_packed_scalar_logical_alias<
- bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
- defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
- PS, VEX_4V;
-
- defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
- PD, VEX_4V;
-
- let Constraints = "$src1 = $dst" in {
- defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
- f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
-
- defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
- f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
- }
-}
-
-let isCodeGenOnly = 1 in {
- defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
- SSE_BIT_ITINS_P>;
- defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
- SSE_BIT_ITINS_P>;
- defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
- SSE_BIT_ITINS_P>;
-
- let isCommutable = 0 in
- defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
- SSE_BIT_ITINS_P>;
-}
-
-// Multiclass for vectors using the X86 logical operation aliases for FP.
-multiclass sse12_fp_packed_vector_logical_alias<
- bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
- let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
- defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
- PS, VEX_4V;
-
- defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
- PD, VEX_4V;
-
- defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
- PS, VEX_4V, VEX_L;
-
- defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
- PD, VEX_4V, VEX_L;
- }
-
- let Constraints = "$src1 = $dst" in {
- defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
- v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>,
- PS;
-
- defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
- v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>,
- PD;
- }
-}
-
-let isCodeGenOnly = 1 in {
- defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand,
- SSE_BIT_ITINS_P>;
- defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for,
- SSE_BIT_ITINS_P>;
- defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor,
- SSE_BIT_ITINS_P>;
-
- let isCommutable = 0 in
- defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn,
- SSE_BIT_ITINS_P>;
-}
-
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
@@ -2895,7 +2873,8 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f256mem,
- [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+ (bc_v4i64 (v8f32 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
(loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
@@ -2907,12 +2886,10 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(loadv4i64 addr:$src2)))], 0>,
PD, VEX_4V, VEX_L;
- // In AVX no need to add a pattern for 128-bit logical rr ps, because they
- // are all promoted to v2i64, and the patterns are covered by the int
- // version. This is needed in SSE only, because v2i64 isn't supported on
- // SSE1, but only on SSE2.
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f128mem, [],
+ !strconcat(OpcodeStr, "ps"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4f32 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
(loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
@@ -2928,7 +2905,8 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
- [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4f32 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
(memopv2i64 addr:$src2)))]>, PS;
@@ -2947,19 +2925,124 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
-// AVX1 requires type coercions in order to fold loads directly into logical
-// operations.
+// If only AVX1 is supported, we need to handle integer operations with
+// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
- def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
+ (VANDPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
+ (VORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
+ (VXORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
+ (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
(VANDPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
(VORPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
(VXORPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
(VANDNPSYrm VR256:$src1, addr:$src2)>;
}
+let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VANDPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VXORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VANDNPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+
+ def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VANDPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VXORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VANDNPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ANDPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (XORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ANDNPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ANDPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (XORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ANDNPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+}
+
+// Patterns for packed operations when we don't have integer type available.
+def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
+ (ANDPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
+ (ORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
+ (XORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
+ (ANDNPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
+ (ANDPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
+ (ORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
+ (XORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
+ (ANDNPSrm VR128:$src1, addr:$src2)>;
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//
@@ -3025,20 +3108,22 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator IntSS,
+ SDPatternOperator IntSD,
SizeItins itins> {
- defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
- defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
let Constraints = "$src1 = $dst" in {
- defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, itins.s>, XS;
- defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, itins.d>, XD;
}
}
@@ -3046,23 +3131,29 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
+ SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
+ SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
+ SSE_ALU_ITINS_S>;
defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
+ SSE_DIV_ITINS_S>;
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
+ int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
+ int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
}
let isCodeGenOnly = 1 in {
@@ -3145,9 +3236,15 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
}
- // Repeat everything for AVX, except for the movss + scalar combo...
- // because that one shouldn't occur with AVX codegen?
- let Predicates = [HasAVX] in {
+ // Repeat everything for AVX.
+ let Predicates = [UseAVX] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
// extracted scalar math op with insert via blend
def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
@@ -3203,7 +3300,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
}
// Repeat everything for AVX.
- let Predicates = [HasAVX] in {
+ let Predicates = [UseAVX] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
@@ -3287,8 +3384,8 @@ def SSE_RCPS : OpndItins<
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop, Operand vec_memop,
- ComplexPattern mem_cpat, Intrinsic Intr,
+ X86MemOperand x86memop,
+ Intrinsic Intr,
SDNode OpNode, Domain d, OpndItins itins,
Predicate target, string Suffix> {
let hasSideEffects = 0 in {
@@ -3308,23 +3405,17 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let mayLoad = 1 in
- def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
let Predicates = [target] in {
- def : Pat<(vt (OpNode mem_cpat:$src)),
- (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
def : Pat<(Intr VR128:$src),
(!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
- def : Pat<(Intr (load addr:$src)),
- (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
- addr:$src), VR128))>;
}
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
@@ -3334,16 +3425,15 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// which has a clobber before the rcp, vs.
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr mem_cpat:$src),
+ def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
(!cast<Instruction>(NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
}
}
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop, Operand vec_memop,
- ComplexPattern mem_cpat,
+ X86MemOperand x86memop,
Intrinsic Intr, SDNode OpNode, Domain d,
OpndItins itins, string Suffix> {
let hasSideEffects = 0 in {
@@ -3361,7 +3451,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
[]>, Sched<[itins.Sched.Folded]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, vec_memop:$src2),
+ (ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -3382,21 +3472,18 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
let Predicates = [HasAVX] in {
def : Pat<(Intr VR128:$src),
- (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)),
+ (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
VR128:$src)>;
}
let Predicates = [HasAVX, OptForSize] in {
- def : Pat<(Intr mem_cpat:$src),
+ def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
def : Pat<(ScalarVT (OpNode (load addr:$src))),
(!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
- def : Pat<(vt (OpNode mem_cpat:$src)),
- (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
- mem_cpat:$src)>;
}
}
@@ -3475,11 +3562,10 @@ let Predicates = [HasAVX] in {
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins> {
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
- ssmem, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
- f32mem, ssmem, sse_load_f32,
+ f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
}
@@ -3487,11 +3573,10 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins> {
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
- sdmem, sse_load_f64,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
- f64mem, sdmem, sse_load_f64,
+ f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, "SD">,
XD, VEX_4V, VEX_LIG;
@@ -3805,13 +3890,14 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
}
let SchedRW = [WriteMove] in {
-let hasSideEffects = 0 in
+let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
@@ -3874,85 +3960,12 @@ def SSE_PMADD : OpndItins<
let ExeDomain = SSEPackedInt in { // SSE integer instructions
-multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
- RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0,
- bit Is2Addr = 1> {
- let isCommutable = IsCommutable in
- def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-}
-
-multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
- Intrinsic IntId256, OpndItins itins,
- bit IsCommutable = 0> {
-let Predicates = [HasAVX] in
- defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
- VR128, loadv2i64, i128mem, itins,
- IsCommutable, 0>, VEX_4V;
-
-let Constraints = "$src1 = $dst" in
- defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
- i128mem, itins, IsCommutable, 1>;
-
-let Predicates = [HasAVX2] in
- defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
- VR256, loadv4i64, i256mem, itins,
- IsCommutable, 0>, VEX_4V, VEX_L;
-}
-
-multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
- string OpcodeStr, SDNode OpNode,
- SDNode OpNode2, RegisterClass RC,
- ValueType DstVT, ValueType SrcVT,
- PatFrag ld_frag, ShiftOpndItins itins,
- bit Is2Addr = 1> {
- // src2 is always 128-bit
- def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, VR128:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
- itins.rr>, Sched<[WriteVecShift]>;
- def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, i128mem:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode RC:$src1,
- (SrcVT (bitconvert (ld_frag addr:$src2))))))], itins.rm>,
- Sched<[WriteVecShiftLd, ReadAfterLd]>;
- def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
- (ins RC:$src1, u8imm:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
- Sched<[WriteVecShift]>;
-}
-
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0, bit Is2Addr = 1> {
- let isCommutable = IsCommutable in
+ OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -3984,9 +3997,9 @@ defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
@@ -4022,184 +4035,141 @@ defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
-// Intrinsic forms
-defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
- int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
+ VR256, loadv4i64, i256mem, SSE_PMADD,
+ 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PMADD>;
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
VEX_4V;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
- loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
- memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
let Predicates = [HasAVX, NoVLX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
VEX_4V;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+ SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
- memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//
-let Predicates = [HasAVX, NoVLX] in {
-defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
- VR128, v4i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
- VR128, v2i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-
-defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
- VR128, v4i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
- VR128, v2i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-
-defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
- VR128, v4i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-} // Predicates = [HasAVX, NoVLX]
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, RegisterClass RC,
+ ValueType DstVT, ValueType SrcVT,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ // src2 is always 128-bit
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+ SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1,
+ (SrcVT (bitconvert (ld_frag addr:$src2))))))],
+ SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
+ SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
+}
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR128, v8i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR128, v8i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR128, v8i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
-
-
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] ,
- Predicates = [HasAVX, NoVLX_Or_NoBWI]in {
- // 128-bit logical shifts.
- def VPSLLDQri : PDIi8<0x73, MRM7r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
- VEX_4V;
- def VPSRLDQri : PDIi8<0x73, MRM3r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
- VEX_4V;
- // PSRADQri doesn't exist in SSE[1-3].
-} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
+multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, ValueType DstVT128,
+ ValueType DstVT256, ValueType SrcVT,
+ Predicate prd> {
+let Predicates = [HasAVX, prd] in
+ defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR128, DstVT128, SrcVT,
+ loadv2i64, 0>, VEX_4V;
+let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR256, DstVT256, SrcVT,
+ loadv2i64, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
+ VR128, DstVT128, SrcVT, memopv2i64>;
+}
-let Predicates = [HasAVX2, NoVLX] in {
-defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
- VR256, v8i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
- VR256, v4i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-
-defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
- VR256, v8i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
- VR256, v4i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-
-defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
- VR256, v8i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-}// Predicates = [HasAVX2, NoVLX]
+multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode, RegisterClass RC, ValueType VT,
+ bit Is2Addr = 1> {
+ def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
+ IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
+}
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
-defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR256, v16i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR256, v16i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR256, v16i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-}// Predicates = [HasAVX2, NoVLX_Or_NoBWI]
-
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 ,
- Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- // 256-bit logical shifts.
- def VPSLLDQYri : PDIi8<0x73, MRM7r,
- (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
- "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR256:$dst,
- (v32i8 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
- VEX_4V, VEX_L;
- def VPSRLDQYri : PDIi8<0x73, MRM3r,
- (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
- "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR256:$dst,
- (v32i8 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
- VEX_4V, VEX_L;
- // PSRADQYri doesn't exist in SSE[1-3].
-} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
+multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode> {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+ defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR128, v16i8, 0>, VEX_4V;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+ defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR256, v32i8, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
+}
-let Constraints = "$src1 = $dst" in {
-defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
- VR128, v8i16, v8i16, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
- VR128, v4i32, v4i32, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
- VR128, v2i64, v2i64, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-
-defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
- VR128, v8i16, v8i16, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
- VR128, v4i32, v4i32, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
- VR128, v2i64, v2i64, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-
-defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
- VR128, v8i16, v8i16, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
- VR128, v4i32, v4i32, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
- // 128-bit logical shifts.
- def PSLLDQri : PDIi8<0x73, MRM7r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "pslldq\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_INTSHDQ_P_RI>;
- def PSRLDQri : PDIi8<0x73, MRM3r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "psrldq\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_INTSHDQ_P_RI>;
+let ExeDomain = SSEPackedInt in {
+ defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+ v4i32, v8i32, v4i32, NoVLX>;
+ defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+ v2i64, v4i64, v2i64, NoVLX>;
+
+ defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+ v4i32, v8i32, v4i32, NoVLX>;
+ defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+ v2i64, v4i64, v2i64, NoVLX>;
+
+ defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+ v4i32, v8i32, v4i32, NoVLX>;
+
+ defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
+ defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
// PSRADQri doesn't exist in SSE[1-3].
-}
-} // Constraints = "$src1 = $dst"
+} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
@@ -4651,6 +4621,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
+let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4701,11 +4672,12 @@ def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))],
@@ -4725,11 +4697,12 @@ let isCodeGenOnly = 1 in {
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
+let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128:$src),
@@ -4751,6 +4724,7 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
[(store (i32 (extractelt (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
@@ -4767,6 +4741,7 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
+let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4791,11 +4766,12 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in
def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4822,12 +4798,12 @@ let isCodeGenOnly = 1 in {
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))],
@@ -4844,7 +4820,7 @@ let isCodeGenOnly = 1 in {
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
let Predicates = [UseAVX] in {
let AddedComplexity = 15 in {
@@ -4867,9 +4843,13 @@ let Predicates = [UseAVX] in {
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
@@ -4892,6 +4872,8 @@ let Predicates = [UseSSE2] in {
(MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (MOVDI2PDIrm addr:$src)>;
}
}
@@ -4960,43 +4942,30 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
-//===---------------------------------------------------------------------===//
-// Store / copy lower 64-bits of a XMM register.
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
-def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
- (loadi64 addr:$src))))))],
- IIC_SSE_MOVDQ>,
- XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
-
-def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
- (loadi64 addr:$src))))))],
- IIC_SSE_MOVDQ>,
- XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
-} // ExeDomain, isCodeGenOnly, AddedComplexity
-
let Predicates = [UseAVX], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VMOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (VMOVZQI2PQIrm addr:$src)>;
+ (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVZQI2PQIrm addr:$src)>;
+ (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
}
let Predicates = [UseSSE2], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (MOVZQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+ (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
}
//===---------------------------------------------------------------------===//
@@ -5018,24 +4987,6 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
-let AddedComplexity = 20 in
-def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl
- (loadv2i64 addr:$src))))],
- IIC_SSE_MOVDQ>,
- XS, VEX, Requires<[UseAVX]>;
-let AddedComplexity = 20 in {
-def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl
- (loadv2i64 addr:$src))))],
- IIC_SSE_MOVDQ>,
- XS, Requires<[UseSSE2]>;
-}
-} // ExeDomain, isCodeGenOnly, SchedRW
-
let AddedComplexity = 20 in {
let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -5167,12 +5118,12 @@ let Predicates = [HasAVX] in {
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
-let Predicates = [UseAVX, OptForSize] in {
- def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (VMOVDDUPrm addr:$src)>;
- def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
- (VMOVDDUPrm addr:$src)>;
-}
+let Predicates = [HasAVX, NoVLX] in
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
+let Predicates = [HasAVX1Only] in
+def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
let Predicates = [UseSSE3] in {
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
@@ -5370,35 +5321,35 @@ let Constraints = "$src1 = $dst" in {
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
SDNode OpNode, PatFrag ld_frag> {
- def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (vt (OpNode VR128:$src)))],
- IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
+ def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (vt (OpNode VR128:$src)))],
+ IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
- def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins i128mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst,
- (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
- IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
+ IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
}
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
SDNode OpNode> {
- def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
- Sched<[WriteVecALU]>;
+ def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
+ Sched<[WriteVecALU]>;
- def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins i256mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst,
- (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
- Sched<[WriteVecALULd]>;
+ def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+ Sched<[WriteVecALULd]>;
}
// Helper fragments to match sext vXi1 to vXiY.
@@ -5419,19 +5370,21 @@ let Predicates = [HasAVX, NoVLX] in {
defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(xor
(bc_v2i64 (v16i1sextv16i8)),
(bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (VPABSBrr128 VR128:$src)>;
+ (VPABSBrr VR128:$src)>;
def : Pat<(xor
(bc_v2i64 (v8i1sextv8i16)),
(bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (VPABSWrr128 VR128:$src)>;
+ (VPABSWrr VR128:$src)>;
+}
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(xor
(bc_v2i64 (v4i1sextv4i32)),
(bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (VPABSDrr128 VR128:$src)>;
+ (VPABSDrr VR128:$src)>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
@@ -5442,19 +5395,21 @@ let Predicates = [HasAVX2, NoVLX] in {
defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(xor
(bc_v4i64 (v32i1sextv32i8)),
(bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
- (VPABSBrr256 VR256:$src)>;
+ (VPABSBYrr VR256:$src)>;
def : Pat<(xor
(bc_v4i64 (v16i1sextv16i16)),
(bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
- (VPABSWrr256 VR256:$src)>;
+ (VPABSWYrr VR256:$src)>;
+}
+let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(xor
(bc_v4i64 (v8i1sextv8i32)),
(bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
- (VPABSDrr256 VR256:$src)>;
+ (VPABSDYrr VR256:$src)>;
}
defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
@@ -5465,15 +5420,15 @@ let Predicates = [UseSSSE3] in {
def : Pat<(xor
(bc_v2i64 (v16i1sextv16i8)),
(bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (PABSBrr128 VR128:$src)>;
+ (PABSBrr VR128:$src)>;
def : Pat<(xor
(bc_v2i64 (v8i1sextv8i16)),
(bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (PABSWrr128 VR128:$src)>;
+ (PABSWrr VR128:$src)>;
def : Pat<(xor
(bc_v2i64 (v4i1sextv4i32)),
(bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (PABSDrr128 VR128:$src)>;
+ (PABSDrr VR128:$src)>;
}
//===---------------------------------------------------------------------===//
@@ -5506,16 +5461,16 @@ def SSE_PMULHRSW : OpndItins<
/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, OpndItins itins,
- bit Is2Addr = 1> {
+ ValueType DstVT, ValueType OpVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>,
Sched<[itins.Sched]>;
def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
@@ -5523,7 +5478,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1,
+ (DstVT (OpNode (OpVT RC:$src1),
(bitconvert (memop_frag addr:$src2)))))], itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -5568,18 +5523,32 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
Sched<[Sched.Folded, ReadAfterLd]>;
}
+let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
+ VR128, loadv2i64, i128mem,
+ SSE_PSHUFB, 0>, VEX_4V;
+ defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, loadv2i64, i128mem,
+ SSE_PMADD, 0>, VEX_4V;
+}
+defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, loadv2i64, i128mem,
+ SSE_PMULHRSW, 0>, VEX_4V;
+}
+
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
- defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
+ defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBW, 0>, VEX_4V;
- defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
+ defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
- defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
+ defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBW, 0>, VEX_4V;
- defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
+ defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
@@ -5591,36 +5560,41 @@ let isCommutable = 0 in {
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
SSE_PSIGN, loadv2i64, 0>, VEX_4V;
- defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
- loadv2i64, i128mem,
- SSE_PSHUFB, 0>, VEX_4V;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
- defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
- int_x86_ssse3_pmadd_ub_sw_128,
- SSE_PMADD, loadv2i64, 0>, VEX_4V;
}
-defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
- int_x86_ssse3_pmul_hr_sw_128,
- SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
+ VR256, loadv4i64, i256mem,
+ SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+ defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
+ v32i8, VR256, loadv4i64, i256mem,
+ SSE_PMADD, 0>, VEX_4V, VEX_L;
+}
+defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
+ SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
- defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
- loadv4i64, i256mem,
+ defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
+ defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
- loadv4i64, i256mem,
+ defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
+ defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
@@ -5629,34 +5603,25 @@ let isCommutable = 0 in {
WriteVecALU>, VEX_4V, VEX_L;
defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
WriteVecALU>, VEX_4V, VEX_L;
- defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
- loadv4i64, i256mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_L;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw,
WriteVecALU>, VEX_4V, VEX_L;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw,
WriteVecALU>, VEX_4V, VEX_L;
- defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
- int_x86_avx2_pmadd_ub_sw,
- WriteVecIMul>, VEX_4V, VEX_L;
}
-defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
- int_x86_avx2_pmul_hr_sw,
- WriteVecIMul>, VEX_4V, VEX_L;
}
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
- defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
+ defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
memopv2i64, i128mem, SSE_PHADDSUBW>;
- defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
+ defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
memopv2i64, i128mem, SSE_PHADDSUBD>;
- defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
+ defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
memopv2i64, i128mem, SSE_PHADDSUBW>;
- defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
+ defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
memopv2i64, i128mem, SSE_PHADDSUBD>;
defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
SSE_PSIGN, memopv2i64>;
@@ -5664,7 +5629,7 @@ let isCommutable = 0 in {
SSE_PSIGN, memopv2i64>;
defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
SSE_PSIGN, memopv2i64>;
- defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
+ defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
memopv2i64, i128mem, SSE_PSHUFB>;
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
int_x86_ssse3_phadd_sw_128,
@@ -5672,13 +5637,12 @@ let isCommutable = 0 in {
defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
int_x86_ssse3_phsub_sw_128,
SSE_PHADDSUBSW, memopv2i64>;
- defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
- int_x86_ssse3_pmadd_ub_sw_128,
- SSE_PMADD, memopv2i64>;
+ defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, memopv2i64, i128mem,
+ SSE_PMADD>;
}
-defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
- int_x86_ssse3_pmul_hr_sw_128,
- SSE_PMULHRSW, memopv2i64>;
+defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
}
//===---------------------------------------------------------------------===//
@@ -5895,8 +5859,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
}
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5923,8 +5885,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
@@ -5941,8 +5901,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
}
}
@@ -6342,10 +6300,10 @@ let Predicates = [UseAVX] in {
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
-multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
- X86MemOperand x86memop, RegisterClass RC,
- PatFrag mem_frag32, PatFrag mem_frag64,
- Intrinsic V4F32Int, Intrinsic V2F64Int> {
+multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag32, PatFrag mem_frag64,
+ Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
@@ -6386,24 +6344,73 @@ let ExeDomain = SSEPackedDouble in {
} // ExeDomain = SSEPackedDouble
}
-multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr,
- Intrinsic F32Int,
- Intrinsic F64Int, bit Is2Addr = 1> {
-let ExeDomain = GenericDomain in {
- // Operation, reg.
- let hasSideEffects = 0 in
+multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAdd]>;
- // Intrinsic operation, reg.
- let isCodeGenOnly = 1 in
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6414,8 +6421,7 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;
- // Intrinsic operation, mem.
- def SSm : SS4AIi8<opcss, MRMSrcMem,
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -6426,19 +6432,6 @@ let ExeDomain = GenericDomain in {
(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
- // Operation, reg.
- let hasSideEffects = 0 in
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, Sched<[WriteFAdd]>;
-
- // Intrinsic operation, reg.
- let isCodeGenOnly = 1 in
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6449,8 +6442,7 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;
- // Intrinsic operation, mem.
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -6460,23 +6452,24 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst,
(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
-} // ExeDomain = GenericDomain
+} // ExeDomain = GenericDomain, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
// Intrinsic form
- defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
- loadv4f32, loadv2f64,
- int_x86_sse41_round_ps,
- int_x86_sse41_round_pd>, VEX;
- defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
- loadv8f32, loadv4f64,
- int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX, VEX_L;
- defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
- int_x86_sse41_round_ss,
- int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
+ loadv4f32, loadv2f64,
+ int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>, VEX;
+ defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
+ loadv8f32, loadv4f64,
+ int_x86_avx_round_ps_256,
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
+ int_x86_sse41_round_ss,
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
let Predicates = [UseAVX] in {
@@ -6548,34 +6541,37 @@ let Predicates = [HasAVX] in {
(VROUNDYPDr VR256:$src, (i32 0xB))>;
}
-defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
- memopv4f32, memopv2f64,
- int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
+ memopv4f32, memopv2f64, int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>;
+
+defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+ (ROUNDSSr FR32:$src, (i32 0x9))>;
def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ (ROUNDSDr FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ (ROUNDSSr FR32:$src, (i32 0xC))>;
def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ (ROUNDSDr FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+ (ROUNDSSr FR32:$src, (i32 0xA))>;
def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ (ROUNDSDr FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ (ROUNDSSr FR32:$src, (i32 0x4))>;
def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ (ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+ (ROUNDSSr FR32:$src, (i32 0xB))>;
def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+ (ROUNDSDr FR64:$src, (i32 0xB))>;
def : Pat<(v4f32 (ffloor VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0x9))>;
@@ -6867,10 +6863,10 @@ let Constraints = "$src1 = $dst" in {
let Predicates = [HasAVX, NoVLX] in {
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
- memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
+ loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
VEX_4V;
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
- memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
}
let Predicates = [HasAVX2] in {
@@ -7029,22 +7025,22 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_frag, Intrinsic IntId,
X86FoldableSchedWrite Sched> {
- def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
+ def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
Sched<[Sched]>;
- def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
+ def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
Sched<[Sched.Folded, ReadAfterLd]>;
}
@@ -7139,17 +7135,6 @@ let Predicates = [UseAVX] in {
(VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
}
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0),
- (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
- sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0),
- (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
- sub_xmm)>;
-
// These will incur an FP/int domain crossing penalty, but it may be the only
// way without AVX2. Do not add any complexity because we may be able to match
// more optimal patterns defined earlier in this file.
@@ -7744,6 +7729,7 @@ defm : pclmul_alias<"lqlq", 0x00>;
let Predicates = [HasSSE4A] in {
+let ExeDomain = SSEPackedInt in {
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
@@ -7767,6 +7753,7 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
VR128:$mask))]>, XD;
}
+} // ExeDomain = SSEPackedInt
// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
@@ -7832,23 +7819,50 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;
+//===----------------------------------------------------------------------===//
+// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
+// halves of a 256-bit vector.
+//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
(ins i128mem:$src),
"vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteLoad]>, VEX, VEX_L;
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
(ins f128mem:$src),
- "vbroadcastf128\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
+ "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteFShuffleLd]>, VEX, VEX_L;
-let Predicates = [HasAVX] in
-def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
+ (VBROADCASTF128 addr:$src)>;
+}
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+}
//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
@@ -7865,63 +7879,29 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
+ PatFrag memop_frag> {
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
(iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
+ (From (bitconvert (memop_frag addr:$src2))),
+ (iPTR imm)),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
-def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+let Predicates = [HasAVX, NoVLX] in {
+ defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
+ defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
- (bc_v4i32 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
- (bc_v16i8 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
- (bc_v8i16 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+ defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>;
}
//===----------------------------------------------------------------------===//
@@ -7939,61 +7919,28 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
[]>, Sched<[WriteStore]>, VEX, VEX_L;
}
+multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
+ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (To (!cast<Instruction>(InstrStr#rr)
+ (From VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+ def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v4f32 (VEXTRACTF128rr
- (v8f32 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v2f64 (VEXTRACTF128rr
- (v4f64 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-
-def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
+ defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
+ defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v2i64 (VEXTRACTF128rr
- (v4i64 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v4i32 (VEXTRACTF128rr
- (v8i32 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v8i16 (VEXTRACTF128rr
- (v16i16 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v16i8 (VEXTRACTF128rr
- (v32i8 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-
-def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
+ defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}
//===----------------------------------------------------------------------===//
@@ -8239,7 +8186,7 @@ let Predicates = [HasF16C] in {
}
// Patterns for matching conversions from float to half-float and vice versa.
-let Predicates = [HasF16C] in {
+let Predicates = [HasF16C, NoVLX] in {
// Use MXCSR.RC for rounding instead of explicitly specifying the default
// rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
// configurations we support (the default). However, falling back to MXCSR is
@@ -8334,7 +8281,7 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
v2i64, v4i64, NoVLX>;
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -8347,7 +8294,9 @@ let Predicates = [HasAVX2] in {
def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
+}
+let Predicates = [HasAVX2] in {
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
@@ -8361,36 +8310,38 @@ let Predicates = [HasAVX2] in {
let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
- let AddedComplexity = 20 in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
(VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
- }
}
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
(VPBROADCASTBrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit)),
VR128))>;
def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
(VPBROADCASTBYrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit)),
VR128))>;
def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
(VPBROADCASTWrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit)),
VR128))>;
def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
(VPBROADCASTWYrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit)),
VR128))>;
}
-let Predicates = [HasAVX2, NoVLX], AddedComplexity = 20 in {
+let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
(VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
@@ -8418,13 +8369,13 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
-let Predicates = [HasAVX], AddedComplexity = 20 in {
+let Predicates = [HasAVX, NoVLX] in {
// 128bit broadcasts:
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
-let Predicates = [HasAVX, NoVLX], AddedComplexity = 20 in {
+let Predicates = [HasAVX1Only] in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
(VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
@@ -8560,42 +8511,10 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
}
let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
- (bc_v4i32 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
- (bc_v16i8 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
- (bc_v8i16 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+ defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>;
}
//===----------------------------------------------------------------------===//
@@ -8612,39 +8531,10 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
Sched<[WriteStore]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v2i64 (VEXTRACTI128rr
- (v4i64 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v4i32 (VEXTRACTI128rr
- (v8i32 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v8i16 (VEXTRACTI128rr
- (v16i16 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v16i8 (VEXTRACTI128rr
- (v32i8 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-
-def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
+ defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
//===----------------------------------------------------------------------===//
@@ -8689,12 +8579,12 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
(!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
// masked load
- def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
(VT (bitconvert (ZeroVT immAllZerosV))))),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
(!cast<Instruction>(BlendStr#"rr")
RC:$src0,
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
@@ -8719,6 +8609,51 @@ let Predicates = [HasAVX2] in {
defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
+
+//===----------------------------------------------------------------------===//
+// SubVector Broadcasts
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v16i8 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2f64 VR128:$src), 1)>;
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4f32 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v16i8 VR128:$src), 1)>;
+}
+
//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
@@ -8758,23 +8693,35 @@ let Predicates = [HasAVX2, NoVLX] in {
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
- let isCodeGenOnly = 1 in
- defm VPSRAVD_Int : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
+
+ def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
+ (VPSRAVDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86vsrav VR128:$src1,
+ (bitconvert (loadv2i64 addr:$src2)))),
+ (VPSRAVDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
+ (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86vsrav VR256:$src1,
+ (bitconvert (loadv4i64 addr:$src2)))),
+ (VPSRAVDYrm VR256:$src1, addr:$src2)>;
}
+
+
+
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
X86MemOperand memop128, X86MemOperand memop256> {
- def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+ def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX_4VOp3;
- def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+ []>, VEX;
+ def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX_4VOp3, VEX_L;
+ []>, VEX, VEX_L;
}
let mayLoad = 1, hasSideEffects = 0, Constraints
OpenPOWER on IntegriCloud