Diffstat (limited to 'contrib/llvm/lib/Target/X86/README-SSE.txt')
-rw-r--r--  contrib/llvm/lib/Target/X86/README-SSE.txt | 184
1 file changed, 32 insertions(+), 152 deletions(-)
diff --git a/contrib/llvm/lib/Target/X86/README-SSE.txt b/contrib/llvm/lib/Target/X86/README-SSE.txt
index e5f84e8..b6aba93 100644
--- a/contrib/llvm/lib/Target/X86/README-SSE.txt
+++ b/contrib/llvm/lib/Target/X86/README-SSE.txt
@@ -36,62 +36,6 @@ The pattern isel got this one right.
//===---------------------------------------------------------------------===//
-SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
-like this:
-
- X += y
-
-and the register allocator decides to spill X, it is cheaper to emit this as:
-
-Y += [xslot]
-store Y -> [xslot]
-
-than as:
-
-tmp = [xslot]
-tmp += y
-store tmp -> [xslot]
-
-...and this uses one fewer register (so this should be done at load-folding
-time, not at spiller time). *Note*, however, that this can only be done
-if Y is dead. Here's a testcase:
-
-@.str_3 = external global [15 x i8]
-declare void @printf(i32, ...)
-define void @main() {
-build_tree.exit:
- br label %no_exit.i7
-
-no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
- %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
- [ %tmp.34.i18, %no_exit.i7 ]
- %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
- [ %tmp.28.i16, %no_exit.i7 ]
- %tmp.28.i16 = fadd double %tmp.0.0.0.i10, 0.000000e+00
- %tmp.34.i18 = fadd double %tmp.0.1.0.i9, 0.000000e+00
- br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
-
-Compute_Tree.exit23: ; preds = %no_exit.i7
- tail call void (i32, ...)* @printf( i32 0 )
- store double %tmp.34.i18, double* null
- ret void
-}
-
-We currently emit:
-
-.BBmain_1:
- xorpd %XMM1, %XMM1
- addsd %XMM0, %XMM1
-*** movsd %XMM2, QWORD PTR [%ESP + 8]
-*** addsd %XMM2, %XMM1
-*** movsd QWORD PTR [%ESP + 8], %XMM2
- jmp .BBmain_1 # no_exit.i7
-
-This is a bugpoint-reduced testcase, which is why it doesn't make much
-sense (e.g. it's an infinite loop). :)
-
-//===---------------------------------------------------------------------===//
-
SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:
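A minimal sketch of the idiom in C with SSE2 intrinsics (select_gt is a
made-up helper, not code from this tree):

  #include <emmintrin.h>

  /* Branchless select: r[i] = a[i] > b[i] ? x[i] : y[i].
     pcmpgtd yields an all-ones / all-zeros mask per element;
     pand, pandn, and por then merge x and y without a branch. */
  __m128i select_gt(__m128i a, __m128i b, __m128i x, __m128i y) {
    __m128i m = _mm_cmpgt_epi32(a, b);            /* pcmpgtd */
    return _mm_or_si128(_mm_and_si128(m, x),      /* pand, por */
                        _mm_andnot_si128(m, y));  /* pandn */
  }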
@@ -122,12 +66,6 @@ LBB_X_2:
//===---------------------------------------------------------------------===//
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
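For reference, here is roughly what the competing idioms look like from C
(the wrapper names are made up; compilers commonly lower each of these to an
xor of a register with itself):

  #include <emmintrin.h>

  __m128  zero_ps(void) { return _mm_setzero_ps();    /* xorps */ }
  __m128d zero_pd(void) { return _mm_setzero_pd();    /* xorpd */ }
  __m128i zero_si(void) { return _mm_setzero_si128(); /* pxor  */ }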
-
-//===---------------------------------------------------------------------===//
-
Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.
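A hypothetical sketch of the pattern to aim for, written in C with SSE2
intrinsics (memset16 is a made-up name; it assumes p is 16-byte aligned and
n is a multiple of 16):

  #include <stddef.h>
  #include <emmintrin.h>

  /* One 128-bit store per iteration instead of a libc call. */
  void memset16(void *p, char v, size_t n) {
    __m128i fill = _mm_set1_epi8(v);
    for (size_t i = 0; i != n; i += 16)
      _mm_store_si128((__m128i *)((char *)p + i), fill);
  }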
@@ -151,45 +89,6 @@ Perhaps use pxor / xorp* to clear a XMM register first?
//===---------------------------------------------------------------------===//
-How to decide when to use the "floating point version" of logical ops? Here are
-some code fragments:
-
- movaps LCPI5_5, %xmm2
- divps %xmm1, %xmm2
- mulps %xmm2, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm2
- andps LCPI5_1, %xmm3
- por %xmm2, %xmm3
- movdqa %xmm3, (%edi)
-
- movaps LCPI5_5, %xmm1
- divps %xmm0, %xmm1
- mulps %xmm1, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm1
- andps LCPI5_1, %xmm3
- orps %xmm1, %xmm3
- movaps %xmm3, 112(%esp)
- movaps %xmm3, (%ebx)
-
-Due to some minor source change, the latter case ended up using orps and
-movaps instead of por and movdqa. Does it matter?
-
-//===---------------------------------------------------------------------===//
-
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on the types of the source
-and destination?
-
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants, which are likely to be
-shorter?
-
-//===---------------------------------------------------------------------===//
-
External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:
@@ -278,41 +177,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills.
//===---------------------------------------------------------------------===//
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
-
-LLVM is producing bad code.
-
-LBB_main_4: # cond_true44
- addps %xmm1, %xmm2
- subps %xmm3, %xmm2
- movaps (%ecx), %xmm4
- movaps %xmm2, %xmm1
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-There are two problems. 1) There is no need for two loop induction
-variables; we can compare against 262144 * 16. 2) There is a known register
-coalescer issue: we should be able to eliminate one of the movaps:
-
- addps %xmm2, %xmm1 <=== Commute!
- subps %xmm3, %xmm1
- movaps (%ecx), %xmm4
- movaps %xmm1, %xmm1 <=== Eliminate!
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-//===---------------------------------------------------------------------===//
-
Consider:
__m128 test(float a) {
@@ -382,22 +246,6 @@ elements are fixed zeros.
//===---------------------------------------------------------------------===//
-__m128d test1( __m128d A, __m128d B) {
- return _mm_shuffle_pd(A, B, 0x3);
-}
-
-compiles to
-
-shufpd $3, %xmm1, %xmm0
-
-Perhaps it's better to use unpckhpd instead?
-
-unpckhpd %xmm1, %xmm0
-
-We don't know whether unpckhpd is faster, but it is shorter: it takes no
-immediate byte.
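For what it's worth, the same shuffle can be written with the unpack
intrinsic; a sketch (test1_alt is a made-up name):

  #include <emmintrin.h>

  /* _mm_shuffle_pd(A, B, 0x3) and _mm_unpackhi_pd(A, B) both
     produce { A[1], B[1] }, so either lowering is legal here. */
  __m128d test1_alt(__m128d A, __m128d B) {
    return _mm_unpackhi_pd(A, B);   /* unpckhpd %xmm1, %xmm0 */
  }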
-
-//===---------------------------------------------------------------------===//
-
This generates ugly code, probably because the cost model is off:
define void @test(float* %P, <4 x float>* %P2 ) {
@@ -549,6 +397,7 @@ entry:
%tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
ret i64 %tmp20
}
+declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
This currently compiles to:
@@ -987,3 +836,34 @@ This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0], then a float store.
//===---------------------------------------------------------------------===//
+
+On SSE4 machines, we compile this code:
+
+define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
+ <2 x float> *%P) nounwind {
+ %Z = fadd <2 x float> %Q, %R
+
+ store <2 x float> %Z, <2 x float> *%P
+ ret <2 x float> %Z
+}
+
+into:
+
+_test2: ## @test2
+## BB#0:
+ insertps $0, %xmm2, %xmm2
+ insertps $16, %xmm3, %xmm2
+ insertps $0, %xmm0, %xmm3
+ insertps $16, %xmm1, %xmm3
+ addps %xmm2, %xmm3
+ movq %xmm3, (%rdi)
+ movaps %xmm3, %xmm0
+ pshufd $1, %xmm3, %xmm1
+ ## kill: XMM1<def> XMM1<kill>
+ ret
+
+The insertps's of $0 are pointless complex copies: with a zero immediate,
+insertps just moves the low element of the source into the low element of
+the destination (i.e. it acts like movss).
+
+//===---------------------------------------------------------------------===//
+
+