Diffstat (limited to 'src/sse.s')
-rw-r--r--  src/sse.s  885
1 file changed, 885 insertions, 0 deletions
diff --git a/src/sse.s b/src/sse.s
new file mode 100644
index 0000000..ccdebc8
--- /dev/null
+++ b/src/sse.s
@@ -0,0 +1,885 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+ .code64
+
+ .globl _neon_x4
+ .align 4
+_neon_x4:
+
+ .globl _neon_x8
+ .align 4
+_neon_x8:
+
+ .globl _neon_x8_t
+ .align 4
+_neon_x8_t:
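+# The three _neon_* symbols above are empty stubs: no code is emitted for
+# them. They appear to mirror the NEON backend's entry points so that both
+# backends export the same symbol set (an assumption; the stubs simply fall
+# through to the code below).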
+
+
+#ifdef __APPLE__
+ .globl _leaf_ee_init
+_leaf_ee_init:
+#else
+ .globl leaf_ee_init
+leaf_ee_init:
+#endif
+ #lea L_sse_constants(%rip), %r9
+ movq (%rdi), %r8
+ movq 0xe0(%rdi), %r9
+ xorl %eax, %eax
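+# The two movq loads above pull the offsets table and the constants table
+# out of the plan structure passed in %rdi (offsets 0x0 and 0xe0; the field
+# meanings are an assumption about the plan layout, the offsets are what
+# the code actually reads). %eax is zeroed as the loop counter.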
+
+# eax is loop counter (init to 0)
+# rcx is loop max count
+# rsi is 'in' base pointer
+# rdx is 'out' base pointer
+# r8 is offsets pointer
+# r9 is constants pointer
+# scratch: rax r11 r12
+# .align 4, 0x90
+
+# _leaf_ee + 9 needs 16-byte alignment
+#ifdef __APPLE__
+ .globl _leaf_ee
+_leaf_ee:
+#else
+ .globl leaf_ee
+leaf_ee:
+#endif
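+# Every 0xFECA displacement in the leaf routines is a placeholder, not a
+# real offset: the code generator evidently copies these routines at plan
+# time and rewrites each displacement with the true input offset. The
+# LEAF_EE_const_N labels mark the patched instructions, and the
+# sse_leaf_ee_offsets table at the end of this file records where each
+# 4-byte displacement field sits (the +0x4/+0x5 terms there step over the
+# opcode bytes, one more when a REX prefix is present). A C sketch of the
+# patching loop follows the offset tables below.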
+ movaps 32(%r9), %xmm0 #83.5
+ movaps (%r9), %xmm8 #83.5
+LEAF_EE_1:
+LEAF_EE_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
+LEAF_EE_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
+ movaps %xmm7, %xmm6 #83.5
+LEAF_EE_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
+ movaps %xmm12, %xmm11 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm10, %xmm11 #83.5
+ xorps %xmm8, %xmm12 #83.5
+LEAF_EE_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
+LEAF_EE_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
+ addps %xmm9, %xmm6 #83.5
+ subps %xmm9, %xmm7 #83.5
+LEAF_EE_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
+ movaps %xmm10, %xmm9 #83.5
+LEAF_EE_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
+ movaps %xmm6, %xmm5 #83.5
+LEAF_EE_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
+ movaps %xmm3, %xmm15 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm7, %xmm4 #83.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ subps %xmm13, %xmm10 #83.5
+ subps %xmm14, %xmm3 #83.5
+ addps %xmm11, %xmm5 #83.5
+ subps %xmm11, %xmm6 #83.5
+ subps %xmm12, %xmm4 #83.5
+ addps %xmm12, %xmm7 #83.5
+ addps %xmm13, %xmm9 #83.5
+ addps %xmm14, %xmm15 #83.5
+ movaps 16(%r9), %xmm12 #83.5
+ movaps %xmm9, %xmm1 #83.5
+ movaps 16(%r9), %xmm11 #83.5
+ movaps %xmm5, %xmm2 #83.5
+ mulps %xmm10, %xmm12 #83.5
+ subps %xmm15, %xmm9 #83.5
+ addps %xmm15, %xmm1 #83.5
+ mulps %xmm3, %xmm11 #83.5
+ addps %xmm1, %xmm2 #83.5
+ subps %xmm1, %xmm5 #83.5
+ shufps $177, %xmm10, %xmm10 #83.5
+ xorps %xmm8, %xmm9 #83.5
+ shufps $177, %xmm3, %xmm3 #83.5
+ movaps %xmm6, %xmm1 #83.5
+ mulps %xmm0, %xmm10 #83.5
+ movaps %xmm4, %xmm13 #83.5
+ mulps %xmm0, %xmm3 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm3, %xmm11 #83.5
+ movaps %xmm12, %xmm3 #83.5
+ movaps %xmm7, %xmm14 #83.5
+ shufps $177, %xmm9, %xmm9 #83.5
+ subps %xmm11, %xmm12 #83.5
+ addps %xmm11, %xmm3 #83.5
+ subps %xmm9, %xmm1 #83.5
+ addps %xmm9, %xmm6 #83.5
+ addps %xmm3, %xmm4 #83.5
+ subps %xmm3, %xmm13 #83.5
+ xorps %xmm8, %xmm12 #83.5
+ movaps %xmm2, %xmm3 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm6, %xmm9 #83.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ movlhps %xmm4, %xmm3 #83.5
+ addq $4, %rax
+ shufps $238, %xmm4, %xmm2 #83.5
+ movaps %xmm1, %xmm4 #83.5
+ #movntdq %xmm3, (%rdx,%r11,4) #83.5
+ subps %xmm12, %xmm7 #83.5
+ addps %xmm12, %xmm14 #83.5
+ movlhps %xmm7, %xmm4 #83.5
+ shufps $238, %xmm7, %xmm1 #83.5
+ movaps %xmm5, %xmm7 #83.5
+ movlhps %xmm13, %xmm7 #83.5
+ movlhps %xmm14, %xmm9 #83.5
+ shufps $238, %xmm13, %xmm5 #83.5
+ shufps $238, %xmm14, %xmm6 #83.5
+ movaps %xmm3, (%rdx,%r11,4) #83.5
+ movaps %xmm4, 16(%rdx,%r11,4) #83.5
+ movaps %xmm7, 32(%rdx,%r11,4) #83.5
+ movaps %xmm9, 48(%rdx,%r11,4) #83.5
+ movaps %xmm2, (%rdx,%r12,4) #83.5
+ movaps %xmm1, 16(%rdx,%r12,4) #83.5
+ movaps %xmm5, 32(%rdx,%r12,4) #83.5
+ movaps %xmm6, 48(%rdx,%r12,4) #83.5
+ cmpq %rcx, %rax
+ jne LEAF_EE_1
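+# Note: leaf_ee ends at the branch above with no ret. The leaf routines
+# here are evidently templates delimited by their labels (leaf_end marks
+# the end of the last one); the generator copies the machine-code bytes
+# between labels into the generated transform, so no epilogue is needed.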
+
+# _leaf_oo + 4 needs to be 16-byte aligned
+#ifdef __APPLE__
+ .globl _leaf_oo
+_leaf_oo:
+#else
+ .globl leaf_oo
+leaf_oo:
+#endif
+ movaps (%r9), %xmm5 #92.7
+LEAF_OO_1:
+LEAF_OO_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
+ movaps %xmm4, %xmm6 #93.5
+LEAF_OO_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
+LEAF_OO_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
+ addps %xmm7, %xmm6 #93.5
+ subps %xmm7, %xmm4 #93.5
+LEAF_OO_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
+ movaps %xmm10, %xmm9 #93.5
+LEAF_OO_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
+ movaps %xmm6, %xmm3 #93.5
+LEAF_OO_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
+ movaps %xmm1, %xmm2 #93.5
+LEAF_OO_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
+ movaps %xmm4, %xmm15 #93.5
+LEAF_OO_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
+ movaps %xmm14, %xmm13 #93.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ subps %xmm8, %xmm10 #93.5
+ addps %xmm8, %xmm9 #93.5
+ addps %xmm11, %xmm2 #93.5
+ subps %xmm12, %xmm14 #93.5
+ subps %xmm11, %xmm1 #93.5
+ addps %xmm12, %xmm13 #93.5
+ addps %xmm9, %xmm3 #93.5
+ subps %xmm9, %xmm6 #93.5
+ xorps %xmm5, %xmm10 #93.5
+ xorps %xmm5, %xmm14 #93.5
+ shufps $177, %xmm10, %xmm10 #93.5
+ movaps %xmm2, %xmm9 #93.5
+ shufps $177, %xmm14, %xmm14 #93.5
+ movaps %xmm6, %xmm7 #93.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ addq $4, %rax #92.18
+ addps %xmm10, %xmm4 #93.5
+ addps %xmm13, %xmm9 #93.5
+ subps %xmm13, %xmm2 #93.5
+ subps %xmm10, %xmm15 #93.5
+ movaps %xmm1, %xmm13 #93.5
+ movaps %xmm2, %xmm8 #93.5
+ movlhps %xmm4, %xmm7 #93.5
+ subps %xmm14, %xmm13 #93.5
+ addps %xmm14, %xmm1 #93.5
+ shufps $238, %xmm4, %xmm6 #93.5
+ movaps %xmm3, %xmm14 #93.5
+ movaps %xmm9, %xmm4 #93.5
+ movlhps %xmm15, %xmm14 #93.5
+ movlhps %xmm13, %xmm4 #93.5
+ movlhps %xmm1, %xmm8 #93.5
+ shufps $238, %xmm15, %xmm3 #93.5
+ shufps $238, %xmm13, %xmm9 #93.5
+ shufps $238, %xmm1, %xmm2 #93.5
+ movaps %xmm14, (%rdx,%r11,4) #93.5
+ movaps %xmm7, 16(%rdx,%r11,4) #93.5
+ movaps %xmm4, 32(%rdx,%r11,4) #93.5
+ movaps %xmm8, 48(%rdx,%r11,4) #93.5
+ movaps %xmm3, (%rdx,%r12,4) #93.5
+ movaps %xmm6, 16(%rdx,%r12,4) #93.5
+ movaps %xmm9, 32(%rdx,%r12,4) #93.5
+ movaps %xmm2, 48(%rdx,%r12,4) #93.5
+ cmpq %rcx, %rax
+ jne LEAF_OO_1 # Prob 95% #92.14
+
+#ifdef __APPLE__
+ .globl _leaf_eo
+_leaf_eo:
+#else
+ .globl leaf_eo
+leaf_eo:
+#endif
+LEAF_EO_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
+LEAF_EO_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
+ movaps %xmm9, %xmm11 #88.5
+LEAF_EO_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
+ movaps %xmm7, %xmm6 #88.5
+LEAF_EO_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
+ subps %xmm5, %xmm7 #88.5
+ addps %xmm4, %xmm11 #88.5
+ subps %xmm4, %xmm9 #88.5
+ addps %xmm5, %xmm6 #88.5
+ movaps (%r9), %xmm3 #88.5
+ movaps %xmm11, %xmm10 #88.5
+ xorps %xmm3, %xmm7 #88.5
+ movaps %xmm9, %xmm8 #88.5
+ shufps $177, %xmm7, %xmm7 #88.5
+ addps %xmm6, %xmm10 #88.5
+ subps %xmm6, %xmm11 #88.5
+ subps %xmm7, %xmm8 #88.5
+ addps %xmm7, %xmm9 #88.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ movaps %xmm10, %xmm2 #88.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ movaps %xmm11, %xmm1 #88.5
+ shufps $238, %xmm8, %xmm10 #88.5
+ shufps $238, %xmm9, %xmm11 #88.5
+ movaps %xmm10, (%rdx,%r12,4) #88.5
+ movaps %xmm11, 16(%rdx,%r12,4) #88.5
+LEAF_EO_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
+LEAF_EO_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
+ movaps %xmm15, %xmm14 #88.5
+LEAF_EO_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
+ addps %xmm12, %xmm14 #88.5
+ subps %xmm12, %xmm15 #88.5
+LEAF_EO_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
+ movaps %xmm4, %xmm5 #88.5
+ movaps %xmm14, %xmm7 #88.5
+ addps %xmm13, %xmm5 #88.5
+ subps %xmm13, %xmm4 #88.5
+ movlhps %xmm8, %xmm2 #88.5
+ movaps %xmm5, %xmm8 #88.5
+ movlhps %xmm15, %xmm7 #88.5
+ xorps %xmm3, %xmm15 #88.5
+ movaps %xmm5, %xmm6 #88.5
+ subps %xmm14, %xmm5 #88.5
+ addps %xmm14, %xmm6 #88.5
+ movlhps %xmm9, %xmm1 #88.5
+ movaps %xmm4, %xmm14 #88.5
+ movlhps %xmm4, %xmm8 #88.5
+ movaps %xmm1, %xmm12 #88.5
+ shufps $177, %xmm15, %xmm15 #88.5
+ movaps 0x30(%r9), %xmm11 #88.5
+ addq $4, %rax #90.5
+ subps %xmm15, %xmm14 #88.5
+ mulps %xmm7, %xmm11 #88.5
+ addps %xmm15, %xmm4 #88.5
+ movaps 0x30(%r9), %xmm9 #88.5
+ movaps 0x40(%r9), %xmm15 #88.5
+ shufps $177, %xmm7, %xmm7 #88.5
+ mulps %xmm8, %xmm9 #88.5
+ mulps %xmm15, %xmm7 #88.5
+ shufps $177, %xmm8, %xmm8 #88.5
+ subps %xmm7, %xmm11 #88.5
+ mulps %xmm15, %xmm8 #88.5
+ movaps %xmm11, %xmm10 #88.5
+ addps %xmm8, %xmm9 #88.5
+ shufps $238, %xmm14, %xmm6 #88.5
+ subps %xmm9, %xmm11 #88.5
+ addps %xmm9, %xmm10 #88.5
+ xorps %xmm3, %xmm11 #88.5
+ movaps %xmm2, %xmm3 #88.5
+ shufps $177, %xmm11, %xmm11 #88.5
+ subps %xmm10, %xmm3 #88.5
+ addps %xmm10, %xmm2 #88.5
+ addps %xmm11, %xmm12 #88.5
+ subps %xmm11, %xmm1 #88.5
+ shufps $238, %xmm4, %xmm5 #88.5
+ movaps %xmm5, 48(%rdx,%r12,4) #88.5
+ movaps %xmm6, 32(%rdx,%r12,4) #88.5
+ movaps %xmm2, (%rdx,%r11,4) #88.5
+ movaps %xmm1, 16(%rdx,%r11,4) #88.5
+ movaps %xmm3, 32(%rdx,%r11,4) #88.5
+ movaps %xmm12, 48(%rdx,%r11,4) #88.5
+
+#ifdef __APPLE__
+ .globl _leaf_oe
+_leaf_oe:
+#else
+ .globl leaf_oe
+leaf_oe:
+#endif
+ movaps (%r9), %xmm0 #59.5
+ #movaps 0x20(%r9), %xmm1 #59.5
+LEAF_OE_const_2:
+ movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
+LEAF_OE_const_3:
+ movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
+ movaps %xmm6, %xmm10 #70.5
+ shufps $228, %xmm8, %xmm10 #70.5
+ movaps %xmm10, %xmm9 #70.5
+ shufps $228, %xmm6, %xmm8 #70.5
+LEAF_OE_const_0:
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
+LEAF_OE_const_1:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
+ movaps %xmm12, %xmm14 #70.5
+ movslq (%r8, %rax, 4), %r11 #83.44
+ addps %xmm8, %xmm9 #70.5
+ subps %xmm8, %xmm10 #70.5
+ addps %xmm7, %xmm14 #70.5
+ subps %xmm7, %xmm12 #70.5
+ movaps %xmm9, %xmm4 #70.5
+ movaps %xmm14, %xmm13 #70.5
+ shufps $238, %xmm10, %xmm4 #70.5
+ xorps %xmm0, %xmm10 #70.5
+ shufps $177, %xmm10, %xmm10 #70.5
+ movaps %xmm12, %xmm11 #70.5
+ movaps %xmm14, %xmm5 #70.5
+ addps %xmm9, %xmm13 #70.5
+ subps %xmm10, %xmm11 #70.5
+ subps %xmm9, %xmm14 #70.5
+ shufps $238, %xmm12, %xmm5 #70.5
+ addps %xmm10, %xmm12 #70.5
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ movlhps %xmm11, %xmm13 #70.5
+ movaps %xmm13, (%rdx,%r11,4) #70.5
+ movaps 0x30(%r9), %xmm13 #70.5
+ movlhps %xmm12, %xmm14 #70.5
+ movaps 0x40(%r9), %xmm12 #70.5
+ mulps %xmm5, %xmm13 #70.5
+ shufps $177, %xmm5, %xmm5 #70.5
+ mulps %xmm12, %xmm5 #70.5
+ movaps %xmm14, 16(%rdx,%r11,4) #70.5
+ subps %xmm5, %xmm13 #70.5
+ movaps 0x30(%r9), %xmm5 #70.5
+ mulps %xmm4, %xmm5 #70.5
+ shufps $177, %xmm4, %xmm4 #70.5
+ mulps %xmm12, %xmm4 #70.5
+LEAF_OE_const_4:
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
+ addps %xmm4, %xmm5 #70.5
+LEAF_OE_const_6:
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
+ movaps %xmm9, %xmm3 #70.5
+LEAF_OE_const_7:
+ movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
+ movaps %xmm7, %xmm6 #70.5
+LEAF_OE_const_5:
+ movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
+ movaps %xmm13, %xmm4 #70.5
+ subps %xmm2, %xmm7 #70.5
+ addps %xmm15, %xmm3 #70.5
+ subps %xmm15, %xmm9 #70.5
+ addps %xmm2, %xmm6 #70.5
+ subps %xmm5, %xmm13 #70.5
+ addps %xmm5, %xmm4 #70.5
+ xorps %xmm0, %xmm7 #70.5
+ addq $4, %rax #72.5
+ movaps %xmm3, %xmm2 #70.5
+ shufps $177, %xmm7, %xmm7 #70.5
+ movaps %xmm9, %xmm8 #70.5
+ xorps %xmm0, %xmm13 #70.5
+ addps %xmm6, %xmm2 #70.5
+ subps %xmm7, %xmm8 #70.5
+ subps %xmm6, %xmm3 #70.5
+ addps %xmm7, %xmm9 #70.5
+ movaps %xmm2, %xmm10 #70.5
+ movaps %xmm3, %xmm11 #70.5
+ shufps $238, %xmm8, %xmm2 #70.5
+ shufps $238, %xmm9, %xmm3 #70.5
+ movaps %xmm2, %xmm14 #70.5
+ shufps $177, %xmm13, %xmm13 #70.5
+ subps %xmm4, %xmm14 #70.5
+ addps %xmm4, %xmm2 #70.5
+ movaps %xmm3, %xmm4 #70.5
+ subps %xmm13, %xmm3 #70.5
+ addps %xmm13, %xmm4 #70.5
+ movlhps %xmm8, %xmm10 #70.5
+ movlhps %xmm9, %xmm11 #70.5
+ movaps %xmm10, 32(%rdx,%r11,4) #70.5
+ movaps %xmm11, 48(%rdx,%r11,4) #70.5
+ movaps %xmm2, (%rdx,%r12,4) #70.5
+ movaps %xmm3, 16(%rdx,%r12,4) #70.5
+ movaps %xmm14, 32(%rdx,%r12,4) #70.5
+ movaps %xmm4, 48(%rdx,%r12,4) #70.5
+
+#ifdef __APPLE__
+ .globl _leaf_end
+_leaf_end:
+#else
+ .globl leaf_end
+leaf_end:
+#endif
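+# leaf_end emits no code; it only marks the end of the leaf templates so
+# their total byte length can be computed at plan time.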
+
+#ifdef __APPLE__
+ .globl _x_init
+_x_init:
+#else
+ .globl x_init
+x_init:
+#endif
+ #movaps L_sse_constants(%rip), %xmm3 #34.3
+ movaps (%r9), %xmm3 #34.3
+ movq 0x20(%rdi), %r8
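+# x_init loads the sign mask into %xmm3 and a twiddle-factor pointer into
+# %r8 from plan field 0x20 (the field's meaning is an assumption; the
+# offset is what the code reads). Execution then falls through into x4.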
+#ifdef __APPLE__
+ .globl _x4
+_x4:
+#else
+ .globl x4
+x4:
+#endif
+ movaps 64(%rdx), %xmm0 #34.3
+ movaps 96(%rdx), %xmm1 #34.3
+ movaps (%rdx), %xmm7 #34.3
+ movaps (%r8), %xmm4 #const
+ movaps %xmm7, %xmm9 #34.3
+ movaps %xmm4, %xmm6 #34.3
+ movaps 16(%r8), %xmm2 #const
+ mulps %xmm0, %xmm6 #34.3
+ mulps %xmm1, %xmm4 #34.3
+ shufps $177, %xmm0, %xmm0 #34.3
+ shufps $177, %xmm1, %xmm1 #34.3
+ mulps %xmm2, %xmm0 #34.3
+ mulps %xmm1, %xmm2 #34.3
+ subps %xmm0, %xmm6 #34.3
+ addps %xmm2, %xmm4 #34.3
+ movaps %xmm6, %xmm5 #34.3
+ subps %xmm4, %xmm6 #34.3
+ addps %xmm4, %xmm5 #34.3
+ movaps 32(%rdx), %xmm8 #34.3
+ xorps %xmm3, %xmm6 #34.3
+ shufps $177, %xmm6, %xmm6 #34.3
+ movaps %xmm8, %xmm10 #34.3
+ movaps 112(%rdx), %xmm12 #34.3
+ subps %xmm5, %xmm9 #34.3
+ addps %xmm5, %xmm7 #34.3
+ addps %xmm6, %xmm10 #34.3
+ subps %xmm6, %xmm8 #34.3
+ movaps %xmm7, (%rdx) #34.3
+ movaps %xmm8, 32(%rdx) #34.3
+ movaps %xmm9, 64(%rdx) #34.3
+ movaps %xmm10, 96(%rdx) #34.3
+ movaps 32(%r8), %xmm14 #const #34.3
+ movaps 80(%rdx), %xmm11 #34.3
+ movaps %xmm14, %xmm0 #34.3
+ movaps 48(%r8), %xmm13 #const #34.3
+ mulps %xmm11, %xmm0 #34.3
+ mulps %xmm12, %xmm14 #34.3
+ shufps $177, %xmm11, %xmm11 #34.3
+ shufps $177, %xmm12, %xmm12 #34.3
+ mulps %xmm13, %xmm11 #34.3
+ mulps %xmm12, %xmm13 #34.3
+ subps %xmm11, %xmm0 #34.3
+ addps %xmm13, %xmm14 #34.3
+ movaps %xmm0, %xmm15 #34.3
+ subps %xmm14, %xmm0 #34.3
+ addps %xmm14, %xmm15 #34.3
+ xorps %xmm3, %xmm0 #34.3
+ movaps 16(%rdx), %xmm1 #34.3
+ movaps 48(%rdx), %xmm2 #34.3
+ movaps %xmm1, %xmm4 #34.3
+ shufps $177, %xmm0, %xmm0 #34.3
+ movaps %xmm2, %xmm5 #34.3
+ addps %xmm15, %xmm1 #34.3
+ subps %xmm0, %xmm2 #34.3
+ subps %xmm15, %xmm4 #34.3
+ addps %xmm0, %xmm5 #34.3
+ movaps %xmm1, 16(%rdx) #34.3
+ movaps %xmm2, 48(%rdx) #34.3
+ movaps %xmm4, 80(%rdx) #34.3
+ movaps %xmm5, 112(%rdx) #34.3
+ ret
+
+# _x8_soft + 5 needs to be 16-byte aligned
+#ifdef __APPLE__
+ .globl _x8_soft
+_x8_soft:
+#else
+ .globl x8_soft
+x8_soft:
+#endif
+ xorl %eax, %eax
+ movq %rdx, %rbx
+ movq %r8, %rsi
+ leaq (%rdx,%rcx,4), %r9
+ leaq (%r9,%rcx,4), %r10
+ leaq (%r10,%rcx,4), %r11
+ leaq (%r11,%rcx,4), %r12
+ leaq (%r12,%rcx,4), %r13
+ leaq (%r13,%rcx,4), %r14
+ leaq (%r14,%rcx,4), %r15
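+# The leaq chain above derives eight write pointers, one per sub-transform
+# block: %rbx, %r9, %r10, ..., %r15 each sit %rcx*4 bytes (i.e. %rcx
+# floats) past the previous one. %rsi walks the twiddle data 96 bytes per
+# iteration while %rax indexes all eight blocks in lockstep.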
+X8_soft_loop:
+ movaps (%rsi), %xmm9
+ movaps (%r10,%rax,4), %xmm6
+ movaps %xmm9, %xmm11
+ movaps (%r11,%rax,4), %xmm7
+ movaps 16(%rsi), %xmm8
+ mulps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ shufps $177, %xmm6, %xmm6
+ mulps %xmm8, %xmm6
+ shufps $177, %xmm7, %xmm7
+ subps %xmm6, %xmm11
+ mulps %xmm7, %xmm8
+ movaps %xmm11, %xmm10
+ addps %xmm8, %xmm9
+ movaps 32(%rsi), %xmm15
+ addps %xmm9, %xmm10
+ subps %xmm9, %xmm11
+ movaps (%rbx,%rax,4), %xmm5
+ movaps %xmm15, %xmm6
+ movaps (%r12,%rax,4), %xmm12
+ movaps %xmm5, %xmm2
+ movaps (%r14,%rax,4), %xmm13
+ xorps %xmm3, %xmm11 #const
+ movaps 48(%rsi), %xmm14
+ subps %xmm10, %xmm2
+ mulps %xmm12, %xmm6
+ addps %xmm10, %xmm5
+ mulps %xmm13, %xmm15
+ movaps 64(%rsi), %xmm10
+ movaps %xmm5, %xmm0
+ shufps $177, %xmm12, %xmm12
+ shufps $177, %xmm13, %xmm13
+ mulps %xmm14, %xmm12
+ mulps %xmm13, %xmm14
+ subps %xmm12, %xmm6
+ addps %xmm14, %xmm15
+ movaps (%r13,%rax,4), %xmm7
+ movaps %xmm10, %xmm13
+ movaps (%r15,%rax,4), %xmm8
+ movaps %xmm6, %xmm12
+ movaps 80(%rsi), %xmm9
+ addq $96, %rsi
+ mulps %xmm7, %xmm13
+ subps %xmm15, %xmm6
+ addps %xmm15, %xmm12
+ mulps %xmm8, %xmm10
+ subps %xmm12, %xmm0
+ addps %xmm12, %xmm5
+ shufps $177, %xmm7, %xmm7
+ xorps %xmm3, %xmm6 #const
+ shufps $177, %xmm8, %xmm8
+ movaps %xmm2, %xmm12
+ mulps %xmm9, %xmm7
+ mulps %xmm8, %xmm9
+ subps %xmm7, %xmm13
+ addps %xmm9, %xmm10
+ movaps (%r9,%rax,4), %xmm4
+ shufps $177, %xmm11, %xmm11
+ movaps %xmm4, %xmm1
+ shufps $177, %xmm6, %xmm6
+ addps %xmm11, %xmm1
+ subps %xmm11, %xmm4
+ addps %xmm6, %xmm12
+ subps %xmm6, %xmm2
+ movaps %xmm13, %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm1, %xmm6
+ subps %xmm10, %xmm13
+ addps %xmm10, %xmm11
+ xorps %xmm3, %xmm13 #const
+ addps %xmm11, %xmm4
+ subps %xmm11, %xmm14
+ shufps $177, %xmm13, %xmm13
+ movaps %xmm5, (%rbx,%rax,4)
+ movaps %xmm4, (%r9,%rax,4)
+ movaps %xmm2, (%r10,%rax,4)
+ subps %xmm13, %xmm1
+ addps %xmm13, %xmm6
+ movaps %xmm1, (%r11,%rax,4)
+ movaps %xmm0, (%r12,%rax,4)
+ movaps %xmm14, (%r13,%rax,4)
+ movaps %xmm12, (%r14,%rax,4)
+ movaps %xmm6, (%r15,%rax,4)
+ addq $4, %rax
+ cmpq %rcx, %rax
+ jne X8_soft_loop
+ ret
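+
+# The mulps / shufps $177 / mulps / subps-addps groups above are packed
+# complex multiplications: shufps $177 (0b10110001) swaps the real and
+# imaginary lanes within each pair. A hedged C sketch of one such group,
+# assuming the twiddle w = c + s*i is pre-split into RE = {c,c,c,c} and a
+# sign-adjusted IM vector so a single subps or addps finishes the product
+# (x, y, c, s are hypothetical names):
+#
+#   /* y = w * x for two packed complex values per register */
+#   for (int k = 0; k < 4; k += 2) {
+#       float a = x[k], b = x[k + 1];      /* x = a + b*i         */
+#       y[k]     = c * a - s * b;          /* mulps, then subps   */
+#       y[k + 1] = c * b + s * a;          /* shufps-swapped term */
+#   }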
+
+#ifdef __APPLE__
+ .globl _x8_soft_end
+_x8_soft_end:
+#else
+ .globl x8_soft_end
+x8_soft_end:
+#endif
+
+#ifdef __APPLE__
+ .globl _x8_hard
+_x8_hard:
+#else
+ .globl x8_hard
+x8_hard:
+#endif
+ movaps (%r9), %xmm5
+X8_loop:
+ movaps (%r8), %xmm9
+X8_const_2:
+ movaps 0xFECA(%rdx,%rax,4), %xmm6
+ movaps %xmm9, %xmm11
+X8_const_3:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7
+ movaps 16(%r8), %xmm8
+ mulps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ shufps $177, %xmm6, %xmm6
+ mulps %xmm8, %xmm6
+ shufps $177, %xmm7, %xmm7
+ subps %xmm6, %xmm11
+ mulps %xmm7, %xmm8
+ movaps %xmm11, %xmm10
+ addps %xmm8, %xmm9
+ movaps 32(%r8), %xmm15
+ addps %xmm9, %xmm10
+ subps %xmm9, %xmm11
+X8_const_0:
+ movaps 0xFECA(%rdx,%rax,4), %xmm3
+ movaps %xmm15, %xmm6
+X8_const_4:
+ movaps 0xFECA(%rdx,%rax,4), %xmm12
+ movaps %xmm3, %xmm2
+X8_const_6:
+ movaps 0xFECA(%rdx,%rax,4), %xmm13
+ xorps %xmm5, %xmm11
+ movaps 48(%r8), %xmm14
+ subps %xmm10, %xmm2
+ mulps %xmm12, %xmm6
+ addps %xmm10, %xmm3
+ mulps %xmm13, %xmm15
+ movaps 64(%r8), %xmm10
+ movaps %xmm3, %xmm0
+ shufps $177, %xmm12, %xmm12
+ shufps $177, %xmm13, %xmm13
+ mulps %xmm14, %xmm12
+ mulps %xmm13, %xmm14
+ subps %xmm12, %xmm6
+ addps %xmm14, %xmm15
+X8_const_5:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7
+ movaps %xmm10, %xmm13
+X8_const_7:
+ movaps 0xFECA(%rdx,%rax,4), %xmm8
+ movaps %xmm6, %xmm12
+ movaps 80(%r8), %xmm9
+ addq $96, %r8
+ mulps %xmm7, %xmm13
+ subps %xmm15, %xmm6
+ addps %xmm15, %xmm12
+ mulps %xmm8, %xmm10
+ subps %xmm12, %xmm0
+ addps %xmm12, %xmm3
+ shufps $177, %xmm7, %xmm7
+ xorps %xmm5, %xmm6
+ shufps $177, %xmm8, %xmm8
+ movaps %xmm2, %xmm12
+ mulps %xmm9, %xmm7
+ mulps %xmm8, %xmm9
+ subps %xmm7, %xmm13
+ addps %xmm9, %xmm10
+X8_const_1:
+ movaps 0xFECA(%rdx,%rax,4), %xmm4
+ shufps $177, %xmm11, %xmm11
+ movaps %xmm4, %xmm1
+ shufps $177, %xmm6, %xmm6
+ addps %xmm11, %xmm1
+ subps %xmm11, %xmm4
+ addps %xmm6, %xmm12
+ subps %xmm6, %xmm2
+ movaps %xmm13, %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm1, %xmm6
+ subps %xmm10, %xmm13
+ addps %xmm10, %xmm11
+ xorps %xmm5, %xmm13
+ addps %xmm11, %xmm4
+ subps %xmm11, %xmm14
+ shufps $177, %xmm13, %xmm13
+X8_const1_0:
+ movaps %xmm3, 0xFECA(%rdx,%rax,4)
+X8_const1_1:
+ movaps %xmm4, 0xFECA(%rdx,%rax,4)
+X8_const1_2:
+ movaps %xmm2, 0xFECA(%rdx,%rax,4)
+ subps %xmm13, %xmm1
+ addps %xmm13, %xmm6
+X8_const1_3:
+ movaps %xmm1, 0xFECA(%rdx,%rax,4)
+X8_const1_4:
+ movaps %xmm0, 0xFECA(%rdx,%rax,4)
+X8_const1_5:
+ movaps %xmm14, 0xFECA(%rdx,%rax,4)
+X8_const1_6:
+ movaps %xmm12, 0xFECA(%rdx,%rax,4)
+X8_const1_7:
+ movaps %xmm6, 0xFECA(%rdx,%rax,4)
+ addq $4, %rax
+ cmpq %rcx, %rax
+ jne X8_loop
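+
+# x8_hard is the template twin of x8_soft: its 0xFECA displacements are
+# presumably patched at plan time through the exported X8_const_N labels,
+# so like the leaf templates it ends without a ret.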
+
+#ifdef __APPLE__
+ .globl _sse_leaf_ee_offsets
+ .globl _sse_leaf_oo_offsets
+ .globl _sse_leaf_eo_offsets
+ .globl _sse_leaf_oe_offsets
+ .align 4
+_sse_leaf_ee_offsets:
+ .long LEAF_EE_const_0-_leaf_ee+0x4
+ .long LEAF_EE_const_1-_leaf_ee+0x5
+ .long LEAF_EE_const_2-_leaf_ee+0x5
+ .long LEAF_EE_const_3-_leaf_ee+0x5
+ .long LEAF_EE_const_4-_leaf_ee+0x5
+ .long LEAF_EE_const_5-_leaf_ee+0x5
+ .long LEAF_EE_const_6-_leaf_ee+0x4
+ .long LEAF_EE_const_7-_leaf_ee+0x5
+_sse_leaf_oo_offsets:
+ .long LEAF_OO_const_0-_leaf_oo+0x4
+ .long LEAF_OO_const_1-_leaf_oo+0x4
+ .long LEAF_OO_const_2-_leaf_oo+0x5
+ .long LEAF_OO_const_3-_leaf_oo+0x5
+ .long LEAF_OO_const_4-_leaf_oo+0x4
+ .long LEAF_OO_const_5-_leaf_oo+0x5
+ .long LEAF_OO_const_6-_leaf_oo+0x5
+ .long LEAF_OO_const_7-_leaf_oo+0x5
+_sse_leaf_eo_offsets:
+ .long LEAF_EO_const_0-_leaf_eo+0x5
+ .long LEAF_EO_const_1-_leaf_eo+0x4
+ .long LEAF_EO_const_2-_leaf_eo+0x4
+ .long LEAF_EO_const_3-_leaf_eo+0x4
+ .long LEAF_EO_const_4-_leaf_eo+0x5
+ .long LEAF_EO_const_5-_leaf_eo+0x5
+ .long LEAF_EO_const_6-_leaf_eo+0x4
+ .long LEAF_EO_const_7-_leaf_eo+0x5
+_sse_leaf_oe_offsets:
+ .long LEAF_OE_const_0-_leaf_oe+0x5
+ .long LEAF_OE_const_1-_leaf_oe+0x4
+ .long LEAF_OE_const_2-_leaf_oe+0x4
+ .long LEAF_OE_const_3-_leaf_oe+0x5
+ .long LEAF_OE_const_4-_leaf_oe+0x5
+ .long LEAF_OE_const_5-_leaf_oe+0x5
+ .long LEAF_OE_const_6-_leaf_oe+0x4
+ .long LEAF_OE_const_7-_leaf_oe+0x4
+#else
+ .globl sse_leaf_ee_offsets
+ .globl sse_leaf_oo_offsets
+ .globl sse_leaf_eo_offsets
+ .globl sse_leaf_oe_offsets
+ .align 4
+sse_leaf_ee_offsets:
+ .long LEAF_EE_const_0-leaf_ee+0x4
+ .long LEAF_EE_const_1-leaf_ee+0x5
+ .long LEAF_EE_const_2-leaf_ee+0x5
+ .long LEAF_EE_const_3-leaf_ee+0x5
+ .long LEAF_EE_const_4-leaf_ee+0x5
+ .long LEAF_EE_const_5-leaf_ee+0x5
+ .long LEAF_EE_const_6-leaf_ee+0x4
+ .long LEAF_EE_const_7-leaf_ee+0x5
+sse_leaf_oo_offsets:
+ .long LEAF_OO_const_0-leaf_oo+0x4
+ .long LEAF_OO_const_1-leaf_oo+0x4
+ .long LEAF_OO_const_2-leaf_oo+0x5
+ .long LEAF_OO_const_3-leaf_oo+0x5
+ .long LEAF_OO_const_4-leaf_oo+0x4
+ .long LEAF_OO_const_5-leaf_oo+0x5
+ .long LEAF_OO_const_6-leaf_oo+0x5
+ .long LEAF_OO_const_7-leaf_oo+0x5
+sse_leaf_eo_offsets:
+ .long LEAF_EO_const_0-leaf_eo+0x5
+ .long LEAF_EO_const_1-leaf_eo+0x4
+ .long LEAF_EO_const_2-leaf_eo+0x4
+ .long LEAF_EO_const_3-leaf_eo+0x4
+ .long LEAF_EO_const_4-leaf_eo+0x5
+ .long LEAF_EO_const_5-leaf_eo+0x5
+ .long LEAF_EO_const_6-leaf_eo+0x4
+ .long LEAF_EO_const_7-leaf_eo+0x5
+sse_leaf_oe_offsets:
+ .long LEAF_OE_const_0-leaf_oe+0x5
+ .long LEAF_OE_const_1-leaf_oe+0x4
+ .long LEAF_OE_const_2-leaf_oe+0x4
+ .long LEAF_OE_const_3-leaf_oe+0x5
+ .long LEAF_OE_const_4-leaf_oe+0x5
+ .long LEAF_OE_const_5-leaf_oe+0x5
+ .long LEAF_OE_const_6-leaf_oe+0x4
+ .long LEAF_OE_const_7-leaf_oe+0x4
+#endif
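+
+# A hedged C sketch of how a generator might consume the tables above --
+# not the project's actual codegen (code, emit_ptr, leaf_ee_len and
+# real_offsets are hypothetical names). Each table entry is the distance
+# from a routine's entry point to the 4-byte displacement field of one
+# placeholder movaps:
+#
+#   uint8_t *code = emit_ptr;               /* destination buffer      */
+#   memcpy(code, leaf_ee, leaf_ee_len);     /* copy the template bytes */
+#   for (int i = 0; i < 8; i++)             /* overwrite each 0xFECA   */
+#       *(uint32_t *)(code + sse_leaf_ee_offsets[i]) = real_offsets[i];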
+
+#ifdef __APPLE__
+ .data
+#else
+ .section .data
+#endif
+ .p2align 4
+#ifdef __APPLE__
+ .globl _sse_constants
+_sse_constants:
+#else
+ .globl sse_constants
+sse_constants:
+#endif
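+# Row 0 is a sign mask ({+0.0, -0.0, +0.0, -0.0}) for xorps-based negation
+# of the imaginary lanes; 0x3f3504f3 is +sqrt(2)/2 (~0.70710677),
+# 0xbf3504f3 its negation, and 0x3f800000 is 1.0.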
+ .long 0x00000000,0x80000000,0x00000000,0x80000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
+#ifdef __APPLE__
+ .globl _sse_constants_inv
+_sse_constants_inv:
+#else
+ .globl sse_constants_inv
+sse_constants_inv:
+#endif
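+# The same table with the sign conventions mirrored (the mask now flips
+# the real lanes and the sqrt(2)/2 signs are swapped), presumably
+# selecting the conjugate twiddles for the inverse transform.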
+ .long 0x80000000,0x00000000,0x80000000,0x00000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3