diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2015-03-18 14:20:19 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2015-03-18 14:20:19 +0200 |
commit | 83e81fda3974152ce5ef04a0a22f15e079cff394 (patch) | |
tree | 63313c8932569e5be0d441783b401042999ad8e7 | |
parent | deb54fd909ce5dcb2a74c33ffa05ee54500a5aa1 (diff) | |
download | ffts-83e81fda3974152ce5ef04a0a22f15e079cff394.zip ffts-83e81fda3974152ce5ef04a0a22f15e079cff394.tar.gz |
Remove unused sse.s
-rw-r--r-- | CMakeLists.txt | 10 | ||||
-rw-r--r-- | src/sse.s | 885 |
2 files changed, 0 insertions, 895 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1393689..d83367e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,16 +224,6 @@ elseif(HAVE_XMMINTRIN_H) list(APPEND FFTS_SOURCES src/codegen_sse.h ) - - if(MSVC) - if(ENABLE_RUNTIME_DYNAMIC_CODE) - add_definitions(-DSSE_DEFINE_CONSTANTS) - endif(ENABLE_RUNTIME_DYNAMIC_CODE) - else() - list(APPEND FFTS_SOURCES - src/sse.s - ) - endif(MSVC) else() message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.") set(DISABLE_DYNAMIC_CODE ON) diff --git a/src/sse.s b/src/sse.s deleted file mode 100644 index ccdebc8..0000000 --- a/src/sse.s +++ /dev/null @@ -1,885 +0,0 @@ -/* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - .code64 - - .globl _neon_x4 - .align 4 -_neon_x4: - - .globl _neon_x8 - .align 4 -_neon_x8: - - .globl _neon_x8_t - .align 4 -_neon_x8_t: - - -#ifdef __APPLE__ - .globl _leaf_ee_init -_leaf_ee_init: -#else - .globl leaf_ee_init -leaf_ee_init: -#endif - #lea L_sse_constants(%rip), %r9 - movq (%rdi), %r8 - movq 0xe0(%rdi), %r9 - xorl %eax, %eax - -# eax is loop counter (init to 0) -# rcx is loop max count -# rsi is 'in' base pointer -# rdx is 'out' base pointer -# r8 is offsets pointer -# r9 is constants pointer -# scratch: rax r11 r12 -# .align 4, 0x90 - -# _leaf_ee + 9 needs 16 byte alignment -#ifdef __APPLE__ - .globl _leaf_ee -_leaf_ee: -#else - .globl leaf_ee -leaf_ee: -#endif - movaps 32(%r9), %xmm0 #83.5 - movaps (%r9), %xmm8 #83.5 -LEAF_EE_1: -LEAF_EE_const_0: - movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5 -LEAF_EE_const_2: - movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5 - movaps %xmm7, %xmm6 #83.5 -LEAF_EE_const_3: - movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5 - movaps %xmm12, %xmm11 #83.5 - subps %xmm10, %xmm12 #83.5 - addps %xmm10, %xmm11 #83.5 - xorps %xmm8, %xmm12 #83.5 -LEAF_EE_const_1: - movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5 -LEAF_EE_const_4: - movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5 - addps %xmm9, %xmm6 #83.5 - subps %xmm9, %xmm7 #83.5 -LEAF_EE_const_5: - movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5 - movaps %xmm10, %xmm9 #83.5 -LEAF_EE_const_6: - movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5 - movaps %xmm6, %xmm5 #83.5 -LEAF_EE_const_7: - movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5 - movaps %xmm3, %xmm15 #83.5 - shufps $177, %xmm12, %xmm12 #83.5 - movaps %xmm7, %xmm4 #83.5 - movslq (%r8, %rax, 4), %r11 #83.44 - subps %xmm13, %xmm10 #83.5 - subps %xmm14, %xmm3 #83.5 - addps %xmm11, %xmm5 #83.5 - subps %xmm11, %xmm6 #83.5 - subps %xmm12, %xmm4 #83.5 - addps %xmm12, %xmm7 #83.5 - addps %xmm13, %xmm9 #83.5 - addps %xmm14, %xmm15 #83.5 - movaps 16(%r9), %xmm12 #83.5 - movaps %xmm9, %xmm1 #83.5 - movaps 16(%r9), %xmm11 #83.5 - movaps %xmm5, %xmm2 #83.5 - mulps %xmm10, %xmm12 #83.5 - subps %xmm15, %xmm9 #83.5 - addps %xmm15, %xmm1 #83.5 - mulps %xmm3, %xmm11 #83.5 - addps %xmm1, %xmm2 #83.5 - subps %xmm1, %xmm5 #83.5 - shufps $177, %xmm10, %xmm10 #83.5 - xorps %xmm8, %xmm9 #83.5 - shufps $177, %xmm3, %xmm3 #83.5 - movaps %xmm6, %xmm1 #83.5 - mulps %xmm0, %xmm10 #83.5 - movaps %xmm4, %xmm13 #83.5 - mulps %xmm0, %xmm3 #83.5 - subps %xmm10, %xmm12 #83.5 - addps %xmm3, %xmm11 #83.5 - movaps %xmm12, %xmm3 #83.5 - movaps %xmm7, %xmm14 #83.5 - shufps $177, %xmm9, %xmm9 #83.5 - subps %xmm11, %xmm12 #83.5 - addps %xmm11, %xmm3 #83.5 - subps %xmm9, %xmm1 #83.5 - addps %xmm9, %xmm6 #83.5 - addps %xmm3, %xmm4 #83.5 - subps %xmm3, %xmm13 #83.5 - xorps %xmm8, %xmm12 #83.5 - movaps %xmm2, %xmm3 #83.5 - shufps $177, %xmm12, %xmm12 #83.5 - movaps %xmm6, %xmm9 #83.5 - movslq 8(%r8, %rax, 4), %r12 #83.59 - movlhps %xmm4, %xmm3 #83.5 - addq $4, %rax - shufps $238, %xmm4, %xmm2 #83.5 - movaps %xmm1, %xmm4 #83.5 - #movntdq %xmm3, (%rdx,%r11,4) #83.5 - subps %xmm12, %xmm7 #83.5 - addps %xmm12, %xmm14 #83.5 - movlhps %xmm7, %xmm4 #83.5 - shufps $238, %xmm7, %xmm1 #83.5 - movaps %xmm5, %xmm7 #83.5 - movlhps %xmm13, %xmm7 #83.5 - movlhps %xmm14, %xmm9 #83.5 - shufps $238, %xmm13, %xmm5 #83.5 - shufps $238, %xmm14, %xmm6 #83.5 - movaps %xmm3, (%rdx,%r11,4) #83.5 - movaps %xmm4, 16(%rdx,%r11,4) #83.5 - movaps %xmm7, 32(%rdx,%r11,4) #83.5 - movaps %xmm9, 48(%rdx,%r11,4) #83.5 - movaps %xmm2, (%rdx,%r12,4) #83.5 - movaps %xmm1, 16(%rdx,%r12,4) #83.5 - movaps %xmm5, 32(%rdx,%r12,4) #83.5 - movaps %xmm6, 48(%rdx,%r12,4) #83.5 - cmpq %rcx, %rax - jne LEAF_EE_1 - -# _leaf_oo + 4 needs to be 16 byte aligned -#ifdef __APPLE__ - .globl _leaf_oo -_leaf_oo: -#else - .globl leaf_oo -leaf_oo: -#endif - movaps (%r9), %xmm5 #92.7 -LEAF_OO_1: -LEAF_OO_const_0: - movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5 - movaps %xmm4, %xmm6 #93.5 -LEAF_OO_const_1: - movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5 -LEAF_OO_const_2: - movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5 - addps %xmm7, %xmm6 #93.5 - subps %xmm7, %xmm4 #93.5 -LEAF_OO_const_3: - movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5 - movaps %xmm10, %xmm9 #93.5 -LEAF_OO_const_4: - movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5 - movaps %xmm6, %xmm3 #93.5 -LEAF_OO_const_5: - movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5 - movaps %xmm1, %xmm2 #93.5 -LEAF_OO_const_6: - movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5 - movaps %xmm4, %xmm15 #93.5 -LEAF_OO_const_7: - movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5 - movaps %xmm14, %xmm13 #93.5 - movslq (%r8, %rax, 4), %r11 #83.44 - subps %xmm8, %xmm10 #93.5 - addps %xmm8, %xmm9 #93.5 - addps %xmm11, %xmm2 #93.5 - subps %xmm12, %xmm14 #93.5 - subps %xmm11, %xmm1 #93.5 - addps %xmm12, %xmm13 #93.5 - addps %xmm9, %xmm3 #93.5 - subps %xmm9, %xmm6 #93.5 - xorps %xmm5, %xmm10 #93.5 - xorps %xmm5, %xmm14 #93.5 - shufps $177, %xmm10, %xmm10 #93.5 - movaps %xmm2, %xmm9 #93.5 - shufps $177, %xmm14, %xmm14 #93.5 - movaps %xmm6, %xmm7 #93.5 - movslq 8(%r8, %rax, 4), %r12 #83.59 - addq $4, %rax #92.18 - addps %xmm10, %xmm4 #93.5 - addps %xmm13, %xmm9 #93.5 - subps %xmm13, %xmm2 #93.5 - subps %xmm10, %xmm15 #93.5 - movaps %xmm1, %xmm13 #93.5 - movaps %xmm2, %xmm8 #93.5 - movlhps %xmm4, %xmm7 #93.5 - subps %xmm14, %xmm13 #93.5 - addps %xmm14, %xmm1 #93.5 - shufps $238, %xmm4, %xmm6 #93.5 - movaps %xmm3, %xmm14 #93.5 - movaps %xmm9, %xmm4 #93.5 - movlhps %xmm15, %xmm14 #93.5 - movlhps %xmm13, %xmm4 #93.5 - movlhps %xmm1, %xmm8 #93.5 - shufps $238, %xmm15, %xmm3 #93.5 - shufps $238, %xmm13, %xmm9 #93.5 - shufps $238, %xmm1, %xmm2 #93.5 - movaps %xmm14, (%rdx,%r11,4) #93.5 - movaps %xmm7, 16(%rdx,%r11,4) #93.5 - movaps %xmm4, 32(%rdx,%r11,4) #93.5 - movaps %xmm8, 48(%rdx,%r11,4) #93.5 - movaps %xmm3, (%rdx,%r12,4) #93.5 - movaps %xmm6, 16(%rdx,%r12,4) #93.5 - movaps %xmm9, 32(%rdx,%r12,4) #93.5 - movaps %xmm2, 48(%rdx,%r12,4) #93.5 - cmpq %rcx, %rax - jne LEAF_OO_1 # Prob 95% #92.14 - -#ifdef __APPLE__ - .globl _leaf_eo -_leaf_eo: -#else - .globl leaf_eo -leaf_eo: -#endif -LEAF_EO_const_0: - movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5 -LEAF_EO_const_2: - movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5 - movaps %xmm9, %xmm11 #88.5 -LEAF_EO_const_3: - movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5 - movaps %xmm7, %xmm6 #88.5 -LEAF_EO_const_1: - movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5 - subps %xmm5, %xmm7 #88.5 - addps %xmm4, %xmm11 #88.5 - subps %xmm4, %xmm9 #88.5 - addps %xmm5, %xmm6 #88.5 - movaps (%r9), %xmm3 #88.5 - movaps %xmm11, %xmm10 #88.5 - xorps %xmm3, %xmm7 #88.5 - movaps %xmm9, %xmm8 #88.5 - shufps $177, %xmm7, %xmm7 #88.5 - addps %xmm6, %xmm10 #88.5 - subps %xmm6, %xmm11 #88.5 - subps %xmm7, %xmm8 #88.5 - addps %xmm7, %xmm9 #88.5 - movslq 8(%r8, %rax, 4), %r12 #83.59 - movaps %xmm10, %xmm2 #88.5 - movslq (%r8, %rax, 4), %r11 #83.44 - movaps %xmm11, %xmm1 #88.5 - shufps $238, %xmm8, %xmm10 #88.5 - shufps $238, %xmm9, %xmm11 #88.5 - movaps %xmm10, (%rdx,%r12,4) #88.5 - movaps %xmm11, 16(%rdx,%r12,4) #88.5 -LEAF_EO_const_4: - movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5 -LEAF_EO_const_5: - movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5 - movaps %xmm15, %xmm14 #88.5 -LEAF_EO_const_6: - movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5 - addps %xmm12, %xmm14 #88.5 - subps %xmm12, %xmm15 #88.5 -LEAF_EO_const_7: - movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5 - movaps %xmm4, %xmm5 #88.5 - movaps %xmm14, %xmm7 #88.5 - addps %xmm13, %xmm5 #88.5 - subps %xmm13, %xmm4 #88.5 - movlhps %xmm8, %xmm2 #88.5 - movaps %xmm5, %xmm8 #88.5 - movlhps %xmm15, %xmm7 #88.5 - xorps %xmm3, %xmm15 #88.5 - movaps %xmm5, %xmm6 #88.5 - subps %xmm14, %xmm5 #88.5 - addps %xmm14, %xmm6 #88.5 - movlhps %xmm9, %xmm1 #88.5 - movaps %xmm4, %xmm14 #88.5 - movlhps %xmm4, %xmm8 #88.5 - movaps %xmm1, %xmm12 #88.5 - shufps $177, %xmm15, %xmm15 #88.5 - movaps 0x30(%r9), %xmm11 #88.5 - addq $4, %rax #90.5 - subps %xmm15, %xmm14 #88.5 - mulps %xmm7, %xmm11 #88.5 - addps %xmm15, %xmm4 #88.5 - movaps 0x30(%r9), %xmm9 #88.5 - movaps 0x40(%r9), %xmm15 #88.5 - shufps $177, %xmm7, %xmm7 #88.5 - mulps %xmm8, %xmm9 #88.5 - mulps %xmm15, %xmm7 #88.5 - shufps $177, %xmm8, %xmm8 #88.5 - subps %xmm7, %xmm11 #88.5 - mulps %xmm15, %xmm8 #88.5 - movaps %xmm11, %xmm10 #88.5 - addps %xmm8, %xmm9 #88.5 - shufps $238, %xmm14, %xmm6 #88.5 - subps %xmm9, %xmm11 #88.5 - addps %xmm9, %xmm10 #88.5 - xorps %xmm3, %xmm11 #88.5 - movaps %xmm2, %xmm3 #88.5 - shufps $177, %xmm11, %xmm11 #88.5 - subps %xmm10, %xmm3 #88.5 - addps %xmm10, %xmm2 #88.5 - addps %xmm11, %xmm12 #88.5 - subps %xmm11, %xmm1 #88.5 - shufps $238, %xmm4, %xmm5 #88.5 - movaps %xmm5, 48(%rdx,%r12,4) #88.5 - movaps %xmm6, 32(%rdx,%r12,4) #88.5 - movaps %xmm2, (%rdx,%r11,4) #88.5 - movaps %xmm1, 16(%rdx,%r11,4) #88.5 - movaps %xmm3, 32(%rdx,%r11,4) #88.5 - movaps %xmm12, 48(%rdx,%r11,4) #88.5 - -#ifdef __APPLE__ - .globl _leaf_oe -_leaf_oe: -#else - .globl leaf_oe -leaf_oe: -#endif - movaps (%r9), %xmm0 #59.5 - #movaps 0x20(%r9), %xmm1 #59.5 -LEAF_OE_const_2: - movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5 -LEAF_OE_const_3: - movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5 - movaps %xmm6, %xmm10 #70.5 - shufps $228, %xmm8, %xmm10 #70.5 - movaps %xmm10, %xmm9 #70.5 - shufps $228, %xmm6, %xmm8 #70.5 -LEAF_OE_const_0: - movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5 -LEAF_OE_const_1: - movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5 - movaps %xmm12, %xmm14 #70.5 - movslq (%r8, %rax, 4), %r11 #83.44 - addps %xmm8, %xmm9 #70.5 - subps %xmm8, %xmm10 #70.5 - addps %xmm7, %xmm14 #70.5 - subps %xmm7, %xmm12 #70.5 - movaps %xmm9, %xmm4 #70.5 - movaps %xmm14, %xmm13 #70.5 - shufps $238, %xmm10, %xmm4 #70.5 - xorps %xmm0, %xmm10 #70.5 - shufps $177, %xmm10, %xmm10 #70.5 - movaps %xmm12, %xmm11 #70.5 - movaps %xmm14, %xmm5 #70.5 - addps %xmm9, %xmm13 #70.5 - subps %xmm10, %xmm11 #70.5 - subps %xmm9, %xmm14 #70.5 - shufps $238, %xmm12, %xmm5 #70.5 - addps %xmm10, %xmm12 #70.5 - movslq 8(%r8, %rax, 4), %r12 #83.59 - movlhps %xmm11, %xmm13 #70.5 - movaps %xmm13, (%rdx,%r11,4) #70.5 - movaps 0x30(%r9), %xmm13 #70.5 - movlhps %xmm12, %xmm14 #70.5 - movaps 0x40(%r9), %xmm12 #70.5 - mulps %xmm5, %xmm13 #70.5 - shufps $177, %xmm5, %xmm5 #70.5 - mulps %xmm12, %xmm5 #70.5 - movaps %xmm14, 16(%rdx,%r11,4) #70.5 - subps %xmm5, %xmm13 #70.5 - movaps 0x30(%r9), %xmm5 #70.5 - mulps %xmm4, %xmm5 #70.5 - shufps $177, %xmm4, %xmm4 #70.5 - mulps %xmm12, %xmm4 #70.5 -LEAF_OE_const_4: - movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5 - addps %xmm4, %xmm5 #70.5 -LEAF_OE_const_6: - movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5 - movaps %xmm9, %xmm3 #70.5 -LEAF_OE_const_7: - movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5 - movaps %xmm7, %xmm6 #70.5 -LEAF_OE_const_5: - movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5 - movaps %xmm13, %xmm4 #70.5 - subps %xmm2, %xmm7 #70.5 - addps %xmm15, %xmm3 #70.5 - subps %xmm15, %xmm9 #70.5 - addps %xmm2, %xmm6 #70.5 - subps %xmm5, %xmm13 #70.5 - addps %xmm5, %xmm4 #70.5 - xorps %xmm0, %xmm7 #70.5 - addq $4, %rax #72.5 - movaps %xmm3, %xmm2 #70.5 - shufps $177, %xmm7, %xmm7 #70.5 - movaps %xmm9, %xmm8 #70.5 - xorps %xmm0, %xmm13 #70.5 - addps %xmm6, %xmm2 #70.5 - subps %xmm7, %xmm8 #70.5 - subps %xmm6, %xmm3 #70.5 - addps %xmm7, %xmm9 #70.5 - movaps %xmm2, %xmm10 #70.5 - movaps %xmm3, %xmm11 #70.5 - shufps $238, %xmm8, %xmm2 #70.5 - shufps $238, %xmm9, %xmm3 #70.5 - movaps %xmm2, %xmm14 #70.5 - shufps $177, %xmm13, %xmm13 #70.5 - subps %xmm4, %xmm14 #70.5 - addps %xmm4, %xmm2 #70.5 - movaps %xmm3, %xmm4 #70.5 - subps %xmm13, %xmm3 #70.5 - addps %xmm13, %xmm4 #70.5 - movlhps %xmm8, %xmm10 #70.5 - movlhps %xmm9, %xmm11 #70.5 - movaps %xmm10, 32(%rdx,%r11,4) #70.5 - movaps %xmm11, 48(%rdx,%r11,4) #70.5 - movaps %xmm2, (%rdx,%r12,4) #70.5 - movaps %xmm3, 16(%rdx,%r12,4) #70.5 - movaps %xmm14, 32(%rdx,%r12,4) #70.5 - movaps %xmm4, 48(%rdx,%r12,4) #70.5 - -#ifdef __APPLE__ - .globl _leaf_end -_leaf_end: -#else - .globl leaf_end -leaf_end: -#endif - -#ifdef __APPLE__ - .globl _x_init -_x_init: -#else - .globl x_init -x_init: -#endif - #movaps L_sse_constants(%rip), %xmm3 #34.3 - movaps (%r9), %xmm3 #34.3 - movq 0x20(%rdi), %r8 -#ifdef __APPLE__ - .globl _x4 -_x4: -#else - .globl x4 -x4: -#endif - movaps 64(%rdx), %xmm0 #34.3 - movaps 96(%rdx), %xmm1 #34.3 - movaps (%rdx), %xmm7 #34.3 - movaps (%r8), %xmm4 #const - movaps %xmm7, %xmm9 #34.3 - movaps %xmm4, %xmm6 #34.3 - movaps 16(%r8), %xmm2 #const - mulps %xmm0, %xmm6 #34.3 - mulps %xmm1, %xmm4 #34.3 - shufps $177, %xmm0, %xmm0 #34.3 - shufps $177, %xmm1, %xmm1 #34.3 - mulps %xmm2, %xmm0 #34.3 - mulps %xmm1, %xmm2 #34.3 - subps %xmm0, %xmm6 #34.3 - addps %xmm2, %xmm4 #34.3 - movaps %xmm6, %xmm5 #34.3 - subps %xmm4, %xmm6 #34.3 - addps %xmm4, %xmm5 #34.3 - movaps 32(%rdx), %xmm8 #34.3 - xorps %xmm3, %xmm6 #34.3 - shufps $177, %xmm6, %xmm6 #34.3 - movaps %xmm8, %xmm10 #34.3 - movaps 112(%rdx), %xmm12 #34.3 - subps %xmm5, %xmm9 #34.3 - addps %xmm5, %xmm7 #34.3 - addps %xmm6, %xmm10 #34.3 - subps %xmm6, %xmm8 #34.3 - movaps %xmm7, (%rdx) #34.3 - movaps %xmm8, 32(%rdx) #34.3 - movaps %xmm9, 64(%rdx) #34.3 - movaps %xmm10, 96(%rdx) #34.3 - movaps 32(%r8), %xmm14 #const #34.3 - movaps 80(%rdx), %xmm11 #34.3 - movaps %xmm14, %xmm0 #34.3 - movaps 48(%r8), %xmm13 #const #34.3 - mulps %xmm11, %xmm0 #34.3 - mulps %xmm12, %xmm14 #34.3 - shufps $177, %xmm11, %xmm11 #34.3 - shufps $177, %xmm12, %xmm12 #34.3 - mulps %xmm13, %xmm11 #34.3 - mulps %xmm12, %xmm13 #34.3 - subps %xmm11, %xmm0 #34.3 - addps %xmm13, %xmm14 #34.3 - movaps %xmm0, %xmm15 #34.3 - subps %xmm14, %xmm0 #34.3 - addps %xmm14, %xmm15 #34.3 - xorps %xmm3, %xmm0 #34.3 - movaps 16(%rdx), %xmm1 #34.3 - movaps 48(%rdx), %xmm2 #34.3 - movaps %xmm1, %xmm4 #34.3 - shufps $177, %xmm0, %xmm0 #34.3 - movaps %xmm2, %xmm5 #34.3 - addps %xmm15, %xmm1 #34.3 - subps %xmm0, %xmm2 #34.3 - subps %xmm15, %xmm4 #34.3 - addps %xmm0, %xmm5 #34.3 - movaps %xmm1, 16(%rdx) #34.3 - movaps %xmm2, 48(%rdx) #34.3 - movaps %xmm4, 80(%rdx) #34.3 - movaps %xmm5, 112(%rdx) #34.3 - ret - -# _x8_soft + 5 needs to be 16 byte aligned -#ifdef __APPLE__ - .globl _x8_soft -_x8_soft: -#else - .globl x8_soft -x8_soft: -#endif - xorl %eax, %eax - movq %rdx, %rbx - movq %r8, %rsi - leaq (%rdx,%rcx,4), %r9 - leaq (%r9,%rcx,4), %r10 - leaq (%r10,%rcx,4), %r11 - leaq (%r11,%rcx,4), %r12 - leaq (%r12,%rcx,4), %r13 - leaq (%r13,%rcx,4), %r14 - leaq (%r14,%rcx,4), %r15 -X8_soft_loop: - movaps (%rsi), %xmm9 - movaps (%r10,%rax,4), %xmm6 - movaps %xmm9, %xmm11 - movaps (%r11,%rax,4), %xmm7 - movaps 16(%rsi), %xmm8 - mulps %xmm6, %xmm11 - mulps %xmm7, %xmm9 - shufps $177, %xmm6, %xmm6 - mulps %xmm8, %xmm6 - shufps $177, %xmm7, %xmm7 - subps %xmm6, %xmm11 - mulps %xmm7, %xmm8 - movaps %xmm11, %xmm10 - addps %xmm8, %xmm9 - movaps 32(%rsi), %xmm15 - addps %xmm9, %xmm10 - subps %xmm9, %xmm11 - movaps (%rbx,%rax,4), %xmm5 - movaps %xmm15, %xmm6 - movaps (%r12,%rax,4), %xmm12 - movaps %xmm5, %xmm2 - movaps (%r14,%rax,4), %xmm13 - xorps %xmm3, %xmm11 #const - movaps 48(%rsi), %xmm14 - subps %xmm10, %xmm2 - mulps %xmm12, %xmm6 - addps %xmm10, %xmm5 - mulps %xmm13, %xmm15 - movaps 64(%rsi), %xmm10 - movaps %xmm5, %xmm0 - shufps $177, %xmm12, %xmm12 - shufps $177, %xmm13, %xmm13 - mulps %xmm14, %xmm12 - mulps %xmm13, %xmm14 - subps %xmm12, %xmm6 - addps %xmm14, %xmm15 - movaps (%r13,%rax,4), %xmm7 - movaps %xmm10, %xmm13 - movaps (%r15,%rax,4), %xmm8 - movaps %xmm6, %xmm12 - movaps 80(%rsi), %xmm9 - addq $96, %rsi - mulps %xmm7, %xmm13 - subps %xmm15, %xmm6 - addps %xmm15, %xmm12 - mulps %xmm8, %xmm10 - subps %xmm12, %xmm0 - addps %xmm12, %xmm5 - shufps $177, %xmm7, %xmm7 - xorps %xmm3, %xmm6 #const - shufps $177, %xmm8, %xmm8 - movaps %xmm2, %xmm12 - mulps %xmm9, %xmm7 - mulps %xmm8, %xmm9 - subps %xmm7, %xmm13 - addps %xmm9, %xmm10 - movaps (%r9,%rax,4), %xmm4 - shufps $177, %xmm11, %xmm11 - movaps %xmm4, %xmm1 - shufps $177, %xmm6, %xmm6 - addps %xmm11, %xmm1 - subps %xmm11, %xmm4 - addps %xmm6, %xmm12 - subps %xmm6, %xmm2 - movaps %xmm13, %xmm11 - movaps %xmm4, %xmm14 - movaps %xmm1, %xmm6 - subps %xmm10, %xmm13 - addps %xmm10, %xmm11 - xorps %xmm3, %xmm13 #const - addps %xmm11, %xmm4 - subps %xmm11, %xmm14 - shufps $177, %xmm13, %xmm13 - movaps %xmm5, (%rbx,%rax,4) - movaps %xmm4, (%r9,%rax,4) - movaps %xmm2, (%r10,%rax,4) - subps %xmm13, %xmm1 - addps %xmm13, %xmm6 - movaps %xmm1, (%r11,%rax,4) - movaps %xmm0, (%r12,%rax,4) - movaps %xmm14, (%r13,%rax,4) - movaps %xmm12, (%r14,%rax,4) - movaps %xmm6, (%r15,%rax,4) - addq $4, %rax - cmpq %rcx, %rax - jne X8_soft_loop - ret - -#ifdef __APPLE__ - .globl _x8_soft_end -_x8_soft_end: -#else - .globl x8_soft_end -x8_soft_end: -#endif - -#ifdef __APPLE__ - .globl _x8_hard -_x8_hard: -#else - .globl x8_hard -x8_hard: -#endif - movaps (%r9), %xmm5 -X8_loop: - movaps (%r8), %xmm9 -X8_const_2: - movaps 0xFECA(%rdx,%rax,4), %xmm6 - movaps %xmm9, %xmm11 -X8_const_3: - movaps 0xFECA(%rdx,%rax,4), %xmm7 - movaps 16(%r8), %xmm8 - mulps %xmm6, %xmm11 - mulps %xmm7, %xmm9 - shufps $177, %xmm6, %xmm6 - mulps %xmm8, %xmm6 - shufps $177, %xmm7, %xmm7 - subps %xmm6, %xmm11 - mulps %xmm7, %xmm8 - movaps %xmm11, %xmm10 - addps %xmm8, %xmm9 - movaps 32(%r8), %xmm15 - addps %xmm9, %xmm10 - subps %xmm9, %xmm11 -X8_const_0: - movaps 0xFECA(%rdx,%rax,4), %xmm3 - movaps %xmm15, %xmm6 -X8_const_4: - movaps 0xFECA(%rdx,%rax,4), %xmm12 - movaps %xmm3, %xmm2 -X8_const_6: - movaps 0xFECA(%rdx,%rax,4), %xmm13 - xorps %xmm5, %xmm11 - movaps 48(%r8), %xmm14 - subps %xmm10, %xmm2 - mulps %xmm12, %xmm6 - addps %xmm10, %xmm3 - mulps %xmm13, %xmm15 - movaps 64(%r8), %xmm10 - movaps %xmm3, %xmm0 - shufps $177, %xmm12, %xmm12 - shufps $177, %xmm13, %xmm13 - mulps %xmm14, %xmm12 - mulps %xmm13, %xmm14 - subps %xmm12, %xmm6 - addps %xmm14, %xmm15 -X8_const_5: - movaps 0xFECA(%rdx,%rax,4), %xmm7 - movaps %xmm10, %xmm13 -X8_const_7: - movaps 0xFECA(%rdx,%rax,4), %xmm8 - movaps %xmm6, %xmm12 - movaps 80(%r8), %xmm9 - addq $96, %r8 - mulps %xmm7, %xmm13 - subps %xmm15, %xmm6 - addps %xmm15, %xmm12 - mulps %xmm8, %xmm10 - subps %xmm12, %xmm0 - addps %xmm12, %xmm3 - shufps $177, %xmm7, %xmm7 - xorps %xmm5, %xmm6 - shufps $177, %xmm8, %xmm8 - movaps %xmm2, %xmm12 - mulps %xmm9, %xmm7 - mulps %xmm8, %xmm9 - subps %xmm7, %xmm13 - addps %xmm9, %xmm10 -X8_const_1: - movaps 0xFECA(%rdx,%rax,4), %xmm4 - shufps $177, %xmm11, %xmm11 - movaps %xmm4, %xmm1 - shufps $177, %xmm6, %xmm6 - addps %xmm11, %xmm1 - subps %xmm11, %xmm4 - addps %xmm6, %xmm12 - subps %xmm6, %xmm2 - movaps %xmm13, %xmm11 - movaps %xmm4, %xmm14 - movaps %xmm1, %xmm6 - subps %xmm10, %xmm13 - addps %xmm10, %xmm11 - xorps %xmm5, %xmm13 - addps %xmm11, %xmm4 - subps %xmm11, %xmm14 - shufps $177, %xmm13, %xmm13 -X8_const1_0: - movaps %xmm3, 0xFECA(%rdx,%rax,4) -X8_const1_1: - movaps %xmm4, 0xFECA(%rdx,%rax,4) -X8_const1_2: - movaps %xmm2, 0xFECA(%rdx,%rax,4) - subps %xmm13, %xmm1 - addps %xmm13, %xmm6 -X8_const1_3: - movaps %xmm1, 0xFECA(%rdx,%rax,4) -X8_const1_4: - movaps %xmm0, 0xFECA(%rdx,%rax,4) -X8_const1_5: - movaps %xmm14, 0xFECA(%rdx,%rax,4) -X8_const1_6: - movaps %xmm12, 0xFECA(%rdx,%rax,4) -X8_const1_7: - movaps %xmm6, 0xFECA(%rdx,%rax,4) - addq $4, %rax - cmpq %rcx, %rax - jne X8_loop - -#ifdef __APPLE__ - .globl _sse_leaf_ee_offsets - .globl _sse_leaf_oo_offsets - .globl _sse_leaf_eo_offsets - .globl _sse_leaf_oe_offsets - .align 4 -_sse_leaf_ee_offsets: - .long LEAF_EE_const_0-_leaf_ee+0x4 - .long LEAF_EE_const_1-_leaf_ee+0x5 - .long LEAF_EE_const_2-_leaf_ee+0x5 - .long LEAF_EE_const_3-_leaf_ee+0x5 - .long LEAF_EE_const_4-_leaf_ee+0x5 - .long LEAF_EE_const_5-_leaf_ee+0x5 - .long LEAF_EE_const_6-_leaf_ee+0x4 - .long LEAF_EE_const_7-_leaf_ee+0x5 -_sse_leaf_oo_offsets: - .long LEAF_OO_const_0-_leaf_oo+0x4 - .long LEAF_OO_const_1-_leaf_oo+0x4 - .long LEAF_OO_const_2-_leaf_oo+0x5 - .long LEAF_OO_const_3-_leaf_oo+0x5 - .long LEAF_OO_const_4-_leaf_oo+0x4 - .long LEAF_OO_const_5-_leaf_oo+0x5 - .long LEAF_OO_const_6-_leaf_oo+0x5 - .long LEAF_OO_const_7-_leaf_oo+0x5 -_sse_leaf_eo_offsets: - .long LEAF_EO_const_0-_leaf_eo+0x5 - .long LEAF_EO_const_1-_leaf_eo+0x4 - .long LEAF_EO_const_2-_leaf_eo+0x4 - .long LEAF_EO_const_3-_leaf_eo+0x4 - .long LEAF_EO_const_4-_leaf_eo+0x5 - .long LEAF_EO_const_5-_leaf_eo+0x5 - .long LEAF_EO_const_6-_leaf_eo+0x4 - .long LEAF_EO_const_7-_leaf_eo+0x5 -_sse_leaf_oe_offsets: - .long LEAF_OE_const_0-_leaf_oe+0x5 - .long LEAF_OE_const_1-_leaf_oe+0x4 - .long LEAF_OE_const_2-_leaf_oe+0x4 - .long LEAF_OE_const_3-_leaf_oe+0x5 - .long LEAF_OE_const_4-_leaf_oe+0x5 - .long LEAF_OE_const_5-_leaf_oe+0x5 - .long LEAF_OE_const_6-_leaf_oe+0x4 - .long LEAF_OE_const_7-_leaf_oe+0x4 -#else - .globl sse_leaf_ee_offsets - .globl sse_leaf_oo_offsets - .globl sse_leaf_eo_offsets - .globl sse_leaf_oe_offsets - .align 4 -sse_leaf_ee_offsets: - .long LEAF_EE_const_0-leaf_ee+0x4 - .long LEAF_EE_const_1-leaf_ee+0x5 - .long LEAF_EE_const_2-leaf_ee+0x5 - .long LEAF_EE_const_3-leaf_ee+0x5 - .long LEAF_EE_const_4-leaf_ee+0x5 - .long LEAF_EE_const_5-leaf_ee+0x5 - .long LEAF_EE_const_6-leaf_ee+0x4 - .long LEAF_EE_const_7-leaf_ee+0x5 -sse_leaf_oo_offsets: - .long LEAF_OO_const_0-leaf_oo+0x4 - .long LEAF_OO_const_1-leaf_oo+0x4 - .long LEAF_OO_const_2-leaf_oo+0x5 - .long LEAF_OO_const_3-leaf_oo+0x5 - .long LEAF_OO_const_4-leaf_oo+0x4 - .long LEAF_OO_const_5-leaf_oo+0x5 - .long LEAF_OO_const_6-leaf_oo+0x5 - .long LEAF_OO_const_7-leaf_oo+0x5 -sse_leaf_eo_offsets: - .long LEAF_EO_const_0-leaf_eo+0x5 - .long LEAF_EO_const_1-leaf_eo+0x4 - .long LEAF_EO_const_2-leaf_eo+0x4 - .long LEAF_EO_const_3-leaf_eo+0x4 - .long LEAF_EO_const_4-leaf_eo+0x5 - .long LEAF_EO_const_5-leaf_eo+0x5 - .long LEAF_EO_const_6-leaf_eo+0x4 - .long LEAF_EO_const_7-leaf_eo+0x5 -sse_leaf_oe_offsets: - .long LEAF_OE_const_0-leaf_oe+0x5 - .long LEAF_OE_const_1-leaf_oe+0x4 - .long LEAF_OE_const_2-leaf_oe+0x4 - .long LEAF_OE_const_3-leaf_oe+0x5 - .long LEAF_OE_const_4-leaf_oe+0x5 - .long LEAF_OE_const_5-leaf_oe+0x5 - .long LEAF_OE_const_6-leaf_oe+0x4 - .long LEAF_OE_const_7-leaf_oe+0x4 -#endif - -#ifdef __APPLE__ - .data -#else - .section .data -#endif - .p2align 4 -#ifdef __APPLE__ - .globl _sse_constants -_sse_constants: -#else - .globl sse_constants -sse_constants: -#endif - .long 0x00000000,0x80000000,0x00000000,0x80000000 - .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 - .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3 - .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 - .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3 -#ifdef __APPLE__ - .globl _sse_constants_inv -_sse_constants_inv: -#else - .globl sse_constants_inv -sse_constants_inv: -#endif - .long 0x80000000,0x00000000,0x80000000,0x00000000 - .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 - .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3 - .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 - .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3 |