diff options
-rw-r--r-- | CMakeLists.txt | 26 | ||||
-rw-r--r-- | src/sse_win64.s | 828 |
2 files changed, 0 insertions, 854 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index e96218b..1393689 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,32 +226,6 @@ elseif(HAVE_XMMINTRIN_H) ) if(MSVC) - if(NOT ENABLE_RUNTIME_DYNAMIC_CODE) - # YASM supports x86 GAS syntax - set(CMAKE_ASM-ATT_COMPILER yasm) - enable_language(ASM-ATT) - - if(CMAKE_ASM-ATT_COMPILER_WORKS) - add_custom_command( - OUTPUT sse_win64.obj - COMMAND ${CMAKE_ASM-ATT_COMPILER} -f win64 -m amd64 - -o ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj -p gas - ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Generating sse_win64.obj" - ) - - list(APPEND FFTS_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj - src/sse_win64.s - ) - else() - message(WARNING "YASM is required, enabling runtime dynamic code.") - set(ENABLE_RUNTIME_DYNAMIC_CODE ON) - endif(CMAKE_ASM-ATT_COMPILER_WORKS) - endif(NOT ENABLE_RUNTIME_DYNAMIC_CODE) - if(ENABLE_RUNTIME_DYNAMIC_CODE) add_definitions(-DSSE_DEFINE_CONSTANTS) endif(ENABLE_RUNTIME_DYNAMIC_CODE) diff --git a/src/sse_win64.s b/src/sse_win64.s deleted file mode 100644 index 193dedd..0000000 --- a/src/sse_win64.s +++ /dev/null @@ -1,828 +0,0 @@ -/* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - .code64 - -#ifdef __APPLE__ - .globl _leaf_ee_init -_leaf_ee_init: -#else - .globl leaf_ee_init -leaf_ee_init: -#endif - -# rax is loop counter (init to 0) -# rcx is a pointer to the ffts_plan -# rdx is 'in' base pointer -# rbx is loop max count -# rsi is constants pointer -# r9 is offsets pointer -# r8 is 'out' base pointer -# scratch: rax r10 r11 - - xorl %eax, %eax - movq (%rcx), %r9 - movq 0xe0(%rcx), %rsi - -# _leaf_ee + 8 needs 16 byte alignment -#ifdef __APPLE__ - .globl _leaf_ee -_leaf_ee: -#else - .globl leaf_ee -leaf_ee: -#endif - movaps 32(%rsi), %xmm0 #83.5 - movaps (%rsi), %xmm8 #83.5 -LEAF_EE_1: -LEAF_EE_const_0: - movaps 0xFECA(%rdx,%rax,4), %xmm7 #83.5 -LEAF_EE_const_2: - movaps 0xFECA(%rdx,%rax,4), %xmm12 #83.5 - movaps %xmm7, %xmm6 #83.5 -LEAF_EE_const_3: - movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5 - movaps %xmm12, %xmm11 #83.5 - subps %xmm10, %xmm12 #83.5 - addps %xmm10, %xmm11 #83.5 - xorps %xmm8, %xmm12 #83.5 -LEAF_EE_const_1: - movaps 0xFECA(%rdx,%rax,4), %xmm9 #83.5 -LEAF_EE_const_4: - movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5 - addps %xmm9, %xmm6 #83.5 - subps %xmm9, %xmm7 #83.5 -LEAF_EE_const_5: - movaps 0xFECA(%rdx,%rax,4), %xmm13 #83.5 - movaps %xmm10, %xmm9 #83.5 -LEAF_EE_const_6: - movaps 0xFECA(%rdx,%rax,4), %xmm3 #83.5 - movaps %xmm6, %xmm5 #83.5 -LEAF_EE_const_7: - movaps 0xFECA(%rdx,%rax,4), %xmm14 #83.5 - movaps %xmm3, %xmm15 #83.5 - shufps $177, %xmm12, %xmm12 #83.5 - movaps %xmm7, %xmm4 #83.5 - movslq (%r9, %rax, 4), %r10 #83.44 - subps %xmm13, %xmm10 #83.5 - subps %xmm14, %xmm3 #83.5 - addps %xmm11, %xmm5 #83.5 - subps %xmm11, %xmm6 #83.5 - subps %xmm12, %xmm4 #83.5 - addps %xmm12, %xmm7 #83.5 - addps %xmm13, %xmm9 #83.5 - addps %xmm14, %xmm15 #83.5 - movaps 16(%rsi), %xmm12 #83.5 - movaps %xmm9, %xmm1 #83.5 - movaps 16(%rsi), %xmm11 #83.5 - movaps %xmm5, %xmm2 #83.5 - mulps %xmm10, %xmm12 #83.5 - subps %xmm15, %xmm9 #83.5 - addps %xmm15, %xmm1 #83.5 - mulps %xmm3, %xmm11 #83.5 - addps %xmm1, %xmm2 #83.5 - subps %xmm1, %xmm5 #83.5 - shufps $177, %xmm10, %xmm10 #83.5 - xorps %xmm8, %xmm9 #83.5 - shufps $177, %xmm3, %xmm3 #83.5 - movaps %xmm6, %xmm1 #83.5 - mulps %xmm0, %xmm10 #83.5 - movaps %xmm4, %xmm13 #83.5 - mulps %xmm0, %xmm3 #83.5 - subps %xmm10, %xmm12 #83.5 - addps %xmm3, %xmm11 #83.5 - movaps %xmm12, %xmm3 #83.5 - movaps %xmm7, %xmm14 #83.5 - shufps $177, %xmm9, %xmm9 #83.5 - subps %xmm11, %xmm12 #83.5 - addps %xmm11, %xmm3 #83.5 - subps %xmm9, %xmm1 #83.5 - addps %xmm9, %xmm6 #83.5 - addps %xmm3, %xmm4 #83.5 - subps %xmm3, %xmm13 #83.5 - xorps %xmm8, %xmm12 #83.5 - movaps %xmm2, %xmm3 #83.5 - shufps $177, %xmm12, %xmm12 #83.5 - movaps %xmm6, %xmm9 #83.5 - movslq 8(%r9, %rax, 4), %r11 #83.59 - movlhps %xmm4, %xmm3 #83.5 - addq $4, %rax - shufps $238, %xmm4, %xmm2 #83.5 - movaps %xmm1, %xmm4 #83.5 - subps %xmm12, %xmm7 #83.5 - addps %xmm12, %xmm14 #83.5 - movlhps %xmm7, %xmm4 #83.5 - shufps $238, %xmm7, %xmm1 #83.5 - movaps %xmm5, %xmm7 #83.5 - movlhps %xmm13, %xmm7 #83.5 - movlhps %xmm14, %xmm9 #83.5 - shufps $238, %xmm13, %xmm5 #83.5 - shufps $238, %xmm14, %xmm6 #83.5 - movaps %xmm3, (%r8,%r10,4) #83.5 - movaps %xmm4, 16(%r8,%r10,4) #83.5 - movaps %xmm7, 32(%r8,%r10,4) #83.5 - movaps %xmm9, 48(%r8,%r10,4) #83.5 - movaps %xmm2, (%r8,%r11,4) #83.5 - movaps %xmm1, 16(%r8,%r11,4) #83.5 - movaps %xmm5, 32(%r8,%r11,4) #83.5 - movaps %xmm6, 48(%r8,%r11,4) #83.5 - cmpq %rbx, %rax - jne LEAF_EE_1 - -# _leaf_oo + 3 needs to be 16 byte aligned -#ifdef __APPLE__ - .globl _leaf_oo -_leaf_oo: -#else - .globl leaf_oo -leaf_oo: -#endif - movaps (%rsi), %xmm5 #92.7 -LEAF_OO_1: -LEAF_OO_const_0: - movaps 0xFECA(%rdx,%rax,4), %xmm4 #93.5 - movaps %xmm4, %xmm6 #93.5 -LEAF_OO_const_1: - movaps 0xFECA(%rdx,%rax,4), %xmm7 #93.5 -LEAF_OO_const_2: - movaps 0xFECA(%rdx,%rax,4), %xmm10 #93.5 - addps %xmm7, %xmm6 #93.5 - subps %xmm7, %xmm4 #93.5 -LEAF_OO_const_3: - movaps 0xFECA(%rdx,%rax,4), %xmm8 #93.5 - movaps %xmm10, %xmm9 #93.5 -LEAF_OO_const_4: - movaps 0xFECA(%rdx,%rax,4), %xmm1 #93.5 - movaps %xmm6, %xmm3 #93.5 -LEAF_OO_const_5: - movaps 0xFECA(%rdx,%rax,4), %xmm11 #93.5 - movaps %xmm1, %xmm2 #93.5 -LEAF_OO_const_6: - movaps 0xFECA(%rdx,%rax,4), %xmm14 #93.5 - movaps %xmm4, %xmm15 #93.5 -LEAF_OO_const_7: - movaps 0xFECA(%rdx,%rax,4), %xmm12 #93.5 - movaps %xmm14, %xmm13 #93.5 - movslq (%r9, %rax, 4), %r10 #83.44 - subps %xmm8, %xmm10 #93.5 - addps %xmm8, %xmm9 #93.5 - addps %xmm11, %xmm2 #93.5 - subps %xmm12, %xmm14 #93.5 - subps %xmm11, %xmm1 #93.5 - addps %xmm12, %xmm13 #93.5 - addps %xmm9, %xmm3 #93.5 - subps %xmm9, %xmm6 #93.5 - xorps %xmm5, %xmm10 #93.5 - xorps %xmm5, %xmm14 #93.5 - shufps $177, %xmm10, %xmm10 #93.5 - movaps %xmm2, %xmm9 #93.5 - shufps $177, %xmm14, %xmm14 #93.5 - movaps %xmm6, %xmm7 #93.5 - movslq 8(%r9, %rax, 4), %r11 #83.59 - addq $4, %rax #92.18 - addps %xmm10, %xmm4 #93.5 - addps %xmm13, %xmm9 #93.5 - subps %xmm13, %xmm2 #93.5 - subps %xmm10, %xmm15 #93.5 - movaps %xmm1, %xmm13 #93.5 - movaps %xmm2, %xmm8 #93.5 - movlhps %xmm4, %xmm7 #93.5 - subps %xmm14, %xmm13 #93.5 - addps %xmm14, %xmm1 #93.5 - shufps $238, %xmm4, %xmm6 #93.5 - movaps %xmm3, %xmm14 #93.5 - movaps %xmm9, %xmm4 #93.5 - movlhps %xmm15, %xmm14 #93.5 - movlhps %xmm13, %xmm4 #93.5 - movlhps %xmm1, %xmm8 #93.5 - shufps $238, %xmm15, %xmm3 #93.5 - shufps $238, %xmm13, %xmm9 #93.5 - shufps $238, %xmm1, %xmm2 #93.5 - movaps %xmm14, (%r8,%r10,4) #93.5 - movaps %xmm7, 16(%r8,%r10,4) #93.5 - movaps %xmm4, 32(%r8,%r10,4) #93.5 - movaps %xmm8, 48(%r8,%r10,4) #93.5 - movaps %xmm3, (%r8,%r11,4) #93.5 - movaps %xmm6, 16(%r8,%r11,4) #93.5 - movaps %xmm9, 32(%r8,%r11,4) #93.5 - movaps %xmm2, 48(%r8,%r11,4) #93.5 - cmpq %rbx, %rax - jne LEAF_OO_1 # Prob 95% #92.14 - -#ifdef __APPLE__ - .globl _leaf_eo -_leaf_eo: -#else - .globl leaf_eo -leaf_eo: -#endif -LEAF_EO_const_0: - movaps 0xFECA(%rdx,%rax,4), %xmm9 #88.5 -LEAF_EO_const_2: - movaps 0xFECA(%rdx,%rax,4), %xmm7 #88.5 - movaps %xmm9, %xmm11 #88.5 -LEAF_EO_const_3: - movaps 0xFECA(%rdx,%rax,4), %xmm5 #88.5 - movaps %xmm7, %xmm6 #88.5 -LEAF_EO_const_1: - movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5 - subps %xmm5, %xmm7 #88.5 - addps %xmm4, %xmm11 #88.5 - subps %xmm4, %xmm9 #88.5 - addps %xmm5, %xmm6 #88.5 - movaps (%rsi), %xmm3 #88.5 - movaps %xmm11, %xmm10 #88.5 - xorps %xmm3, %xmm7 #88.5 - movaps %xmm9, %xmm8 #88.5 - shufps $177, %xmm7, %xmm7 #88.5 - addps %xmm6, %xmm10 #88.5 - subps %xmm6, %xmm11 #88.5 - subps %xmm7, %xmm8 #88.5 - addps %xmm7, %xmm9 #88.5 - movslq 8(%r9, %rax, 4), %r11 #83.59 - movaps %xmm10, %xmm2 #88.5 - movslq (%r9, %rax, 4), %r10 #83.44 - movaps %xmm11, %xmm1 #88.5 - shufps $238, %xmm8, %xmm10 #88.5 - shufps $238, %xmm9, %xmm11 #88.5 - movaps %xmm10, (%r8,%r11,4) #88.5 - movaps %xmm11, 16(%r8,%r11,4) #88.5 -LEAF_EO_const_4: - movaps 0xFECA(%rdx,%rax,4), %xmm15 #88.5 -LEAF_EO_const_5: - movaps 0xFECA(%rdx,%rax,4), %xmm12 #88.5 - movaps %xmm15, %xmm14 #88.5 -LEAF_EO_const_6: - movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5 - addps %xmm12, %xmm14 #88.5 - subps %xmm12, %xmm15 #88.5 -LEAF_EO_const_7: - movaps 0xFECA(%rdx,%rax,4), %xmm13 #88.5 - movaps %xmm4, %xmm5 #88.5 - movaps %xmm14, %xmm7 #88.5 - addps %xmm13, %xmm5 #88.5 - subps %xmm13, %xmm4 #88.5 - movlhps %xmm8, %xmm2 #88.5 - movaps %xmm5, %xmm8 #88.5 - movlhps %xmm15, %xmm7 #88.5 - xorps %xmm3, %xmm15 #88.5 - movaps %xmm5, %xmm6 #88.5 - subps %xmm14, %xmm5 #88.5 - addps %xmm14, %xmm6 #88.5 - movlhps %xmm9, %xmm1 #88.5 - movaps %xmm4, %xmm14 #88.5 - movlhps %xmm4, %xmm8 #88.5 - movaps %xmm1, %xmm12 #88.5 - shufps $177, %xmm15, %xmm15 #88.5 - movaps 0x30(%rsi), %xmm11 #88.5 - addq $4, %rax #90.5 - subps %xmm15, %xmm14 #88.5 - mulps %xmm7, %xmm11 #88.5 - addps %xmm15, %xmm4 #88.5 - movaps 0x30(%rsi), %xmm9 #88.5 - movaps 0x40(%rsi), %xmm15 #88.5 - shufps $177, %xmm7, %xmm7 #88.5 - mulps %xmm8, %xmm9 #88.5 - mulps %xmm15, %xmm7 #88.5 - shufps $177, %xmm8, %xmm8 #88.5 - subps %xmm7, %xmm11 #88.5 - mulps %xmm15, %xmm8 #88.5 - movaps %xmm11, %xmm10 #88.5 - addps %xmm8, %xmm9 #88.5 - shufps $238, %xmm14, %xmm6 #88.5 - subps %xmm9, %xmm11 #88.5 - addps %xmm9, %xmm10 #88.5 - xorps %xmm3, %xmm11 #88.5 - movaps %xmm2, %xmm3 #88.5 - shufps $177, %xmm11, %xmm11 #88.5 - subps %xmm10, %xmm3 #88.5 - addps %xmm10, %xmm2 #88.5 - addps %xmm11, %xmm12 #88.5 - subps %xmm11, %xmm1 #88.5 - shufps $238, %xmm4, %xmm5 #88.5 - movaps %xmm5, 48(%r8,%r11,4) #88.5 - movaps %xmm6, 32(%r8,%r11,4) #88.5 - movaps %xmm2, (%r8,%r10,4) #88.5 - movaps %xmm1, 16(%r8,%r10,4) #88.5 - movaps %xmm3, 32(%r8,%r10,4) #88.5 - movaps %xmm12, 48(%r8,%r10,4) #88.5 - -#ifdef __APPLE__ - .globl _leaf_oe -_leaf_oe: -#else - .globl leaf_oe -leaf_oe: -#endif - movaps (%rsi), %xmm0 #59.5 -LEAF_OE_const_2: - movaps 0xFECA(%rdx,%rax,4), %xmm6 #70.5 -LEAF_OE_const_3: - movaps 0xFECA(%rdx,%rax,4), %xmm8 #70.5 - movaps %xmm6, %xmm10 #70.5 - shufps $228, %xmm8, %xmm10 #70.5 - movaps %xmm10, %xmm9 #70.5 - shufps $228, %xmm6, %xmm8 #70.5 -LEAF_OE_const_0: - movaps 0xFECA(%rdx,%rax,4), %xmm12 #70.5 -LEAF_OE_const_1: - movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5 - movaps %xmm12, %xmm14 #70.5 - movslq (%r9, %rax, 4), %r10 #83.44 - addps %xmm8, %xmm9 #70.5 - subps %xmm8, %xmm10 #70.5 - addps %xmm7, %xmm14 #70.5 - subps %xmm7, %xmm12 #70.5 - movaps %xmm9, %xmm4 #70.5 - movaps %xmm14, %xmm13 #70.5 - shufps $238, %xmm10, %xmm4 #70.5 - xorps %xmm0, %xmm10 #70.5 - shufps $177, %xmm10, %xmm10 #70.5 - movaps %xmm12, %xmm11 #70.5 - movaps %xmm14, %xmm5 #70.5 - addps %xmm9, %xmm13 #70.5 - subps %xmm10, %xmm11 #70.5 - subps %xmm9, %xmm14 #70.5 - shufps $238, %xmm12, %xmm5 #70.5 - addps %xmm10, %xmm12 #70.5 - movslq 8(%r9, %rax, 4), %r11 #83.59 - movlhps %xmm11, %xmm13 #70.5 - movaps %xmm13, (%r8,%r10,4) #70.5 - movaps 0x30(%rsi), %xmm13 #70.5 - movlhps %xmm12, %xmm14 #70.5 - movaps 0x40(%rsi), %xmm12 #70.5 - mulps %xmm5, %xmm13 #70.5 - shufps $177, %xmm5, %xmm5 #70.5 - mulps %xmm12, %xmm5 #70.5 - movaps %xmm14, 16(%r8,%r10,4) #70.5 - subps %xmm5, %xmm13 #70.5 - movaps 0x30(%rsi), %xmm5 #70.5 - mulps %xmm4, %xmm5 #70.5 - shufps $177, %xmm4, %xmm4 #70.5 - mulps %xmm12, %xmm4 #70.5 -LEAF_OE_const_4: - movaps 0xFECA(%rdx,%rax,4), %xmm9 #70.5 - addps %xmm4, %xmm5 #70.5 -LEAF_OE_const_6: - movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5 - movaps %xmm9, %xmm3 #70.5 -LEAF_OE_const_7: - movaps 0xFECA(%rdx,%rax,4), %xmm2 #70.5 - movaps %xmm7, %xmm6 #70.5 -LEAF_OE_const_5: - movaps 0xFECA(%rdx,%rax,4), %xmm15 #70.5 - movaps %xmm13, %xmm4 #70.5 - subps %xmm2, %xmm7 #70.5 - addps %xmm15, %xmm3 #70.5 - subps %xmm15, %xmm9 #70.5 - addps %xmm2, %xmm6 #70.5 - subps %xmm5, %xmm13 #70.5 - addps %xmm5, %xmm4 #70.5 - xorps %xmm0, %xmm7 #70.5 - addq $4, %rax #72.5 - movaps %xmm3, %xmm2 #70.5 - shufps $177, %xmm7, %xmm7 #70.5 - movaps %xmm9, %xmm8 #70.5 - xorps %xmm0, %xmm13 #70.5 - addps %xmm6, %xmm2 #70.5 - subps %xmm7, %xmm8 #70.5 - subps %xmm6, %xmm3 #70.5 - addps %xmm7, %xmm9 #70.5 - movaps %xmm2, %xmm10 #70.5 - movaps %xmm3, %xmm11 #70.5 - shufps $238, %xmm8, %xmm2 #70.5 - shufps $238, %xmm9, %xmm3 #70.5 - movaps %xmm2, %xmm14 #70.5 - shufps $177, %xmm13, %xmm13 #70.5 - subps %xmm4, %xmm14 #70.5 - addps %xmm4, %xmm2 #70.5 - movaps %xmm3, %xmm4 #70.5 - subps %xmm13, %xmm3 #70.5 - addps %xmm13, %xmm4 #70.5 - movlhps %xmm8, %xmm10 #70.5 - movlhps %xmm9, %xmm11 #70.5 - movaps %xmm10, 32(%r8,%r10,4) #70.5 - movaps %xmm11, 48(%r8,%r10,4) #70.5 - movaps %xmm2, (%r8,%r11,4) #70.5 - movaps %xmm3, 16(%r8,%r11,4) #70.5 - movaps %xmm14, 32(%r8,%r11,4) #70.5 - movaps %xmm4, 48(%r8,%r11,4) #70.5 - -#ifdef __APPLE__ - .globl _leaf_end -_leaf_end: -#else - .globl leaf_end -leaf_end: -#endif - -#ifdef __APPLE__ - .globl _x_init -_x_init: -#else - .globl x_init -x_init: -#endif - movaps (%rsi), %xmm3 #34.3 - movq 0x20(%rcx), %r9 -#ifdef __APPLE__ - .globl _x4 -_x4: -#else - .globl x4 -x4: -#endif - movaps 64(%r8), %xmm0 #34.3 - movaps 96(%r8), %xmm1 #34.3 - movaps (%r8), %xmm7 #34.3 - movaps (%r9), %xmm4 #const - movaps %xmm7, %xmm9 #34.3 - movaps %xmm4, %xmm6 #34.3 - movaps 16(%r9), %xmm2 #const - mulps %xmm0, %xmm6 #34.3 - mulps %xmm1, %xmm4 #34.3 - shufps $177, %xmm0, %xmm0 #34.3 - shufps $177, %xmm1, %xmm1 #34.3 - mulps %xmm2, %xmm0 #34.3 - mulps %xmm1, %xmm2 #34.3 - subps %xmm0, %xmm6 #34.3 - addps %xmm2, %xmm4 #34.3 - movaps %xmm6, %xmm5 #34.3 - subps %xmm4, %xmm6 #34.3 - addps %xmm4, %xmm5 #34.3 - movaps 32(%r8), %xmm8 #34.3 - xorps %xmm3, %xmm6 #34.3 - shufps $177, %xmm6, %xmm6 #34.3 - movaps %xmm8, %xmm10 #34.3 - movaps 112(%r8), %xmm12 #34.3 - subps %xmm5, %xmm9 #34.3 - addps %xmm5, %xmm7 #34.3 - addps %xmm6, %xmm10 #34.3 - subps %xmm6, %xmm8 #34.3 - movaps %xmm7, (%r8) #34.3 - movaps %xmm8, 32(%r8) #34.3 - movaps %xmm9, 64(%r8) #34.3 - movaps %xmm10, 96(%r8) #34.3 - movaps 32(%r9), %xmm14 #const #34.3 - movaps 80(%r8), %xmm11 #34.3 - movaps %xmm14, %xmm0 #34.3 - movaps 48(%r9), %xmm13 #const #34.3 - mulps %xmm11, %xmm0 #34.3 - mulps %xmm12, %xmm14 #34.3 - shufps $177, %xmm11, %xmm11 #34.3 - shufps $177, %xmm12, %xmm12 #34.3 - mulps %xmm13, %xmm11 #34.3 - mulps %xmm12, %xmm13 #34.3 - subps %xmm11, %xmm0 #34.3 - addps %xmm13, %xmm14 #34.3 - movaps %xmm0, %xmm15 #34.3 - subps %xmm14, %xmm0 #34.3 - addps %xmm14, %xmm15 #34.3 - xorps %xmm3, %xmm0 #34.3 - movaps 16(%r8), %xmm1 #34.3 - movaps 48(%r8), %xmm2 #34.3 - movaps %xmm1, %xmm4 #34.3 - shufps $177, %xmm0, %xmm0 #34.3 - movaps %xmm2, %xmm5 #34.3 - addps %xmm15, %xmm1 #34.3 - subps %xmm0, %xmm2 #34.3 - subps %xmm15, %xmm4 #34.3 - addps %xmm0, %xmm5 #34.3 - movaps %xmm1, 16(%r8) #34.3 - movaps %xmm2, 48(%r8) #34.3 - movaps %xmm4, 80(%r8) #34.3 - movaps %xmm5, 112(%r8) #34.3 - ret - -# _x8_soft + 6 needs to be 16 byte aligned -#ifdef __APPLE__ - .globl _x8_soft -_x8_soft: -#else - .globl x8_soft -x8_soft: -#endif - # rax, rcx, rdx, r8, r9, r10, r11 - # rbx, rsi - - # input - movq %r9, %rax - - # output - movq %r8, %rcx - - # loop stop (output + output_stride) - leaq (%r8, %rbx), %rdx - - # 3 * output_stride - leaq (%rbx, %rbx, 2), %rsi - - # 5 * output_stride - leaq (%rbx, %rbx, 4), %r10 - - # 7 * output_stride - leaq (%rsi, %rbx, 4), %r11 - -X8_soft_loop: - # input + 0 * input_stride - movaps (%rax), %xmm9 - - # output + 2 * output_stride - movaps (%rcx, %rbx, 2), %xmm6 - - movaps %xmm9, %xmm11 - - # output + 3 * output_stride - movaps (%rcx, %rsi), %xmm7 - - # input + 1 * input_stride - movaps 16(%rax), %xmm8 - - mulps %xmm6, %xmm11 - mulps %xmm7, %xmm9 - shufps $177, %xmm6, %xmm6 - mulps %xmm8, %xmm6 - shufps $177, %xmm7, %xmm7 - subps %xmm6, %xmm11 - mulps %xmm7, %xmm8 - movaps %xmm11, %xmm10 - addps %xmm8, %xmm9 - - # input + 2 * input_stride - movaps 32(%rax), %xmm15 - - addps %xmm9, %xmm10 - subps %xmm9, %xmm11 - - # output + 0 * output_stride - movaps (%rcx), %xmm5 - - movaps %xmm15, %xmm6 - - # output + 4 * output_stride - movaps (%rcx, %rbx, 4), %xmm12 - - movaps %xmm5, %xmm2 - - # output + 6 * output_stride - movaps (%rcx, %rsi, 2), %xmm13 - - xorps %xmm3, %xmm11 #const - - # input + 3 * input_stride - movaps 48(%rax), %xmm14 - - subps %xmm10, %xmm2 - mulps %xmm12, %xmm6 - addps %xmm10, %xmm5 - mulps %xmm13, %xmm15 - - # input + 4 * input_stride - movaps 64(%rax), %xmm10 - - movaps %xmm5, %xmm0 - shufps $177, %xmm12, %xmm12 - shufps $177, %xmm13, %xmm13 - mulps %xmm14, %xmm12 - mulps %xmm13, %xmm14 - subps %xmm12, %xmm6 - addps %xmm14, %xmm15 - - # output + 5 * output_stride - movaps (%rcx, %r10), %xmm7 - - movaps %xmm10, %xmm13 - - # output + 7 * output_stride - movaps (%rcx, %r11), %xmm8 - - movaps %xmm6, %xmm12 - - # input + 5 * input_stride - movaps 80(%rax), %xmm9 - - # input + 6 * input_stride - addq $96, %rax - - mulps %xmm7, %xmm13 - subps %xmm15, %xmm6 - addps %xmm15, %xmm12 - mulps %xmm8, %xmm10 - subps %xmm12, %xmm0 - addps %xmm12, %xmm5 - shufps $177, %xmm7, %xmm7 - xorps %xmm3, %xmm6 #const - shufps $177, %xmm8, %xmm8 - movaps %xmm2, %xmm12 - mulps %xmm9, %xmm7 - mulps %xmm8, %xmm9 - subps %xmm7, %xmm13 - addps %xmm9, %xmm10 - - # output + 1 * output_stride - movaps (%rcx, %rbx), %xmm4 - - shufps $177, %xmm11, %xmm11 - movaps %xmm4, %xmm1 - shufps $177, %xmm6, %xmm6 - addps %xmm11, %xmm1 - subps %xmm11, %xmm4 - addps %xmm6, %xmm12 - subps %xmm6, %xmm2 - movaps %xmm13, %xmm11 - movaps %xmm4, %xmm14 - movaps %xmm1, %xmm6 - subps %xmm10, %xmm13 - addps %xmm10, %xmm11 - xorps %xmm3, %xmm13 #const - addps %xmm11, %xmm4 - subps %xmm11, %xmm14 - shufps $177, %xmm13, %xmm13 - - # output + 0 * output_stride - movaps %xmm5, (%rcx) - - # output + 1 * output_stride - movaps %xmm4, (%rcx, %rbx) - - # output + 2 * output_stride - movaps %xmm2, (%rcx, %rbx, 2) - - subps %xmm13, %xmm1 - addps %xmm13, %xmm6 - - # output + 3 * output_stride - movaps %xmm1, (%rcx, %rsi) - - # output + 4 * output_stride - movaps %xmm0, (%rcx, %rbx, 4) - - # output + 5 * output_stride - movaps %xmm14, (%rcx, %r10) - - # output + 6 * output_stride - movaps %xmm12, (%rcx, %rsi, 2) - - # output + 7 * output_stride - movaps %xmm6, (%rcx, %r11) - - # output + 8 * output_stride - addq $16, %rcx - - cmpq %rdx, %rcx - jne X8_soft_loop - ret - -#ifdef __APPLE__ - .globl _x8_soft_end -_x8_soft_end: -#else - .globl x8_soft_end -x8_soft_end: - -#ifdef __APPLE__ - .globl _sse_leaf_ee_offsets - .globl _sse_leaf_oo_offsets - .globl _sse_leaf_eo_offsets - .globl _sse_leaf_oe_offsets - .align 4 -_sse_leaf_ee_offsets: - .long LEAF_EE_const_0-_leaf_ee+0x4 - .long LEAF_EE_const_1-_leaf_ee+0x5 - .long LEAF_EE_const_2-_leaf_ee+0x5 - .long LEAF_EE_const_3-_leaf_ee+0x5 - .long LEAF_EE_const_4-_leaf_ee+0x5 - .long LEAF_EE_const_5-_leaf_ee+0x5 - .long LEAF_EE_const_6-_leaf_ee+0x4 - .long LEAF_EE_const_7-_leaf_ee+0x5 -_sse_leaf_oo_offsets: - .long LEAF_OO_const_0-_leaf_oo+0x4 - .long LEAF_OO_const_1-_leaf_oo+0x4 - .long LEAF_OO_const_2-_leaf_oo+0x5 - .long LEAF_OO_const_3-_leaf_oo+0x5 - .long LEAF_OO_const_4-_leaf_oo+0x4 - .long LEAF_OO_const_5-_leaf_oo+0x5 - .long LEAF_OO_const_6-_leaf_oo+0x5 - .long LEAF_OO_const_7-_leaf_oo+0x5 -_sse_leaf_eo_offsets: - .long LEAF_EO_const_0-_leaf_eo+0x5 - .long LEAF_EO_const_1-_leaf_eo+0x4 - .long LEAF_EO_const_2-_leaf_eo+0x4 - .long LEAF_EO_const_3-_leaf_eo+0x4 - .long LEAF_EO_const_4-_leaf_eo+0x5 - .long LEAF_EO_const_5-_leaf_eo+0x5 - .long LEAF_EO_const_6-_leaf_eo+0x4 - .long LEAF_EO_const_7-_leaf_eo+0x5 -_sse_leaf_oe_offsets: - .long LEAF_OE_const_0-_leaf_oe+0x5 - .long LEAF_OE_const_1-_leaf_oe+0x4 - .long LEAF_OE_const_2-_leaf_oe+0x4 - .long LEAF_OE_const_3-_leaf_oe+0x5 - .long LEAF_OE_const_4-_leaf_oe+0x5 - .long LEAF_OE_const_5-_leaf_oe+0x5 - .long LEAF_OE_const_6-_leaf_oe+0x4 - .long LEAF_OE_const_7-_leaf_oe+0x4 -#else - .globl sse_leaf_ee_offsets - .globl sse_leaf_oo_offsets - .globl sse_leaf_eo_offsets - .globl sse_leaf_oe_offsets - .align 4 -sse_leaf_ee_offsets: - .long LEAF_EE_const_0-leaf_ee+0x4 - .long LEAF_EE_const_1-leaf_ee+0x5 - .long LEAF_EE_const_2-leaf_ee+0x5 - .long LEAF_EE_const_3-leaf_ee+0x5 - .long LEAF_EE_const_4-leaf_ee+0x5 - .long LEAF_EE_const_5-leaf_ee+0x5 - .long LEAF_EE_const_6-leaf_ee+0x4 - .long LEAF_EE_const_7-leaf_ee+0x5 -sse_leaf_oo_offsets: - .long LEAF_OO_const_0-leaf_oo+0x4 - .long LEAF_OO_const_1-leaf_oo+0x4 - .long LEAF_OO_const_2-leaf_oo+0x5 - .long LEAF_OO_const_3-leaf_oo+0x5 - .long LEAF_OO_const_4-leaf_oo+0x4 - .long LEAF_OO_const_5-leaf_oo+0x5 - .long LEAF_OO_const_6-leaf_oo+0x5 - .long LEAF_OO_const_7-leaf_oo+0x5 -sse_leaf_eo_offsets: - .long LEAF_EO_const_0-leaf_eo+0x5 - .long LEAF_EO_const_1-leaf_eo+0x4 - .long LEAF_EO_const_2-leaf_eo+0x4 - .long LEAF_EO_const_3-leaf_eo+0x4 - .long LEAF_EO_const_4-leaf_eo+0x5 - .long LEAF_EO_const_5-leaf_eo+0x5 - .long LEAF_EO_const_6-leaf_eo+0x4 - .long LEAF_EO_const_7-leaf_eo+0x5 -sse_leaf_oe_offsets: - .long LEAF_OE_const_0-leaf_oe+0x5 - .long LEAF_OE_const_1-leaf_oe+0x4 - .long LEAF_OE_const_2-leaf_oe+0x4 - .long LEAF_OE_const_3-leaf_oe+0x5 - .long LEAF_OE_const_4-leaf_oe+0x5 - .long LEAF_OE_const_5-leaf_oe+0x5 - .long LEAF_OE_const_6-leaf_oe+0x4 - .long LEAF_OE_const_7-leaf_oe+0x4 -#endif - -#ifdef __APPLE__ - .data -#else - .section .data -#endif - .p2align 4 -#ifdef __APPLE__ - .globl _sse_constants -_sse_constants: -#else - .globl sse_constants -sse_constants: -#endif - .long 0x00000000,0x80000000,0x00000000,0x80000000 - .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 - .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3 - .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 - .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3 -#ifdef __APPLE__ - .globl _sse_constants_inv -_sse_constants_inv: -#else - .globl sse_constants_inv -sse_constants_inv: -#endif - .long 0x80000000,0x00000000,0x80000000,0x00000000 - .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 - .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3 - .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 - .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3 |