summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Jukka Ojanen <jukka.ojanen@linkotec.net> 2015-03-17 14:59:12 +0200
committer Jukka Ojanen <jukka.ojanen@linkotec.net> 2015-03-17 14:59:12 +0200
commit 8dc312e88784ef67419f16bfb86defb7f6cc71c1 (patch)
tree 640ef383a11586e6e9645ed5ae5d14db971215ef
parent bc5aa8c1a2006a579b306234848d00a9ae34d362 (diff)
download ffts-8dc312e88784ef67419f16bfb86defb7f6cc71c1.zip
ffts-8dc312e88784ef67419f16bfb86defb7f6cc71c1.tar.gz
Remove dependency on YASM as Windows dynamic code is run-time generated
-rw-r--r-- CMakeLists.txt 26
-rw-r--r-- src/sse_win64.s 828
2 files changed, 0 insertions, 854 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e96218b..1393689 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -226,32 +226,6 @@ elseif(HAVE_XMMINTRIN_H)
)
if(MSVC)
- if(NOT ENABLE_RUNTIME_DYNAMIC_CODE)
- # YASM supports x86 GAS syntax
- set(CMAKE_ASM-ATT_COMPILER yasm)
- enable_language(ASM-ATT)
-
- if(CMAKE_ASM-ATT_COMPILER_WORKS)
- add_custom_command(
- OUTPUT sse_win64.obj
- COMMAND ${CMAKE_ASM-ATT_COMPILER} -f win64 -m amd64
- -o ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj -p gas
- ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s
- DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMENT "Generating sse_win64.obj"
- )
-
- list(APPEND FFTS_SOURCES
- ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj
- src/sse_win64.s
- )
- else()
- message(WARNING "YASM is required, enabling runtime dynamic code.")
- set(ENABLE_RUNTIME_DYNAMIC_CODE ON)
- endif(CMAKE_ASM-ATT_COMPILER_WORKS)
- endif(NOT ENABLE_RUNTIME_DYNAMIC_CODE)
-
if(ENABLE_RUNTIME_DYNAMIC_CODE)
add_definitions(-DSSE_DEFINE_CONSTANTS)
endif(ENABLE_RUNTIME_DYNAMIC_CODE)
diff --git a/src/sse_win64.s b/src/sse_win64.s
deleted file mode 100644
index 193dedd..0000000
--- a/src/sse_win64.s
+++ /dev/null
@@ -1,828 +0,0 @@
-/*
-
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
- .code64
-
-#ifdef __APPLE__
- .globl _leaf_ee_init
-_leaf_ee_init:
-#else
- .globl leaf_ee_init
-leaf_ee_init:
-#endif
-
-# rax is loop counter (init to 0)
-# rcx is a pointer to the ffts_plan
-# rdx is 'in' base pointer
-# rbx is loop max count
-# rsi is constants pointer
-# r9 is offsets pointer
-# r8 is 'out' base pointer
-# scratch: rax r10 r11
-
- xorl %eax, %eax
- movq (%rcx), %r9
- movq 0xe0(%rcx), %rsi
-
-# _leaf_ee + 8 needs 16 byte alignment
-#ifdef __APPLE__
- .globl _leaf_ee
-_leaf_ee:
-#else
- .globl leaf_ee
-leaf_ee:
-#endif
- movaps 32(%rsi), %xmm0 #83.5
- movaps (%rsi), %xmm8 #83.5
-LEAF_EE_1:
-LEAF_EE_const_0:
- movaps 0xFECA(%rdx,%rax,4), %xmm7 #83.5
-LEAF_EE_const_2:
- movaps 0xFECA(%rdx,%rax,4), %xmm12 #83.5
- movaps %xmm7, %xmm6 #83.5
-LEAF_EE_const_3:
- movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5
- movaps %xmm12, %xmm11 #83.5
- subps %xmm10, %xmm12 #83.5
- addps %xmm10, %xmm11 #83.5
- xorps %xmm8, %xmm12 #83.5
-LEAF_EE_const_1:
- movaps 0xFECA(%rdx,%rax,4), %xmm9 #83.5
-LEAF_EE_const_4:
- movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5
- addps %xmm9, %xmm6 #83.5
- subps %xmm9, %xmm7 #83.5
-LEAF_EE_const_5:
- movaps 0xFECA(%rdx,%rax,4), %xmm13 #83.5
- movaps %xmm10, %xmm9 #83.5
-LEAF_EE_const_6:
- movaps 0xFECA(%rdx,%rax,4), %xmm3 #83.5
- movaps %xmm6, %xmm5 #83.5
-LEAF_EE_const_7:
- movaps 0xFECA(%rdx,%rax,4), %xmm14 #83.5
- movaps %xmm3, %xmm15 #83.5
- shufps $177, %xmm12, %xmm12 #83.5
- movaps %xmm7, %xmm4 #83.5
- movslq (%r9, %rax, 4), %r10 #83.44
- subps %xmm13, %xmm10 #83.5
- subps %xmm14, %xmm3 #83.5
- addps %xmm11, %xmm5 #83.5
- subps %xmm11, %xmm6 #83.5
- subps %xmm12, %xmm4 #83.5
- addps %xmm12, %xmm7 #83.5
- addps %xmm13, %xmm9 #83.5
- addps %xmm14, %xmm15 #83.5
- movaps 16(%rsi), %xmm12 #83.5
- movaps %xmm9, %xmm1 #83.5
- movaps 16(%rsi), %xmm11 #83.5
- movaps %xmm5, %xmm2 #83.5
- mulps %xmm10, %xmm12 #83.5
- subps %xmm15, %xmm9 #83.5
- addps %xmm15, %xmm1 #83.5
- mulps %xmm3, %xmm11 #83.5
- addps %xmm1, %xmm2 #83.5
- subps %xmm1, %xmm5 #83.5
- shufps $177, %xmm10, %xmm10 #83.5
- xorps %xmm8, %xmm9 #83.5
- shufps $177, %xmm3, %xmm3 #83.5
- movaps %xmm6, %xmm1 #83.5
- mulps %xmm0, %xmm10 #83.5
- movaps %xmm4, %xmm13 #83.5
- mulps %xmm0, %xmm3 #83.5
- subps %xmm10, %xmm12 #83.5
- addps %xmm3, %xmm11 #83.5
- movaps %xmm12, %xmm3 #83.5
- movaps %xmm7, %xmm14 #83.5
- shufps $177, %xmm9, %xmm9 #83.5
- subps %xmm11, %xmm12 #83.5
- addps %xmm11, %xmm3 #83.5
- subps %xmm9, %xmm1 #83.5
- addps %xmm9, %xmm6 #83.5
- addps %xmm3, %xmm4 #83.5
- subps %xmm3, %xmm13 #83.5
- xorps %xmm8, %xmm12 #83.5
- movaps %xmm2, %xmm3 #83.5
- shufps $177, %xmm12, %xmm12 #83.5
- movaps %xmm6, %xmm9 #83.5
- movslq 8(%r9, %rax, 4), %r11 #83.59
- movlhps %xmm4, %xmm3 #83.5
- addq $4, %rax
- shufps $238, %xmm4, %xmm2 #83.5
- movaps %xmm1, %xmm4 #83.5
- subps %xmm12, %xmm7 #83.5
- addps %xmm12, %xmm14 #83.5
- movlhps %xmm7, %xmm4 #83.5
- shufps $238, %xmm7, %xmm1 #83.5
- movaps %xmm5, %xmm7 #83.5
- movlhps %xmm13, %xmm7 #83.5
- movlhps %xmm14, %xmm9 #83.5
- shufps $238, %xmm13, %xmm5 #83.5
- shufps $238, %xmm14, %xmm6 #83.5
- movaps %xmm3, (%r8,%r10,4) #83.5
- movaps %xmm4, 16(%r8,%r10,4) #83.5
- movaps %xmm7, 32(%r8,%r10,4) #83.5
- movaps %xmm9, 48(%r8,%r10,4) #83.5
- movaps %xmm2, (%r8,%r11,4) #83.5
- movaps %xmm1, 16(%r8,%r11,4) #83.5
- movaps %xmm5, 32(%r8,%r11,4) #83.5
- movaps %xmm6, 48(%r8,%r11,4) #83.5
- cmpq %rbx, %rax
- jne LEAF_EE_1
-
-# _leaf_oo + 3 needs to be 16 byte aligned
-#ifdef __APPLE__
- .globl _leaf_oo
-_leaf_oo:
-#else
- .globl leaf_oo
-leaf_oo:
-#endif
- movaps (%rsi), %xmm5 #92.7
-LEAF_OO_1:
-LEAF_OO_const_0:
- movaps 0xFECA(%rdx,%rax,4), %xmm4 #93.5
- movaps %xmm4, %xmm6 #93.5
-LEAF_OO_const_1:
- movaps 0xFECA(%rdx,%rax,4), %xmm7 #93.5
-LEAF_OO_const_2:
- movaps 0xFECA(%rdx,%rax,4), %xmm10 #93.5
- addps %xmm7, %xmm6 #93.5
- subps %xmm7, %xmm4 #93.5
-LEAF_OO_const_3:
- movaps 0xFECA(%rdx,%rax,4), %xmm8 #93.5
- movaps %xmm10, %xmm9 #93.5
-LEAF_OO_const_4:
- movaps 0xFECA(%rdx,%rax,4), %xmm1 #93.5
- movaps %xmm6, %xmm3 #93.5
-LEAF_OO_const_5:
- movaps 0xFECA(%rdx,%rax,4), %xmm11 #93.5
- movaps %xmm1, %xmm2 #93.5
-LEAF_OO_const_6:
- movaps 0xFECA(%rdx,%rax,4), %xmm14 #93.5
- movaps %xmm4, %xmm15 #93.5
-LEAF_OO_const_7:
- movaps 0xFECA(%rdx,%rax,4), %xmm12 #93.5
- movaps %xmm14, %xmm13 #93.5
- movslq (%r9, %rax, 4), %r10 #83.44
- subps %xmm8, %xmm10 #93.5
- addps %xmm8, %xmm9 #93.5
- addps %xmm11, %xmm2 #93.5
- subps %xmm12, %xmm14 #93.5
- subps %xmm11, %xmm1 #93.5
- addps %xmm12, %xmm13 #93.5
- addps %xmm9, %xmm3 #93.5
- subps %xmm9, %xmm6 #93.5
- xorps %xmm5, %xmm10 #93.5
- xorps %xmm5, %xmm14 #93.5
- shufps $177, %xmm10, %xmm10 #93.5
- movaps %xmm2, %xmm9 #93.5
- shufps $177, %xmm14, %xmm14 #93.5
- movaps %xmm6, %xmm7 #93.5
- movslq 8(%r9, %rax, 4), %r11 #83.59
- addq $4, %rax #92.18
- addps %xmm10, %xmm4 #93.5
- addps %xmm13, %xmm9 #93.5
- subps %xmm13, %xmm2 #93.5
- subps %xmm10, %xmm15 #93.5
- movaps %xmm1, %xmm13 #93.5
- movaps %xmm2, %xmm8 #93.5
- movlhps %xmm4, %xmm7 #93.5
- subps %xmm14, %xmm13 #93.5
- addps %xmm14, %xmm1 #93.5
- shufps $238, %xmm4, %xmm6 #93.5
- movaps %xmm3, %xmm14 #93.5
- movaps %xmm9, %xmm4 #93.5
- movlhps %xmm15, %xmm14 #93.5
- movlhps %xmm13, %xmm4 #93.5
- movlhps %xmm1, %xmm8 #93.5
- shufps $238, %xmm15, %xmm3 #93.5
- shufps $238, %xmm13, %xmm9 #93.5
- shufps $238, %xmm1, %xmm2 #93.5
- movaps %xmm14, (%r8,%r10,4) #93.5
- movaps %xmm7, 16(%r8,%r10,4) #93.5
- movaps %xmm4, 32(%r8,%r10,4) #93.5
- movaps %xmm8, 48(%r8,%r10,4) #93.5
- movaps %xmm3, (%r8,%r11,4) #93.5
- movaps %xmm6, 16(%r8,%r11,4) #93.5
- movaps %xmm9, 32(%r8,%r11,4) #93.5
- movaps %xmm2, 48(%r8,%r11,4) #93.5
- cmpq %rbx, %rax
- jne LEAF_OO_1 # Prob 95% #92.14
-
-#ifdef __APPLE__
- .globl _leaf_eo
-_leaf_eo:
-#else
- .globl leaf_eo
-leaf_eo:
-#endif
-LEAF_EO_const_0:
- movaps 0xFECA(%rdx,%rax,4), %xmm9 #88.5
-LEAF_EO_const_2:
- movaps 0xFECA(%rdx,%rax,4), %xmm7 #88.5
- movaps %xmm9, %xmm11 #88.5
-LEAF_EO_const_3:
- movaps 0xFECA(%rdx,%rax,4), %xmm5 #88.5
- movaps %xmm7, %xmm6 #88.5
-LEAF_EO_const_1:
- movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
- subps %xmm5, %xmm7 #88.5
- addps %xmm4, %xmm11 #88.5
- subps %xmm4, %xmm9 #88.5
- addps %xmm5, %xmm6 #88.5
- movaps (%rsi), %xmm3 #88.5
- movaps %xmm11, %xmm10 #88.5
- xorps %xmm3, %xmm7 #88.5
- movaps %xmm9, %xmm8 #88.5
- shufps $177, %xmm7, %xmm7 #88.5
- addps %xmm6, %xmm10 #88.5
- subps %xmm6, %xmm11 #88.5
- subps %xmm7, %xmm8 #88.5
- addps %xmm7, %xmm9 #88.5
- movslq 8(%r9, %rax, 4), %r11 #83.59
- movaps %xmm10, %xmm2 #88.5
- movslq (%r9, %rax, 4), %r10 #83.44
- movaps %xmm11, %xmm1 #88.5
- shufps $238, %xmm8, %xmm10 #88.5
- shufps $238, %xmm9, %xmm11 #88.5
- movaps %xmm10, (%r8,%r11,4) #88.5
- movaps %xmm11, 16(%r8,%r11,4) #88.5
-LEAF_EO_const_4:
- movaps 0xFECA(%rdx,%rax,4), %xmm15 #88.5
-LEAF_EO_const_5:
- movaps 0xFECA(%rdx,%rax,4), %xmm12 #88.5
- movaps %xmm15, %xmm14 #88.5
-LEAF_EO_const_6:
- movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
- addps %xmm12, %xmm14 #88.5
- subps %xmm12, %xmm15 #88.5
-LEAF_EO_const_7:
- movaps 0xFECA(%rdx,%rax,4), %xmm13 #88.5
- movaps %xmm4, %xmm5 #88.5
- movaps %xmm14, %xmm7 #88.5
- addps %xmm13, %xmm5 #88.5
- subps %xmm13, %xmm4 #88.5
- movlhps %xmm8, %xmm2 #88.5
- movaps %xmm5, %xmm8 #88.5
- movlhps %xmm15, %xmm7 #88.5
- xorps %xmm3, %xmm15 #88.5
- movaps %xmm5, %xmm6 #88.5
- subps %xmm14, %xmm5 #88.5
- addps %xmm14, %xmm6 #88.5
- movlhps %xmm9, %xmm1 #88.5
- movaps %xmm4, %xmm14 #88.5
- movlhps %xmm4, %xmm8 #88.5
- movaps %xmm1, %xmm12 #88.5
- shufps $177, %xmm15, %xmm15 #88.5
- movaps 0x30(%rsi), %xmm11 #88.5
- addq $4, %rax #90.5
- subps %xmm15, %xmm14 #88.5
- mulps %xmm7, %xmm11 #88.5
- addps %xmm15, %xmm4 #88.5
- movaps 0x30(%rsi), %xmm9 #88.5
- movaps 0x40(%rsi), %xmm15 #88.5
- shufps $177, %xmm7, %xmm7 #88.5
- mulps %xmm8, %xmm9 #88.5
- mulps %xmm15, %xmm7 #88.5
- shufps $177, %xmm8, %xmm8 #88.5
- subps %xmm7, %xmm11 #88.5
- mulps %xmm15, %xmm8 #88.5
- movaps %xmm11, %xmm10 #88.5
- addps %xmm8, %xmm9 #88.5
- shufps $238, %xmm14, %xmm6 #88.5
- subps %xmm9, %xmm11 #88.5
- addps %xmm9, %xmm10 #88.5
- xorps %xmm3, %xmm11 #88.5
- movaps %xmm2, %xmm3 #88.5
- shufps $177, %xmm11, %xmm11 #88.5
- subps %xmm10, %xmm3 #88.5
- addps %xmm10, %xmm2 #88.5
- addps %xmm11, %xmm12 #88.5
- subps %xmm11, %xmm1 #88.5
- shufps $238, %xmm4, %xmm5 #88.5
- movaps %xmm5, 48(%r8,%r11,4) #88.5
- movaps %xmm6, 32(%r8,%r11,4) #88.5
- movaps %xmm2, (%r8,%r10,4) #88.5
- movaps %xmm1, 16(%r8,%r10,4) #88.5
- movaps %xmm3, 32(%r8,%r10,4) #88.5
- movaps %xmm12, 48(%r8,%r10,4) #88.5
-
-#ifdef __APPLE__
- .globl _leaf_oe
-_leaf_oe:
-#else
- .globl leaf_oe
-leaf_oe:
-#endif
- movaps (%rsi), %xmm0 #59.5
-LEAF_OE_const_2:
- movaps 0xFECA(%rdx,%rax,4), %xmm6 #70.5
-LEAF_OE_const_3:
- movaps 0xFECA(%rdx,%rax,4), %xmm8 #70.5
- movaps %xmm6, %xmm10 #70.5
- shufps $228, %xmm8, %xmm10 #70.5
- movaps %xmm10, %xmm9 #70.5
- shufps $228, %xmm6, %xmm8 #70.5
-LEAF_OE_const_0:
- movaps 0xFECA(%rdx,%rax,4), %xmm12 #70.5
-LEAF_OE_const_1:
- movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
- movaps %xmm12, %xmm14 #70.5
- movslq (%r9, %rax, 4), %r10 #83.44
- addps %xmm8, %xmm9 #70.5
- subps %xmm8, %xmm10 #70.5
- addps %xmm7, %xmm14 #70.5
- subps %xmm7, %xmm12 #70.5
- movaps %xmm9, %xmm4 #70.5
- movaps %xmm14, %xmm13 #70.5
- shufps $238, %xmm10, %xmm4 #70.5
- xorps %xmm0, %xmm10 #70.5
- shufps $177, %xmm10, %xmm10 #70.5
- movaps %xmm12, %xmm11 #70.5
- movaps %xmm14, %xmm5 #70.5
- addps %xmm9, %xmm13 #70.5
- subps %xmm10, %xmm11 #70.5
- subps %xmm9, %xmm14 #70.5
- shufps $238, %xmm12, %xmm5 #70.5
- addps %xmm10, %xmm12 #70.5
- movslq 8(%r9, %rax, 4), %r11 #83.59
- movlhps %xmm11, %xmm13 #70.5
- movaps %xmm13, (%r8,%r10,4) #70.5
- movaps 0x30(%rsi), %xmm13 #70.5
- movlhps %xmm12, %xmm14 #70.5
- movaps 0x40(%rsi), %xmm12 #70.5
- mulps %xmm5, %xmm13 #70.5
- shufps $177, %xmm5, %xmm5 #70.5
- mulps %xmm12, %xmm5 #70.5
- movaps %xmm14, 16(%r8,%r10,4) #70.5
- subps %xmm5, %xmm13 #70.5
- movaps 0x30(%rsi), %xmm5 #70.5
- mulps %xmm4, %xmm5 #70.5
- shufps $177, %xmm4, %xmm4 #70.5
- mulps %xmm12, %xmm4 #70.5
-LEAF_OE_const_4:
- movaps 0xFECA(%rdx,%rax,4), %xmm9 #70.5
- addps %xmm4, %xmm5 #70.5
-LEAF_OE_const_6:
- movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
- movaps %xmm9, %xmm3 #70.5
-LEAF_OE_const_7:
- movaps 0xFECA(%rdx,%rax,4), %xmm2 #70.5
- movaps %xmm7, %xmm6 #70.5
-LEAF_OE_const_5:
- movaps 0xFECA(%rdx,%rax,4), %xmm15 #70.5
- movaps %xmm13, %xmm4 #70.5
- subps %xmm2, %xmm7 #70.5
- addps %xmm15, %xmm3 #70.5
- subps %xmm15, %xmm9 #70.5
- addps %xmm2, %xmm6 #70.5
- subps %xmm5, %xmm13 #70.5
- addps %xmm5, %xmm4 #70.5
- xorps %xmm0, %xmm7 #70.5
- addq $4, %rax #72.5
- movaps %xmm3, %xmm2 #70.5
- shufps $177, %xmm7, %xmm7 #70.5
- movaps %xmm9, %xmm8 #70.5
- xorps %xmm0, %xmm13 #70.5
- addps %xmm6, %xmm2 #70.5
- subps %xmm7, %xmm8 #70.5
- subps %xmm6, %xmm3 #70.5
- addps %xmm7, %xmm9 #70.5
- movaps %xmm2, %xmm10 #70.5
- movaps %xmm3, %xmm11 #70.5
- shufps $238, %xmm8, %xmm2 #70.5
- shufps $238, %xmm9, %xmm3 #70.5
- movaps %xmm2, %xmm14 #70.5
- shufps $177, %xmm13, %xmm13 #70.5
- subps %xmm4, %xmm14 #70.5
- addps %xmm4, %xmm2 #70.5
- movaps %xmm3, %xmm4 #70.5
- subps %xmm13, %xmm3 #70.5
- addps %xmm13, %xmm4 #70.5
- movlhps %xmm8, %xmm10 #70.5
- movlhps %xmm9, %xmm11 #70.5
- movaps %xmm10, 32(%r8,%r10,4) #70.5
- movaps %xmm11, 48(%r8,%r10,4) #70.5
- movaps %xmm2, (%r8,%r11,4) #70.5
- movaps %xmm3, 16(%r8,%r11,4) #70.5
- movaps %xmm14, 32(%r8,%r11,4) #70.5
- movaps %xmm4, 48(%r8,%r11,4) #70.5
-
-#ifdef __APPLE__
- .globl _leaf_end
-_leaf_end:
-#else
- .globl leaf_end
-leaf_end:
-#endif
-
-#ifdef __APPLE__
- .globl _x_init
-_x_init:
-#else
- .globl x_init
-x_init:
-#endif
- movaps (%rsi), %xmm3 #34.3
- movq 0x20(%rcx), %r9
-#ifdef __APPLE__
- .globl _x4
-_x4:
-#else
- .globl x4
-x4:
-#endif
- movaps 64(%r8), %xmm0 #34.3
- movaps 96(%r8), %xmm1 #34.3
- movaps (%r8), %xmm7 #34.3
- movaps (%r9), %xmm4 #const
- movaps %xmm7, %xmm9 #34.3
- movaps %xmm4, %xmm6 #34.3
- movaps 16(%r9), %xmm2 #const
- mulps %xmm0, %xmm6 #34.3
- mulps %xmm1, %xmm4 #34.3
- shufps $177, %xmm0, %xmm0 #34.3
- shufps $177, %xmm1, %xmm1 #34.3
- mulps %xmm2, %xmm0 #34.3
- mulps %xmm1, %xmm2 #34.3
- subps %xmm0, %xmm6 #34.3
- addps %xmm2, %xmm4 #34.3
- movaps %xmm6, %xmm5 #34.3
- subps %xmm4, %xmm6 #34.3
- addps %xmm4, %xmm5 #34.3
- movaps 32(%r8), %xmm8 #34.3
- xorps %xmm3, %xmm6 #34.3
- shufps $177, %xmm6, %xmm6 #34.3
- movaps %xmm8, %xmm10 #34.3
- movaps 112(%r8), %xmm12 #34.3
- subps %xmm5, %xmm9 #34.3
- addps %xmm5, %xmm7 #34.3
- addps %xmm6, %xmm10 #34.3
- subps %xmm6, %xmm8 #34.3
- movaps %xmm7, (%r8) #34.3
- movaps %xmm8, 32(%r8) #34.3
- movaps %xmm9, 64(%r8) #34.3
- movaps %xmm10, 96(%r8) #34.3
- movaps 32(%r9), %xmm14 #const #34.3
- movaps 80(%r8), %xmm11 #34.3
- movaps %xmm14, %xmm0 #34.3
- movaps 48(%r9), %xmm13 #const #34.3
- mulps %xmm11, %xmm0 #34.3
- mulps %xmm12, %xmm14 #34.3
- shufps $177, %xmm11, %xmm11 #34.3
- shufps $177, %xmm12, %xmm12 #34.3
- mulps %xmm13, %xmm11 #34.3
- mulps %xmm12, %xmm13 #34.3
- subps %xmm11, %xmm0 #34.3
- addps %xmm13, %xmm14 #34.3
- movaps %xmm0, %xmm15 #34.3
- subps %xmm14, %xmm0 #34.3
- addps %xmm14, %xmm15 #34.3
- xorps %xmm3, %xmm0 #34.3
- movaps 16(%r8), %xmm1 #34.3
- movaps 48(%r8), %xmm2 #34.3
- movaps %xmm1, %xmm4 #34.3
- shufps $177, %xmm0, %xmm0 #34.3
- movaps %xmm2, %xmm5 #34.3
- addps %xmm15, %xmm1 #34.3
- subps %xmm0, %xmm2 #34.3
- subps %xmm15, %xmm4 #34.3
- addps %xmm0, %xmm5 #34.3
- movaps %xmm1, 16(%r8) #34.3
- movaps %xmm2, 48(%r8) #34.3
- movaps %xmm4, 80(%r8) #34.3
- movaps %xmm5, 112(%r8) #34.3
- ret
-
-# _x8_soft + 6 needs to be 16 byte aligned
-#ifdef __APPLE__
- .globl _x8_soft
-_x8_soft:
-#else
- .globl x8_soft
-x8_soft:
-#endif
- # rax, rcx, rdx, r8, r9, r10, r11
- # rbx, rsi
-
- # input
- movq %r9, %rax
-
- # output
- movq %r8, %rcx
-
- # loop stop (output + output_stride)
- leaq (%r8, %rbx), %rdx
-
- # 3 * output_stride
- leaq (%rbx, %rbx, 2), %rsi
-
- # 5 * output_stride
- leaq (%rbx, %rbx, 4), %r10
-
- # 7 * output_stride
- leaq (%rsi, %rbx, 4), %r11
-
-X8_soft_loop:
- # input + 0 * input_stride
- movaps (%rax), %xmm9
-
- # output + 2 * output_stride
- movaps (%rcx, %rbx, 2), %xmm6
-
- movaps %xmm9, %xmm11
-
- # output + 3 * output_stride
- movaps (%rcx, %rsi), %xmm7
-
- # input + 1 * input_stride
- movaps 16(%rax), %xmm8
-
- mulps %xmm6, %xmm11
- mulps %xmm7, %xmm9
- shufps $177, %xmm6, %xmm6
- mulps %xmm8, %xmm6
- shufps $177, %xmm7, %xmm7
- subps %xmm6, %xmm11
- mulps %xmm7, %xmm8
- movaps %xmm11, %xmm10
- addps %xmm8, %xmm9
-
- # input + 2 * input_stride
- movaps 32(%rax), %xmm15
-
- addps %xmm9, %xmm10
- subps %xmm9, %xmm11
-
- # output + 0 * output_stride
- movaps (%rcx), %xmm5
-
- movaps %xmm15, %xmm6
-
- # output + 4 * output_stride
- movaps (%rcx, %rbx, 4), %xmm12
-
- movaps %xmm5, %xmm2
-
- # output + 6 * output_stride
- movaps (%rcx, %rsi, 2), %xmm13
-
- xorps %xmm3, %xmm11 #const
-
- # input + 3 * input_stride
- movaps 48(%rax), %xmm14
-
- subps %xmm10, %xmm2
- mulps %xmm12, %xmm6
- addps %xmm10, %xmm5
- mulps %xmm13, %xmm15
-
- # input + 4 * input_stride
- movaps 64(%rax), %xmm10
-
- movaps %xmm5, %xmm0
- shufps $177, %xmm12, %xmm12
- shufps $177, %xmm13, %xmm13
- mulps %xmm14, %xmm12
- mulps %xmm13, %xmm14
- subps %xmm12, %xmm6
- addps %xmm14, %xmm15
-
- # output + 5 * output_stride
- movaps (%rcx, %r10), %xmm7
-
- movaps %xmm10, %xmm13
-
- # output + 7 * output_stride
- movaps (%rcx, %r11), %xmm8
-
- movaps %xmm6, %xmm12
-
- # input + 5 * input_stride
- movaps 80(%rax), %xmm9
-
- # input + 6 * input_stride
- addq $96, %rax
-
- mulps %xmm7, %xmm13
- subps %xmm15, %xmm6
- addps %xmm15, %xmm12
- mulps %xmm8, %xmm10
- subps %xmm12, %xmm0
- addps %xmm12, %xmm5
- shufps $177, %xmm7, %xmm7
- xorps %xmm3, %xmm6 #const
- shufps $177, %xmm8, %xmm8
- movaps %xmm2, %xmm12
- mulps %xmm9, %xmm7
- mulps %xmm8, %xmm9
- subps %xmm7, %xmm13
- addps %xmm9, %xmm10
-
- # output + 1 * output_stride
- movaps (%rcx, %rbx), %xmm4
-
- shufps $177, %xmm11, %xmm11
- movaps %xmm4, %xmm1
- shufps $177, %xmm6, %xmm6
- addps %xmm11, %xmm1
- subps %xmm11, %xmm4
- addps %xmm6, %xmm12
- subps %xmm6, %xmm2
- movaps %xmm13, %xmm11
- movaps %xmm4, %xmm14
- movaps %xmm1, %xmm6
- subps %xmm10, %xmm13
- addps %xmm10, %xmm11
- xorps %xmm3, %xmm13 #const
- addps %xmm11, %xmm4
- subps %xmm11, %xmm14
- shufps $177, %xmm13, %xmm13
-
- # output + 0 * output_stride
- movaps %xmm5, (%rcx)
-
- # output + 1 * output_stride
- movaps %xmm4, (%rcx, %rbx)
-
- # output + 2 * output_stride
- movaps %xmm2, (%rcx, %rbx, 2)
-
- subps %xmm13, %xmm1
- addps %xmm13, %xmm6
-
- # output + 3 * output_stride
- movaps %xmm1, (%rcx, %rsi)
-
- # output + 4 * output_stride
- movaps %xmm0, (%rcx, %rbx, 4)
-
- # output + 5 * output_stride
- movaps %xmm14, (%rcx, %r10)
-
- # output + 6 * output_stride
- movaps %xmm12, (%rcx, %rsi, 2)
-
- # output + 7 * output_stride
- movaps %xmm6, (%rcx, %r11)
-
- # output + 8 * output_stride
- addq $16, %rcx
-
- cmpq %rdx, %rcx
- jne X8_soft_loop
- ret
-
-#ifdef __APPLE__
- .globl _x8_soft_end
-_x8_soft_end:
-#else
- .globl x8_soft_end
-x8_soft_end:
-
-#ifdef __APPLE__
- .globl _sse_leaf_ee_offsets
- .globl _sse_leaf_oo_offsets
- .globl _sse_leaf_eo_offsets
- .globl _sse_leaf_oe_offsets
- .align 4
-_sse_leaf_ee_offsets:
- .long LEAF_EE_const_0-_leaf_ee+0x4
- .long LEAF_EE_const_1-_leaf_ee+0x5
- .long LEAF_EE_const_2-_leaf_ee+0x5
- .long LEAF_EE_const_3-_leaf_ee+0x5
- .long LEAF_EE_const_4-_leaf_ee+0x5
- .long LEAF_EE_const_5-_leaf_ee+0x5
- .long LEAF_EE_const_6-_leaf_ee+0x4
- .long LEAF_EE_const_7-_leaf_ee+0x5
-_sse_leaf_oo_offsets:
- .long LEAF_OO_const_0-_leaf_oo+0x4
- .long LEAF_OO_const_1-_leaf_oo+0x4
- .long LEAF_OO_const_2-_leaf_oo+0x5
- .long LEAF_OO_const_3-_leaf_oo+0x5
- .long LEAF_OO_const_4-_leaf_oo+0x4
- .long LEAF_OO_const_5-_leaf_oo+0x5
- .long LEAF_OO_const_6-_leaf_oo+0x5
- .long LEAF_OO_const_7-_leaf_oo+0x5
-_sse_leaf_eo_offsets:
- .long LEAF_EO_const_0-_leaf_eo+0x5
- .long LEAF_EO_const_1-_leaf_eo+0x4
- .long LEAF_EO_const_2-_leaf_eo+0x4
- .long LEAF_EO_const_3-_leaf_eo+0x4
- .long LEAF_EO_const_4-_leaf_eo+0x5
- .long LEAF_EO_const_5-_leaf_eo+0x5
- .long LEAF_EO_const_6-_leaf_eo+0x4
- .long LEAF_EO_const_7-_leaf_eo+0x5
-_sse_leaf_oe_offsets:
- .long LEAF_OE_const_0-_leaf_oe+0x5
- .long LEAF_OE_const_1-_leaf_oe+0x4
- .long LEAF_OE_const_2-_leaf_oe+0x4
- .long LEAF_OE_const_3-_leaf_oe+0x5
- .long LEAF_OE_const_4-_leaf_oe+0x5
- .long LEAF_OE_const_5-_leaf_oe+0x5
- .long LEAF_OE_const_6-_leaf_oe+0x4
- .long LEAF_OE_const_7-_leaf_oe+0x4
-#else
- .globl sse_leaf_ee_offsets
- .globl sse_leaf_oo_offsets
- .globl sse_leaf_eo_offsets
- .globl sse_leaf_oe_offsets
- .align 4
-sse_leaf_ee_offsets:
- .long LEAF_EE_const_0-leaf_ee+0x4
- .long LEAF_EE_const_1-leaf_ee+0x5
- .long LEAF_EE_const_2-leaf_ee+0x5
- .long LEAF_EE_const_3-leaf_ee+0x5
- .long LEAF_EE_const_4-leaf_ee+0x5
- .long LEAF_EE_const_5-leaf_ee+0x5
- .long LEAF_EE_const_6-leaf_ee+0x4
- .long LEAF_EE_const_7-leaf_ee+0x5
-sse_leaf_oo_offsets:
- .long LEAF_OO_const_0-leaf_oo+0x4
- .long LEAF_OO_const_1-leaf_oo+0x4
- .long LEAF_OO_const_2-leaf_oo+0x5
- .long LEAF_OO_const_3-leaf_oo+0x5
- .long LEAF_OO_const_4-leaf_oo+0x4
- .long LEAF_OO_const_5-leaf_oo+0x5
- .long LEAF_OO_const_6-leaf_oo+0x5
- .long LEAF_OO_const_7-leaf_oo+0x5
-sse_leaf_eo_offsets:
- .long LEAF_EO_const_0-leaf_eo+0x5
- .long LEAF_EO_const_1-leaf_eo+0x4
- .long LEAF_EO_const_2-leaf_eo+0x4
- .long LEAF_EO_const_3-leaf_eo+0x4
- .long LEAF_EO_const_4-leaf_eo+0x5
- .long LEAF_EO_const_5-leaf_eo+0x5
- .long LEAF_EO_const_6-leaf_eo+0x4
- .long LEAF_EO_const_7-leaf_eo+0x5
-sse_leaf_oe_offsets:
- .long LEAF_OE_const_0-leaf_oe+0x5
- .long LEAF_OE_const_1-leaf_oe+0x4
- .long LEAF_OE_const_2-leaf_oe+0x4
- .long LEAF_OE_const_3-leaf_oe+0x5
- .long LEAF_OE_const_4-leaf_oe+0x5
- .long LEAF_OE_const_5-leaf_oe+0x5
- .long LEAF_OE_const_6-leaf_oe+0x4
- .long LEAF_OE_const_7-leaf_oe+0x4
-#endif
-
-#ifdef __APPLE__
- .data
-#else
- .section .data
-#endif
- .p2align 4
-#ifdef __APPLE__
- .globl _sse_constants
-_sse_constants:
-#else
- .globl sse_constants
-sse_constants:
-#endif
- .long 0x00000000,0x80000000,0x00000000,0x80000000
- .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
- .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
- .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
- .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
-#ifdef __APPLE__
- .globl _sse_constants_inv
-_sse_constants_inv:
-#else
- .globl sse_constants_inv
-sse_constants_inv:
-#endif
- .long 0x80000000,0x00000000,0x80000000,0x00000000
- .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
- .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
- .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
- .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3
OpenPOWER on IntegriCloud