author     Jukka Ojanen <jukka.ojanen@linkotec.net>   2014-10-29 15:15:13 +0200
committer  Jukka Ojanen <jukka.ojanen@linkotec.net>   2014-10-29 15:15:13 +0200
commit     5904d949924cd327dcc21a85464672efd2dc052f (patch)
tree       0cac501e83caa063c29aee385da7260161788f88 /src
parent     e6c375a1b098afa907bb25e53adb1e203fe47370 (diff)
download   ffts-5904d949924cd327dcc21a85464672efd2dc052f.zip
           ffts-5904d949924cd327dcc21a85464672efd2dc052f.tar.gz
YASM requires ".code 64" in assembly
Diffstat (limited to 'src')
-rw-r--r--   src/sse.s   754
1 file changed, 376 insertions, 378 deletions
src/sse.s: apart from two genuinely new lines — the .code64 directive and a register-usage comment — every changed line pair in this patch differs only in leading whitespace (spaces replaced with tabs). The hunk that introduces the additions (whitespace-only pairs shown collapsed):

@@ -31,30 +31,32 @@
 */
+	.code64
 	.globl _neon_x4
 	.align 4
 _neon_x4:
 	.globl _neon_x8
 	.align 4
 _neon_x8:
 	.globl _neon_x8_t
 	.align 4
 _neon_x8_t:
 #ifdef __APPLE__
 	.globl _leaf_ee_init
 _leaf_ee_init:
 #else
 	.globl leaf_ee_init
 leaf_ee_init:
 #endif
 	#lea L_sse_constants(%rip), %r9
 	movq 0xe0(%rdi), %r9
 	xorl %eax, %eax
+# eax is loop counter (init to 0)
 # rcx is loop max count
 # rsi is 'in' base pointer
 # rdx is 'out' base pointer

The remaining hunks (@@ -9,14 +9,14 @@ through @@ -539,340 +537,340 @@) re-indent the BSD license header, the leaf_ee, leaf_oo, leaf_eo and leaf_oe loops, leaf_end, x_init, x4, x8_soft, x8_hard, the sse_leaf_{ee,oo,eo,oe}_offsets tables and the sse_constants / sse_constants_inv data in the same way, drop a few stray blank lines, and change "movq 0x20(%rdi),%r8" to "movq 0x20(%rdi), %r8" in x_init; the instruction stream, offset tables and constant values are otherwise unchanged.
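The .code64 directive asks the assembler to emit 64-bit instruction encodings for the code that follows. GNU as normally infers this from the x86-64 target, while yasm's GAS-compatible parser wants it stated explicitly, which is what this commit addresses. A minimal standalone sketch, not taken from the patch (the label and the build commands are illustrative assumptions; ffts' own build system chooses the real flags):

	.code64                     # select 64-bit instruction encoding
	.text
	.globl demo_zero
demo_zero:
	xorl	%eax, %eax          # return 0 in %eax
	ret

Assembling such a file would typically be done with GNU as (as --64 demo.s -o demo.o) or with yasm's GAS front end (yasm -p gas -f elf64 demo.s -o demo.o).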