diff options
Diffstat (limited to 'src/sse.s')
-rw-r--r-- | src/sse.s | 885 |
1 files changed, 885 insertions, 0 deletions
diff --git a/src/sse.s b/src/sse.s new file mode 100644 index 0000000..ccdebc8 --- /dev/null +++ b/src/sse.s @@ -0,0 +1,885 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + .code64 + + .globl _neon_x4 + .align 4 +_neon_x4: + + .globl _neon_x8 + .align 4 +_neon_x8: + + .globl _neon_x8_t + .align 4 +_neon_x8_t: + + +#ifdef __APPLE__ + .globl _leaf_ee_init +_leaf_ee_init: +#else + .globl leaf_ee_init +leaf_ee_init: +#endif + #lea L_sse_constants(%rip), %r9 + movq (%rdi), %r8 + movq 0xe0(%rdi), %r9 + xorl %eax, %eax + +# eax is loop counter (init to 0) +# rcx is loop max count +# rsi is 'in' base pointer +# rdx is 'out' base pointer +# r8 is offsets pointer +# r9 is constants pointer +# scratch: rax r11 r12 +# .align 4, 0x90 + +# _leaf_ee + 9 needs 16 byte alignment +#ifdef __APPLE__ + .globl _leaf_ee +_leaf_ee: +#else + .globl leaf_ee +leaf_ee: +#endif + movaps 32(%r9), %xmm0 #83.5 + movaps (%r9), %xmm8 #83.5 +LEAF_EE_1: +LEAF_EE_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5 +LEAF_EE_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5 + movaps %xmm7, %xmm6 #83.5 +LEAF_EE_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5 + movaps %xmm12, %xmm11 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm10, %xmm11 #83.5 + xorps %xmm8, %xmm12 #83.5 +LEAF_EE_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5 +LEAF_EE_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5 + addps %xmm9, %xmm6 #83.5 + subps %xmm9, %xmm7 #83.5 +LEAF_EE_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5 + movaps %xmm10, %xmm9 #83.5 +LEAF_EE_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5 + movaps %xmm6, %xmm5 #83.5 +LEAF_EE_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5 + movaps %xmm3, %xmm15 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm7, %xmm4 #83.5 + movslq (%r8, %rax, 4), %r11 #83.44 + subps %xmm13, %xmm10 #83.5 + subps %xmm14, %xmm3 #83.5 + addps %xmm11, %xmm5 #83.5 + subps %xmm11, %xmm6 #83.5 + subps %xmm12, %xmm4 #83.5 + addps %xmm12, %xmm7 #83.5 + addps %xmm13, %xmm9 #83.5 + addps %xmm14, %xmm15 #83.5 + movaps 16(%r9), %xmm12 #83.5 + movaps %xmm9, %xmm1 #83.5 + movaps 16(%r9), %xmm11 #83.5 + movaps %xmm5, %xmm2 #83.5 + mulps %xmm10, %xmm12 #83.5 + subps %xmm15, %xmm9 #83.5 + addps %xmm15, %xmm1 #83.5 + mulps %xmm3, %xmm11 #83.5 + addps %xmm1, %xmm2 #83.5 + subps %xmm1, %xmm5 #83.5 + shufps $177, %xmm10, %xmm10 #83.5 + xorps %xmm8, %xmm9 #83.5 + shufps $177, %xmm3, %xmm3 #83.5 + movaps %xmm6, %xmm1 #83.5 + mulps %xmm0, %xmm10 #83.5 + movaps %xmm4, %xmm13 #83.5 + mulps %xmm0, %xmm3 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm3, %xmm11 #83.5 + movaps %xmm12, %xmm3 #83.5 + movaps %xmm7, %xmm14 #83.5 + shufps $177, %xmm9, %xmm9 #83.5 + subps %xmm11, %xmm12 #83.5 + addps %xmm11, %xmm3 #83.5 + subps %xmm9, %xmm1 #83.5 + addps %xmm9, %xmm6 #83.5 + addps %xmm3, %xmm4 #83.5 + subps %xmm3, %xmm13 #83.5 + xorps %xmm8, %xmm12 #83.5 + movaps %xmm2, %xmm3 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm6, %xmm9 #83.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + movlhps %xmm4, %xmm3 #83.5 + addq $4, %rax + shufps $238, %xmm4, %xmm2 #83.5 + movaps %xmm1, %xmm4 #83.5 + #movntdq %xmm3, (%rdx,%r11,4) #83.5 + subps %xmm12, %xmm7 #83.5 + addps %xmm12, %xmm14 #83.5 + movlhps %xmm7, %xmm4 #83.5 + shufps $238, %xmm7, %xmm1 #83.5 + movaps %xmm5, %xmm7 #83.5 + movlhps %xmm13, %xmm7 #83.5 + movlhps %xmm14, %xmm9 #83.5 + shufps $238, %xmm13, %xmm5 #83.5 + shufps $238, %xmm14, %xmm6 #83.5 + movaps %xmm3, (%rdx,%r11,4) #83.5 + movaps %xmm4, 16(%rdx,%r11,4) #83.5 + movaps %xmm7, 32(%rdx,%r11,4) #83.5 + movaps %xmm9, 48(%rdx,%r11,4) #83.5 + movaps %xmm2, (%rdx,%r12,4) #83.5 + movaps %xmm1, 16(%rdx,%r12,4) #83.5 + movaps %xmm5, 32(%rdx,%r12,4) #83.5 + movaps %xmm6, 48(%rdx,%r12,4) #83.5 + cmpq %rcx, %rax + jne LEAF_EE_1 + +# _leaf_oo + 4 needs to be 16 byte aligned +#ifdef __APPLE__ + .globl _leaf_oo +_leaf_oo: +#else + .globl leaf_oo +leaf_oo: +#endif + movaps (%r9), %xmm5 #92.7 +LEAF_OO_1: +LEAF_OO_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5 + movaps %xmm4, %xmm6 #93.5 +LEAF_OO_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5 +LEAF_OO_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5 + addps %xmm7, %xmm6 #93.5 + subps %xmm7, %xmm4 #93.5 +LEAF_OO_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5 + movaps %xmm10, %xmm9 #93.5 +LEAF_OO_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5 + movaps %xmm6, %xmm3 #93.5 +LEAF_OO_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5 + movaps %xmm1, %xmm2 #93.5 +LEAF_OO_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5 + movaps %xmm4, %xmm15 #93.5 +LEAF_OO_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5 + movaps %xmm14, %xmm13 #93.5 + movslq (%r8, %rax, 4), %r11 #83.44 + subps %xmm8, %xmm10 #93.5 + addps %xmm8, %xmm9 #93.5 + addps %xmm11, %xmm2 #93.5 + subps %xmm12, %xmm14 #93.5 + subps %xmm11, %xmm1 #93.5 + addps %xmm12, %xmm13 #93.5 + addps %xmm9, %xmm3 #93.5 + subps %xmm9, %xmm6 #93.5 + xorps %xmm5, %xmm10 #93.5 + xorps %xmm5, %xmm14 #93.5 + shufps $177, %xmm10, %xmm10 #93.5 + movaps %xmm2, %xmm9 #93.5 + shufps $177, %xmm14, %xmm14 #93.5 + movaps %xmm6, %xmm7 #93.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + addq $4, %rax #92.18 + addps %xmm10, %xmm4 #93.5 + addps %xmm13, %xmm9 #93.5 + subps %xmm13, %xmm2 #93.5 + subps %xmm10, %xmm15 #93.5 + movaps %xmm1, %xmm13 #93.5 + movaps %xmm2, %xmm8 #93.5 + movlhps %xmm4, %xmm7 #93.5 + subps %xmm14, %xmm13 #93.5 + addps %xmm14, %xmm1 #93.5 + shufps $238, %xmm4, %xmm6 #93.5 + movaps %xmm3, %xmm14 #93.5 + movaps %xmm9, %xmm4 #93.5 + movlhps %xmm15, %xmm14 #93.5 + movlhps %xmm13, %xmm4 #93.5 + movlhps %xmm1, %xmm8 #93.5 + shufps $238, %xmm15, %xmm3 #93.5 + shufps $238, %xmm13, %xmm9 #93.5 + shufps $238, %xmm1, %xmm2 #93.5 + movaps %xmm14, (%rdx,%r11,4) #93.5 + movaps %xmm7, 16(%rdx,%r11,4) #93.5 + movaps %xmm4, 32(%rdx,%r11,4) #93.5 + movaps %xmm8, 48(%rdx,%r11,4) #93.5 + movaps %xmm3, (%rdx,%r12,4) #93.5 + movaps %xmm6, 16(%rdx,%r12,4) #93.5 + movaps %xmm9, 32(%rdx,%r12,4) #93.5 + movaps %xmm2, 48(%rdx,%r12,4) #93.5 + cmpq %rcx, %rax + jne LEAF_OO_1 # Prob 95% #92.14 + +#ifdef __APPLE__ + .globl _leaf_eo +_leaf_eo: +#else + .globl leaf_eo +leaf_eo: +#endif +LEAF_EO_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5 +LEAF_EO_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5 + movaps %xmm9, %xmm11 #88.5 +LEAF_EO_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5 + movaps %xmm7, %xmm6 #88.5 +LEAF_EO_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5 + subps %xmm5, %xmm7 #88.5 + addps %xmm4, %xmm11 #88.5 + subps %xmm4, %xmm9 #88.5 + addps %xmm5, %xmm6 #88.5 + movaps (%r9), %xmm3 #88.5 + movaps %xmm11, %xmm10 #88.5 + xorps %xmm3, %xmm7 #88.5 + movaps %xmm9, %xmm8 #88.5 + shufps $177, %xmm7, %xmm7 #88.5 + addps %xmm6, %xmm10 #88.5 + subps %xmm6, %xmm11 #88.5 + subps %xmm7, %xmm8 #88.5 + addps %xmm7, %xmm9 #88.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + movaps %xmm10, %xmm2 #88.5 + movslq (%r8, %rax, 4), %r11 #83.44 + movaps %xmm11, %xmm1 #88.5 + shufps $238, %xmm8, %xmm10 #88.5 + shufps $238, %xmm9, %xmm11 #88.5 + movaps %xmm10, (%rdx,%r12,4) #88.5 + movaps %xmm11, 16(%rdx,%r12,4) #88.5 +LEAF_EO_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5 +LEAF_EO_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5 + movaps %xmm15, %xmm14 #88.5 +LEAF_EO_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5 + addps %xmm12, %xmm14 #88.5 + subps %xmm12, %xmm15 #88.5 +LEAF_EO_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5 + movaps %xmm4, %xmm5 #88.5 + movaps %xmm14, %xmm7 #88.5 + addps %xmm13, %xmm5 #88.5 + subps %xmm13, %xmm4 #88.5 + movlhps %xmm8, %xmm2 #88.5 + movaps %xmm5, %xmm8 #88.5 + movlhps %xmm15, %xmm7 #88.5 + xorps %xmm3, %xmm15 #88.5 + movaps %xmm5, %xmm6 #88.5 + subps %xmm14, %xmm5 #88.5 + addps %xmm14, %xmm6 #88.5 + movlhps %xmm9, %xmm1 #88.5 + movaps %xmm4, %xmm14 #88.5 + movlhps %xmm4, %xmm8 #88.5 + movaps %xmm1, %xmm12 #88.5 + shufps $177, %xmm15, %xmm15 #88.5 + movaps 0x30(%r9), %xmm11 #88.5 + addq $4, %rax #90.5 + subps %xmm15, %xmm14 #88.5 + mulps %xmm7, %xmm11 #88.5 + addps %xmm15, %xmm4 #88.5 + movaps 0x30(%r9), %xmm9 #88.5 + movaps 0x40(%r9), %xmm15 #88.5 + shufps $177, %xmm7, %xmm7 #88.5 + mulps %xmm8, %xmm9 #88.5 + mulps %xmm15, %xmm7 #88.5 + shufps $177, %xmm8, %xmm8 #88.5 + subps %xmm7, %xmm11 #88.5 + mulps %xmm15, %xmm8 #88.5 + movaps %xmm11, %xmm10 #88.5 + addps %xmm8, %xmm9 #88.5 + shufps $238, %xmm14, %xmm6 #88.5 + subps %xmm9, %xmm11 #88.5 + addps %xmm9, %xmm10 #88.5 + xorps %xmm3, %xmm11 #88.5 + movaps %xmm2, %xmm3 #88.5 + shufps $177, %xmm11, %xmm11 #88.5 + subps %xmm10, %xmm3 #88.5 + addps %xmm10, %xmm2 #88.5 + addps %xmm11, %xmm12 #88.5 + subps %xmm11, %xmm1 #88.5 + shufps $238, %xmm4, %xmm5 #88.5 + movaps %xmm5, 48(%rdx,%r12,4) #88.5 + movaps %xmm6, 32(%rdx,%r12,4) #88.5 + movaps %xmm2, (%rdx,%r11,4) #88.5 + movaps %xmm1, 16(%rdx,%r11,4) #88.5 + movaps %xmm3, 32(%rdx,%r11,4) #88.5 + movaps %xmm12, 48(%rdx,%r11,4) #88.5 + +#ifdef __APPLE__ + .globl _leaf_oe +_leaf_oe: +#else + .globl leaf_oe +leaf_oe: +#endif + movaps (%r9), %xmm0 #59.5 + #movaps 0x20(%r9), %xmm1 #59.5 +LEAF_OE_const_2: + movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5 +LEAF_OE_const_3: + movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5 + movaps %xmm6, %xmm10 #70.5 + shufps $228, %xmm8, %xmm10 #70.5 + movaps %xmm10, %xmm9 #70.5 + shufps $228, %xmm6, %xmm8 #70.5 +LEAF_OE_const_0: + movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5 +LEAF_OE_const_1: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5 + movaps %xmm12, %xmm14 #70.5 + movslq (%r8, %rax, 4), %r11 #83.44 + addps %xmm8, %xmm9 #70.5 + subps %xmm8, %xmm10 #70.5 + addps %xmm7, %xmm14 #70.5 + subps %xmm7, %xmm12 #70.5 + movaps %xmm9, %xmm4 #70.5 + movaps %xmm14, %xmm13 #70.5 + shufps $238, %xmm10, %xmm4 #70.5 + xorps %xmm0, %xmm10 #70.5 + shufps $177, %xmm10, %xmm10 #70.5 + movaps %xmm12, %xmm11 #70.5 + movaps %xmm14, %xmm5 #70.5 + addps %xmm9, %xmm13 #70.5 + subps %xmm10, %xmm11 #70.5 + subps %xmm9, %xmm14 #70.5 + shufps $238, %xmm12, %xmm5 #70.5 + addps %xmm10, %xmm12 #70.5 + movslq 8(%r8, %rax, 4), %r12 #83.59 + movlhps %xmm11, %xmm13 #70.5 + movaps %xmm13, (%rdx,%r11,4) #70.5 + movaps 0x30(%r9), %xmm13 #70.5 + movlhps %xmm12, %xmm14 #70.5 + movaps 0x40(%r9), %xmm12 #70.5 + mulps %xmm5, %xmm13 #70.5 + shufps $177, %xmm5, %xmm5 #70.5 + mulps %xmm12, %xmm5 #70.5 + movaps %xmm14, 16(%rdx,%r11,4) #70.5 + subps %xmm5, %xmm13 #70.5 + movaps 0x30(%r9), %xmm5 #70.5 + mulps %xmm4, %xmm5 #70.5 + shufps $177, %xmm4, %xmm4 #70.5 + mulps %xmm12, %xmm4 #70.5 +LEAF_OE_const_4: + movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5 + addps %xmm4, %xmm5 #70.5 +LEAF_OE_const_6: + movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5 + movaps %xmm9, %xmm3 #70.5 +LEAF_OE_const_7: + movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5 + movaps %xmm7, %xmm6 #70.5 +LEAF_OE_const_5: + movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5 + movaps %xmm13, %xmm4 #70.5 + subps %xmm2, %xmm7 #70.5 + addps %xmm15, %xmm3 #70.5 + subps %xmm15, %xmm9 #70.5 + addps %xmm2, %xmm6 #70.5 + subps %xmm5, %xmm13 #70.5 + addps %xmm5, %xmm4 #70.5 + xorps %xmm0, %xmm7 #70.5 + addq $4, %rax #72.5 + movaps %xmm3, %xmm2 #70.5 + shufps $177, %xmm7, %xmm7 #70.5 + movaps %xmm9, %xmm8 #70.5 + xorps %xmm0, %xmm13 #70.5 + addps %xmm6, %xmm2 #70.5 + subps %xmm7, %xmm8 #70.5 + subps %xmm6, %xmm3 #70.5 + addps %xmm7, %xmm9 #70.5 + movaps %xmm2, %xmm10 #70.5 + movaps %xmm3, %xmm11 #70.5 + shufps $238, %xmm8, %xmm2 #70.5 + shufps $238, %xmm9, %xmm3 #70.5 + movaps %xmm2, %xmm14 #70.5 + shufps $177, %xmm13, %xmm13 #70.5 + subps %xmm4, %xmm14 #70.5 + addps %xmm4, %xmm2 #70.5 + movaps %xmm3, %xmm4 #70.5 + subps %xmm13, %xmm3 #70.5 + addps %xmm13, %xmm4 #70.5 + movlhps %xmm8, %xmm10 #70.5 + movlhps %xmm9, %xmm11 #70.5 + movaps %xmm10, 32(%rdx,%r11,4) #70.5 + movaps %xmm11, 48(%rdx,%r11,4) #70.5 + movaps %xmm2, (%rdx,%r12,4) #70.5 + movaps %xmm3, 16(%rdx,%r12,4) #70.5 + movaps %xmm14, 32(%rdx,%r12,4) #70.5 + movaps %xmm4, 48(%rdx,%r12,4) #70.5 + +#ifdef __APPLE__ + .globl _leaf_end +_leaf_end: +#else + .globl leaf_end +leaf_end: +#endif + +#ifdef __APPLE__ + .globl _x_init +_x_init: +#else + .globl x_init +x_init: +#endif + #movaps L_sse_constants(%rip), %xmm3 #34.3 + movaps (%r9), %xmm3 #34.3 + movq 0x20(%rdi), %r8 +#ifdef __APPLE__ + .globl _x4 +_x4: +#else + .globl x4 +x4: +#endif + movaps 64(%rdx), %xmm0 #34.3 + movaps 96(%rdx), %xmm1 #34.3 + movaps (%rdx), %xmm7 #34.3 + movaps (%r8), %xmm4 #const + movaps %xmm7, %xmm9 #34.3 + movaps %xmm4, %xmm6 #34.3 + movaps 16(%r8), %xmm2 #const + mulps %xmm0, %xmm6 #34.3 + mulps %xmm1, %xmm4 #34.3 + shufps $177, %xmm0, %xmm0 #34.3 + shufps $177, %xmm1, %xmm1 #34.3 + mulps %xmm2, %xmm0 #34.3 + mulps %xmm1, %xmm2 #34.3 + subps %xmm0, %xmm6 #34.3 + addps %xmm2, %xmm4 #34.3 + movaps %xmm6, %xmm5 #34.3 + subps %xmm4, %xmm6 #34.3 + addps %xmm4, %xmm5 #34.3 + movaps 32(%rdx), %xmm8 #34.3 + xorps %xmm3, %xmm6 #34.3 + shufps $177, %xmm6, %xmm6 #34.3 + movaps %xmm8, %xmm10 #34.3 + movaps 112(%rdx), %xmm12 #34.3 + subps %xmm5, %xmm9 #34.3 + addps %xmm5, %xmm7 #34.3 + addps %xmm6, %xmm10 #34.3 + subps %xmm6, %xmm8 #34.3 + movaps %xmm7, (%rdx) #34.3 + movaps %xmm8, 32(%rdx) #34.3 + movaps %xmm9, 64(%rdx) #34.3 + movaps %xmm10, 96(%rdx) #34.3 + movaps 32(%r8), %xmm14 #const #34.3 + movaps 80(%rdx), %xmm11 #34.3 + movaps %xmm14, %xmm0 #34.3 + movaps 48(%r8), %xmm13 #const #34.3 + mulps %xmm11, %xmm0 #34.3 + mulps %xmm12, %xmm14 #34.3 + shufps $177, %xmm11, %xmm11 #34.3 + shufps $177, %xmm12, %xmm12 #34.3 + mulps %xmm13, %xmm11 #34.3 + mulps %xmm12, %xmm13 #34.3 + subps %xmm11, %xmm0 #34.3 + addps %xmm13, %xmm14 #34.3 + movaps %xmm0, %xmm15 #34.3 + subps %xmm14, %xmm0 #34.3 + addps %xmm14, %xmm15 #34.3 + xorps %xmm3, %xmm0 #34.3 + movaps 16(%rdx), %xmm1 #34.3 + movaps 48(%rdx), %xmm2 #34.3 + movaps %xmm1, %xmm4 #34.3 + shufps $177, %xmm0, %xmm0 #34.3 + movaps %xmm2, %xmm5 #34.3 + addps %xmm15, %xmm1 #34.3 + subps %xmm0, %xmm2 #34.3 + subps %xmm15, %xmm4 #34.3 + addps %xmm0, %xmm5 #34.3 + movaps %xmm1, 16(%rdx) #34.3 + movaps %xmm2, 48(%rdx) #34.3 + movaps %xmm4, 80(%rdx) #34.3 + movaps %xmm5, 112(%rdx) #34.3 + ret + +# _x8_soft + 5 needs to be 16 byte aligned +#ifdef __APPLE__ + .globl _x8_soft +_x8_soft: +#else + .globl x8_soft +x8_soft: +#endif + xorl %eax, %eax + movq %rdx, %rbx + movq %r8, %rsi + leaq (%rdx,%rcx,4), %r9 + leaq (%r9,%rcx,4), %r10 + leaq (%r10,%rcx,4), %r11 + leaq (%r11,%rcx,4), %r12 + leaq (%r12,%rcx,4), %r13 + leaq (%r13,%rcx,4), %r14 + leaq (%r14,%rcx,4), %r15 +X8_soft_loop: + movaps (%rsi), %xmm9 + movaps (%r10,%rax,4), %xmm6 + movaps %xmm9, %xmm11 + movaps (%r11,%rax,4), %xmm7 + movaps 16(%rsi), %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm9 + shufps $177, %xmm6, %xmm6 + mulps %xmm8, %xmm6 + shufps $177, %xmm7, %xmm7 + subps %xmm6, %xmm11 + mulps %xmm7, %xmm8 + movaps %xmm11, %xmm10 + addps %xmm8, %xmm9 + movaps 32(%rsi), %xmm15 + addps %xmm9, %xmm10 + subps %xmm9, %xmm11 + movaps (%rbx,%rax,4), %xmm5 + movaps %xmm15, %xmm6 + movaps (%r12,%rax,4), %xmm12 + movaps %xmm5, %xmm2 + movaps (%r14,%rax,4), %xmm13 + xorps %xmm3, %xmm11 #const + movaps 48(%rsi), %xmm14 + subps %xmm10, %xmm2 + mulps %xmm12, %xmm6 + addps %xmm10, %xmm5 + mulps %xmm13, %xmm15 + movaps 64(%rsi), %xmm10 + movaps %xmm5, %xmm0 + shufps $177, %xmm12, %xmm12 + shufps $177, %xmm13, %xmm13 + mulps %xmm14, %xmm12 + mulps %xmm13, %xmm14 + subps %xmm12, %xmm6 + addps %xmm14, %xmm15 + movaps (%r13,%rax,4), %xmm7 + movaps %xmm10, %xmm13 + movaps (%r15,%rax,4), %xmm8 + movaps %xmm6, %xmm12 + movaps 80(%rsi), %xmm9 + addq $96, %rsi + mulps %xmm7, %xmm13 + subps %xmm15, %xmm6 + addps %xmm15, %xmm12 + mulps %xmm8, %xmm10 + subps %xmm12, %xmm0 + addps %xmm12, %xmm5 + shufps $177, %xmm7, %xmm7 + xorps %xmm3, %xmm6 #const + shufps $177, %xmm8, %xmm8 + movaps %xmm2, %xmm12 + mulps %xmm9, %xmm7 + mulps %xmm8, %xmm9 + subps %xmm7, %xmm13 + addps %xmm9, %xmm10 + movaps (%r9,%rax,4), %xmm4 + shufps $177, %xmm11, %xmm11 + movaps %xmm4, %xmm1 + shufps $177, %xmm6, %xmm6 + addps %xmm11, %xmm1 + subps %xmm11, %xmm4 + addps %xmm6, %xmm12 + subps %xmm6, %xmm2 + movaps %xmm13, %xmm11 + movaps %xmm4, %xmm14 + movaps %xmm1, %xmm6 + subps %xmm10, %xmm13 + addps %xmm10, %xmm11 + xorps %xmm3, %xmm13 #const + addps %xmm11, %xmm4 + subps %xmm11, %xmm14 + shufps $177, %xmm13, %xmm13 + movaps %xmm5, (%rbx,%rax,4) + movaps %xmm4, (%r9,%rax,4) + movaps %xmm2, (%r10,%rax,4) + subps %xmm13, %xmm1 + addps %xmm13, %xmm6 + movaps %xmm1, (%r11,%rax,4) + movaps %xmm0, (%r12,%rax,4) + movaps %xmm14, (%r13,%rax,4) + movaps %xmm12, (%r14,%rax,4) + movaps %xmm6, (%r15,%rax,4) + addq $4, %rax + cmpq %rcx, %rax + jne X8_soft_loop + ret + +#ifdef __APPLE__ + .globl _x8_soft_end +_x8_soft_end: +#else + .globl x8_soft_end +x8_soft_end: +#endif + +#ifdef __APPLE__ + .globl _x8_hard +_x8_hard: +#else + .globl x8_hard +x8_hard: +#endif + movaps (%r9), %xmm5 +X8_loop: + movaps (%r8), %xmm9 +X8_const_2: + movaps 0xFECA(%rdx,%rax,4), %xmm6 + movaps %xmm9, %xmm11 +X8_const_3: + movaps 0xFECA(%rdx,%rax,4), %xmm7 + movaps 16(%r8), %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm9 + shufps $177, %xmm6, %xmm6 + mulps %xmm8, %xmm6 + shufps $177, %xmm7, %xmm7 + subps %xmm6, %xmm11 + mulps %xmm7, %xmm8 + movaps %xmm11, %xmm10 + addps %xmm8, %xmm9 + movaps 32(%r8), %xmm15 + addps %xmm9, %xmm10 + subps %xmm9, %xmm11 +X8_const_0: + movaps 0xFECA(%rdx,%rax,4), %xmm3 + movaps %xmm15, %xmm6 +X8_const_4: + movaps 0xFECA(%rdx,%rax,4), %xmm12 + movaps %xmm3, %xmm2 +X8_const_6: + movaps 0xFECA(%rdx,%rax,4), %xmm13 + xorps %xmm5, %xmm11 + movaps 48(%r8), %xmm14 + subps %xmm10, %xmm2 + mulps %xmm12, %xmm6 + addps %xmm10, %xmm3 + mulps %xmm13, %xmm15 + movaps 64(%r8), %xmm10 + movaps %xmm3, %xmm0 + shufps $177, %xmm12, %xmm12 + shufps $177, %xmm13, %xmm13 + mulps %xmm14, %xmm12 + mulps %xmm13, %xmm14 + subps %xmm12, %xmm6 + addps %xmm14, %xmm15 +X8_const_5: + movaps 0xFECA(%rdx,%rax,4), %xmm7 + movaps %xmm10, %xmm13 +X8_const_7: + movaps 0xFECA(%rdx,%rax,4), %xmm8 + movaps %xmm6, %xmm12 + movaps 80(%r8), %xmm9 + addq $96, %r8 + mulps %xmm7, %xmm13 + subps %xmm15, %xmm6 + addps %xmm15, %xmm12 + mulps %xmm8, %xmm10 + subps %xmm12, %xmm0 + addps %xmm12, %xmm3 + shufps $177, %xmm7, %xmm7 + xorps %xmm5, %xmm6 + shufps $177, %xmm8, %xmm8 + movaps %xmm2, %xmm12 + mulps %xmm9, %xmm7 + mulps %xmm8, %xmm9 + subps %xmm7, %xmm13 + addps %xmm9, %xmm10 +X8_const_1: + movaps 0xFECA(%rdx,%rax,4), %xmm4 + shufps $177, %xmm11, %xmm11 + movaps %xmm4, %xmm1 + shufps $177, %xmm6, %xmm6 + addps %xmm11, %xmm1 + subps %xmm11, %xmm4 + addps %xmm6, %xmm12 + subps %xmm6, %xmm2 + movaps %xmm13, %xmm11 + movaps %xmm4, %xmm14 + movaps %xmm1, %xmm6 + subps %xmm10, %xmm13 + addps %xmm10, %xmm11 + xorps %xmm5, %xmm13 + addps %xmm11, %xmm4 + subps %xmm11, %xmm14 + shufps $177, %xmm13, %xmm13 +X8_const1_0: + movaps %xmm3, 0xFECA(%rdx,%rax,4) +X8_const1_1: + movaps %xmm4, 0xFECA(%rdx,%rax,4) +X8_const1_2: + movaps %xmm2, 0xFECA(%rdx,%rax,4) + subps %xmm13, %xmm1 + addps %xmm13, %xmm6 +X8_const1_3: + movaps %xmm1, 0xFECA(%rdx,%rax,4) +X8_const1_4: + movaps %xmm0, 0xFECA(%rdx,%rax,4) +X8_const1_5: + movaps %xmm14, 0xFECA(%rdx,%rax,4) +X8_const1_6: + movaps %xmm12, 0xFECA(%rdx,%rax,4) +X8_const1_7: + movaps %xmm6, 0xFECA(%rdx,%rax,4) + addq $4, %rax + cmpq %rcx, %rax + jne X8_loop + +#ifdef __APPLE__ + .globl _sse_leaf_ee_offsets + .globl _sse_leaf_oo_offsets + .globl _sse_leaf_eo_offsets + .globl _sse_leaf_oe_offsets + .align 4 +_sse_leaf_ee_offsets: + .long LEAF_EE_const_0-_leaf_ee+0x4 + .long LEAF_EE_const_1-_leaf_ee+0x5 + .long LEAF_EE_const_2-_leaf_ee+0x5 + .long LEAF_EE_const_3-_leaf_ee+0x5 + .long LEAF_EE_const_4-_leaf_ee+0x5 + .long LEAF_EE_const_5-_leaf_ee+0x5 + .long LEAF_EE_const_6-_leaf_ee+0x4 + .long LEAF_EE_const_7-_leaf_ee+0x5 +_sse_leaf_oo_offsets: + .long LEAF_OO_const_0-_leaf_oo+0x4 + .long LEAF_OO_const_1-_leaf_oo+0x4 + .long LEAF_OO_const_2-_leaf_oo+0x5 + .long LEAF_OO_const_3-_leaf_oo+0x5 + .long LEAF_OO_const_4-_leaf_oo+0x4 + .long LEAF_OO_const_5-_leaf_oo+0x5 + .long LEAF_OO_const_6-_leaf_oo+0x5 + .long LEAF_OO_const_7-_leaf_oo+0x5 +_sse_leaf_eo_offsets: + .long LEAF_EO_const_0-_leaf_eo+0x5 + .long LEAF_EO_const_1-_leaf_eo+0x4 + .long LEAF_EO_const_2-_leaf_eo+0x4 + .long LEAF_EO_const_3-_leaf_eo+0x4 + .long LEAF_EO_const_4-_leaf_eo+0x5 + .long LEAF_EO_const_5-_leaf_eo+0x5 + .long LEAF_EO_const_6-_leaf_eo+0x4 + .long LEAF_EO_const_7-_leaf_eo+0x5 +_sse_leaf_oe_offsets: + .long LEAF_OE_const_0-_leaf_oe+0x5 + .long LEAF_OE_const_1-_leaf_oe+0x4 + .long LEAF_OE_const_2-_leaf_oe+0x4 + .long LEAF_OE_const_3-_leaf_oe+0x5 + .long LEAF_OE_const_4-_leaf_oe+0x5 + .long LEAF_OE_const_5-_leaf_oe+0x5 + .long LEAF_OE_const_6-_leaf_oe+0x4 + .long LEAF_OE_const_7-_leaf_oe+0x4 +#else + .globl sse_leaf_ee_offsets + .globl sse_leaf_oo_offsets + .globl sse_leaf_eo_offsets + .globl sse_leaf_oe_offsets + .align 4 +sse_leaf_ee_offsets: + .long LEAF_EE_const_0-leaf_ee+0x4 + .long LEAF_EE_const_1-leaf_ee+0x5 + .long LEAF_EE_const_2-leaf_ee+0x5 + .long LEAF_EE_const_3-leaf_ee+0x5 + .long LEAF_EE_const_4-leaf_ee+0x5 + .long LEAF_EE_const_5-leaf_ee+0x5 + .long LEAF_EE_const_6-leaf_ee+0x4 + .long LEAF_EE_const_7-leaf_ee+0x5 +sse_leaf_oo_offsets: + .long LEAF_OO_const_0-leaf_oo+0x4 + .long LEAF_OO_const_1-leaf_oo+0x4 + .long LEAF_OO_const_2-leaf_oo+0x5 + .long LEAF_OO_const_3-leaf_oo+0x5 + .long LEAF_OO_const_4-leaf_oo+0x4 + .long LEAF_OO_const_5-leaf_oo+0x5 + .long LEAF_OO_const_6-leaf_oo+0x5 + .long LEAF_OO_const_7-leaf_oo+0x5 +sse_leaf_eo_offsets: + .long LEAF_EO_const_0-leaf_eo+0x5 + .long LEAF_EO_const_1-leaf_eo+0x4 + .long LEAF_EO_const_2-leaf_eo+0x4 + .long LEAF_EO_const_3-leaf_eo+0x4 + .long LEAF_EO_const_4-leaf_eo+0x5 + .long LEAF_EO_const_5-leaf_eo+0x5 + .long LEAF_EO_const_6-leaf_eo+0x4 + .long LEAF_EO_const_7-leaf_eo+0x5 +sse_leaf_oe_offsets: + .long LEAF_OE_const_0-leaf_oe+0x5 + .long LEAF_OE_const_1-leaf_oe+0x4 + .long LEAF_OE_const_2-leaf_oe+0x4 + .long LEAF_OE_const_3-leaf_oe+0x5 + .long LEAF_OE_const_4-leaf_oe+0x5 + .long LEAF_OE_const_5-leaf_oe+0x5 + .long LEAF_OE_const_6-leaf_oe+0x4 + .long LEAF_OE_const_7-leaf_oe+0x4 +#endif + +#ifdef __APPLE__ + .data +#else + .section .data +#endif + .p2align 4 +#ifdef __APPLE__ + .globl _sse_constants +_sse_constants: +#else + .globl sse_constants +sse_constants: +#endif + .long 0x00000000,0x80000000,0x00000000,0x80000000 + .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 + .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3 + .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 + .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3 +#ifdef __APPLE__ + .globl _sse_constants_inv +_sse_constants_inv: +#else + .globl sse_constants_inv +sse_constants_inv: +#endif + .long 0x80000000,0x00000000,0x80000000,0x00000000 + .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 + .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3 + .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 + .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3 |