author     Jukka Ojanen <jukka.ojanen@linkotec.net>    2014-10-29 15:15:13 +0200
committer  Jukka Ojanen <jukka.ojanen@linkotec.net>    2014-10-29 15:15:13 +0200
commit     5904d949924cd327dcc21a85464672efd2dc052f
tree       0cac501e83caa063c29aee385da7260161788f88
parent     e6c375a1b098afa907bb25e53adb1e203fe47370
download   ffts-5904d949924cd327dcc21a85464672efd2dc052f.zip
           ffts-5904d949924cd327dcc21a85464672efd2dc052f.tar.gz
YASM requires ".code 64" in assembly
Diffstat (limited to 'src')
-rw-r--r--  src/sse.s  754
1 file changed, 376 insertions(+), 378 deletions(-)
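Background note (not part of the commit): per the commit message, YASM requires the code mode to be stated explicitly, so the patch adds a ".code64" directive near the top of src/sse.s; the rest of the diff is indentation cleanup. A minimal, hypothetical sketch (not from this repository) of a 64-bit GAS-syntax source file that makes the code mode explicit in the same way, assuming an x86-64 target:

    # hypothetical example, assuming an x86-64 target
        .code64                  # assemble what follows as 64-bit code
        .globl  return_zero
    return_zero:
        xorl    %eax, %eax       # zero the return register
        ret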
diff --git a/src/sse.s b/src/sse.s
index 79dd6ec..90f02db 100644
--- a/src/sse.s
+++ b/src/sse.s
@@ -9,14 +9,14 @@
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
@@ -31,30 +31,32 @@
*/
+ .code64
- .globl _neon_x4
- .align 4
+ .globl _neon_x4
+ .align 4
_neon_x4:
- .globl _neon_x8
- .align 4
+ .globl _neon_x8
+ .align 4
_neon_x8:
- .globl _neon_x8_t
- .align 4
+ .globl _neon_x8_t
+ .align 4
_neon_x8_t:
#ifdef __APPLE__
- .globl _leaf_ee_init
+ .globl _leaf_ee_init
_leaf_ee_init:
#else
- .globl leaf_ee_init
+ .globl leaf_ee_init
leaf_ee_init:
#endif
- #lea L_sse_constants(%rip), %r9
- movq 0xe0(%rdi), %r9
- xorl %eax, %eax
+ #lea L_sse_constants(%rip), %r9
+ movq 0xe0(%rdi), %r9
+ xorl %eax, %eax
+
# eax is loop counter (init to 0)
# rcx is loop max count
# rsi is 'in' base pointer
@@ -62,48 +64,48 @@ leaf_ee_init:
# r8 is offsets pointer
# r9 is constants pointer
# scratch: rax r11 r12
-# .align 4, 0x90
+# .align 4, 0x90
# _leaf_ee + 9 needs 16 byte alignment
#ifdef __APPLE__
- .globl _leaf_ee
+ .globl _leaf_ee
_leaf_ee:
#else
- .globl leaf_ee
+ .globl leaf_ee
leaf_ee:
#endif
- movaps 32(%r9), %xmm0 #83.5
- movaps (%r9), %xmm8 #83.5
+ movaps 32(%r9), %xmm0 #83.5
+ movaps (%r9), %xmm8 #83.5
LEAF_EE_1:
LEAF_EE_const_0:
- movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
LEAF_EE_const_2:
- movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
movaps %xmm7, %xmm6 #83.5
LEAF_EE_const_3:
- movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
movaps %xmm12, %xmm11 #83.5
subps %xmm10, %xmm12 #83.5
addps %xmm10, %xmm11 #83.5
xorps %xmm8, %xmm12 #83.5
LEAF_EE_const_1:
- movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
LEAF_EE_const_4:
- movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
addps %xmm9, %xmm6 #83.5
subps %xmm9, %xmm7 #83.5
LEAF_EE_const_5:
- movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
movaps %xmm10, %xmm9 #83.5
LEAF_EE_const_6:
- movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
movaps %xmm6, %xmm5 #83.5
LEAF_EE_const_7:
- movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
movaps %xmm3, %xmm15 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm7, %xmm4 #83.5
- movslq (%r8, %rax, 4), %r11 #83.44
+ movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm13, %xmm10 #83.5
subps %xmm14, %xmm3 #83.5
addps %xmm11, %xmm5 #83.5
@@ -112,9 +114,9 @@ LEAF_EE_const_7:
addps %xmm12, %xmm7 #83.5
addps %xmm13, %xmm9 #83.5
addps %xmm14, %xmm15 #83.5
- movaps 16(%r9), %xmm12 #83.5
+ movaps 16(%r9), %xmm12 #83.5
movaps %xmm9, %xmm1 #83.5
- movaps 16(%r9), %xmm11 #83.5
+ movaps 16(%r9), %xmm11 #83.5
movaps %xmm5, %xmm2 #83.5
mulps %xmm10, %xmm12 #83.5
subps %xmm15, %xmm9 #83.5
@@ -144,12 +146,12 @@ LEAF_EE_const_7:
movaps %xmm2, %xmm3 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm6, %xmm9 #83.5
- movslq 8(%r8, %rax, 4), %r12 #83.59
+ movslq 8(%r8, %rax, 4), %r12 #83.59
movlhps %xmm4, %xmm3 #83.5
- addq $4, %rax
+ addq $4, %rax
shufps $238, %xmm4, %xmm2 #83.5
movaps %xmm1, %xmm4 #83.5
- #movntdq %xmm3, (%rdx,%r11,4) #83.5
+ #movntdq %xmm3, (%rdx,%r11,4) #83.5
subps %xmm12, %xmm7 #83.5
addps %xmm12, %xmm14 #83.5
movlhps %xmm7, %xmm4 #83.5
@@ -167,46 +169,44 @@ LEAF_EE_const_7:
movaps %xmm1, 16(%rdx,%r12,4) #83.5
movaps %xmm5, 32(%rdx,%r12,4) #83.5
movaps %xmm6, 48(%rdx,%r12,4) #83.5
- cmpq %rcx, %rax
- jne LEAF_EE_1
-
-
+ cmpq %rcx, %rax
+ jne LEAF_EE_1
# _leaf_oo + 4 needs to be 16 byte aligned
#ifdef __APPLE__
- .globl _leaf_oo
+ .globl _leaf_oo
_leaf_oo:
#else
- .globl leaf_oo
+ .globl leaf_oo
leaf_oo:
#endif
- movaps (%r9), %xmm5 #92.7
+ movaps (%r9), %xmm5 #92.7
LEAF_OO_1:
LEAF_OO_const_0:
- movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
movaps %xmm4, %xmm6 #93.5
LEAF_OO_const_1:
- movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
LEAF_OO_const_2:
- movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
addps %xmm7, %xmm6 #93.5
subps %xmm7, %xmm4 #93.5
LEAF_OO_const_3:
- movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
movaps %xmm10, %xmm9 #93.5
LEAF_OO_const_4:
- movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
movaps %xmm6, %xmm3 #93.5
LEAF_OO_const_5:
- movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
movaps %xmm1, %xmm2 #93.5
LEAF_OO_const_6:
- movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
movaps %xmm4, %xmm15 #93.5
LEAF_OO_const_7:
- movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
movaps %xmm14, %xmm13 #93.5
- movslq (%r8, %rax, 4), %r11 #83.44
+ movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm8, %xmm10 #93.5
addps %xmm8, %xmm9 #93.5
addps %xmm11, %xmm2 #93.5
@@ -221,8 +221,8 @@ LEAF_OO_const_7:
movaps %xmm2, %xmm9 #93.5
shufps $177, %xmm14, %xmm14 #93.5
movaps %xmm6, %xmm7 #93.5
- movslq 8(%r8, %rax, 4), %r12 #83.59
- addq $4, %rax #92.18
+ movslq 8(%r8, %rax, 4), %r12 #83.59
+ addq $4, %rax #92.18
addps %xmm10, %xmm4 #93.5
addps %xmm13, %xmm9 #93.5
subps %xmm13, %xmm2 #93.5
@@ -249,31 +249,31 @@ LEAF_OO_const_7:
movaps %xmm6, 16(%rdx,%r12,4) #93.5
movaps %xmm9, 32(%rdx,%r12,4) #93.5
movaps %xmm2, 48(%rdx,%r12,4) #93.5
- cmpq %rcx, %rax
- jne LEAF_OO_1 # Prob 95% #92.14
+ cmpq %rcx, %rax
+ jne LEAF_OO_1 # Prob 95% #92.14
#ifdef __APPLE__
- .globl _leaf_eo
+ .globl _leaf_eo
_leaf_eo:
#else
- .globl leaf_eo
+ .globl leaf_eo
leaf_eo:
#endif
LEAF_EO_const_0:
- movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
LEAF_EO_const_2:
- movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
movaps %xmm9, %xmm11 #88.5
LEAF_EO_const_3:
- movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
movaps %xmm7, %xmm6 #88.5
LEAF_EO_const_1:
- movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
subps %xmm5, %xmm7 #88.5
addps %xmm4, %xmm11 #88.5
subps %xmm4, %xmm9 #88.5
addps %xmm5, %xmm6 #88.5
- movaps (%r9), %xmm3 #88.5
+ movaps (%r9), %xmm3 #88.5
movaps %xmm11, %xmm10 #88.5
xorps %xmm3, %xmm7 #88.5
movaps %xmm9, %xmm8 #88.5
@@ -282,25 +282,25 @@ LEAF_EO_const_1:
subps %xmm6, %xmm11 #88.5
subps %xmm7, %xmm8 #88.5
addps %xmm7, %xmm9 #88.5
- movslq 8(%r8, %rax, 4), %r12 #83.59
+ movslq 8(%r8, %rax, 4), %r12 #83.59
movaps %xmm10, %xmm2 #88.5
- movslq (%r8, %rax, 4), %r11 #83.44
+ movslq (%r8, %rax, 4), %r11 #83.44
movaps %xmm11, %xmm1 #88.5
shufps $238, %xmm8, %xmm10 #88.5
shufps $238, %xmm9, %xmm11 #88.5
movaps %xmm10, (%rdx,%r12,4) #88.5
movaps %xmm11, 16(%rdx,%r12,4) #88.5
LEAF_EO_const_4:
- movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
LEAF_EO_const_5:
- movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
movaps %xmm15, %xmm14 #88.5
LEAF_EO_const_6:
- movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
addps %xmm12, %xmm14 #88.5
subps %xmm12, %xmm15 #88.5
LEAF_EO_const_7:
- movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
movaps %xmm4, %xmm5 #88.5
movaps %xmm14, %xmm7 #88.5
addps %xmm13, %xmm5 #88.5
@@ -317,13 +317,13 @@ LEAF_EO_const_7:
movlhps %xmm4, %xmm8 #88.5
movaps %xmm1, %xmm12 #88.5
shufps $177, %xmm15, %xmm15 #88.5
- movaps 0x30(%r9), %xmm11 #88.5
- addq $4, %rax #90.5
+ movaps 0x30(%r9), %xmm11 #88.5
+ addq $4, %rax #90.5
subps %xmm15, %xmm14 #88.5
mulps %xmm7, %xmm11 #88.5
addps %xmm15, %xmm4 #88.5
- movaps 0x30(%r9), %xmm9 #88.5
- movaps 0x40(%r9), %xmm15 #88.5
+ movaps 0x30(%r9), %xmm9 #88.5
+ movaps 0x40(%r9), %xmm15 #88.5
shufps $177, %xmm7, %xmm7 #88.5
mulps %xmm8, %xmm9 #88.5
mulps %xmm15, %xmm7 #88.5
@@ -349,31 +349,30 @@ LEAF_EO_const_7:
movaps %xmm1, 16(%rdx,%r11,4) #88.5
movaps %xmm3, 32(%rdx,%r11,4) #88.5
movaps %xmm12, 48(%rdx,%r11,4) #88.5
-
#ifdef __APPLE__
- .globl _leaf_oe
+ .globl _leaf_oe
_leaf_oe:
#else
- .globl leaf_oe
+ .globl leaf_oe
leaf_oe:
#endif
- movaps (%r9), %xmm0 #59.5
- #movaps 0x20(%r9), %xmm1 #59.5
+ movaps (%r9), %xmm0 #59.5
+ #movaps 0x20(%r9), %xmm1 #59.5
LEAF_OE_const_2:
- movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
LEAF_OE_const_3:
- movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
movaps %xmm6, %xmm10 #70.5
shufps $228, %xmm8, %xmm10 #70.5
movaps %xmm10, %xmm9 #70.5
shufps $228, %xmm6, %xmm8 #70.5
LEAF_OE_const_0:
- movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
LEAF_OE_const_1:
- movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm12, %xmm14 #70.5
- movslq (%r8, %rax, 4), %r11 #83.44
+ movslq (%r8, %rax, 4), %r11 #83.44
addps %xmm8, %xmm9 #70.5
subps %xmm8, %xmm10 #70.5
addps %xmm7, %xmm14 #70.5
@@ -390,32 +389,32 @@ LEAF_OE_const_1:
subps %xmm9, %xmm14 #70.5
shufps $238, %xmm12, %xmm5 #70.5
addps %xmm10, %xmm12 #70.5
- movslq 8(%r8, %rax, 4), %r12 #83.59
+ movslq 8(%r8, %rax, 4), %r12 #83.59
movlhps %xmm11, %xmm13 #70.5
movaps %xmm13, (%rdx,%r11,4) #70.5
- movaps 0x30(%r9), %xmm13 #70.5
+ movaps 0x30(%r9), %xmm13 #70.5
movlhps %xmm12, %xmm14 #70.5
- movaps 0x40(%r9), %xmm12 #70.5
+ movaps 0x40(%r9), %xmm12 #70.5
mulps %xmm5, %xmm13 #70.5
shufps $177, %xmm5, %xmm5 #70.5
mulps %xmm12, %xmm5 #70.5
movaps %xmm14, 16(%rdx,%r11,4) #70.5
subps %xmm5, %xmm13 #70.5
- movaps 0x30(%r9), %xmm5 #70.5
+ movaps 0x30(%r9), %xmm5 #70.5
mulps %xmm4, %xmm5 #70.5
shufps $177, %xmm4, %xmm4 #70.5
mulps %xmm12, %xmm4 #70.5
LEAF_OE_const_4:
- movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
addps %xmm4, %xmm5 #70.5
LEAF_OE_const_6:
- movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm9, %xmm3 #70.5
LEAF_OE_const_7:
- movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
movaps %xmm7, %xmm6 #70.5
LEAF_OE_const_5:
- movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
+ movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
movaps %xmm13, %xmm4 #70.5
subps %xmm2, %xmm7 #70.5
addps %xmm15, %xmm3 #70.5
@@ -424,7 +423,7 @@ LEAF_OE_const_5:
subps %xmm5, %xmm13 #70.5
addps %xmm5, %xmm4 #70.5
xorps %xmm0, %xmm7 #70.5
- addq $4, %rax #72.5
+ addq $4, %rax #72.5
movaps %xmm3, %xmm2 #70.5
shufps $177, %xmm7, %xmm7 #70.5
movaps %xmm9, %xmm8 #70.5
@@ -452,37 +451,36 @@ LEAF_OE_const_5:
movaps %xmm3, 16(%rdx,%r12,4) #70.5
movaps %xmm14, 32(%rdx,%r12,4) #70.5
movaps %xmm4, 48(%rdx,%r12,4) #70.5
-
-
+
#ifdef __APPLE__
- .globl _leaf_end
+ .globl _leaf_end
_leaf_end:
#else
- .globl leaf_end
+ .globl leaf_end
leaf_end:
#endif
#ifdef __APPLE__
- .globl _x_init
+ .globl _x_init
_x_init:
#else
- .globl x_init
+ .globl x_init
x_init:
#endif
- #movaps L_sse_constants(%rip), %xmm3 #34.3
- movaps (%r9), %xmm3 #34.3
- movq 0x20(%rdi),%r8
+ #movaps L_sse_constants(%rip), %xmm3 #34.3
+ movaps (%r9), %xmm3 #34.3
+ movq 0x20(%rdi), %r8
#ifdef __APPLE__
- .globl _x4
+ .globl _x4
_x4:
#else
- .globl x4
+ .globl x4
x4:
#endif
movaps 64(%rdx), %xmm0 #34.3
movaps 96(%rdx), %xmm1 #34.3
movaps (%rdx), %xmm7 #34.3
- movaps (%r8), %xmm4 #const
+ movaps (%r8), %xmm4 #const
movaps %xmm7, %xmm9 #34.3
movaps %xmm4, %xmm6 #34.3
movaps 16(%r8), %xmm2 #const
@@ -510,10 +508,10 @@ x4:
movaps %xmm8, 32(%rdx) #34.3
movaps %xmm9, 64(%rdx) #34.3
movaps %xmm10, 96(%rdx) #34.3
- movaps 32(%r8), %xmm14 #const #34.3
+ movaps 32(%r8), %xmm14 #const #34.3
movaps 80(%rdx), %xmm11 #34.3
movaps %xmm14, %xmm0 #34.3
- movaps 48(%r8), %xmm13 #const #34.3
+ movaps 48(%r8), %xmm13 #const #34.3
mulps %xmm11, %xmm0 #34.3
mulps %xmm12, %xmm14 #34.3
shufps $177, %xmm11, %xmm11 #34.3
@@ -539,340 +537,340 @@ x4:
movaps %xmm2, 48(%rdx) #34.3
movaps %xmm4, 80(%rdx) #34.3
movaps %xmm5, 112(%rdx) #34.3
- ret
-
+ ret
+
# _x8_soft + 5 needs to be 16 byte aligned
#ifdef __APPLE__
- .globl _x8_soft
+ .globl _x8_soft
_x8_soft:
#else
- .globl x8_soft
+ .globl x8_soft
x8_soft:
#endif
- xorl %eax, %eax
- movq %rdx, %rbx
+ xorl %eax, %eax
+ movq %rdx, %rbx
movq %r8, %rsi
- leaq (%rdx,%rcx,4), %r9
- leaq (%r9,%rcx,4), %r10
- leaq (%r10,%rcx,4), %r11
- leaq (%r11,%rcx,4), %r12
- leaq (%r12,%rcx,4), %r13
- leaq (%r13,%rcx,4), %r14
- leaq (%r14,%rcx,4), %r15
-X8_soft_loop:
- movaps (%rsi), %xmm9
+ leaq (%rdx,%rcx,4), %r9
+ leaq (%r9,%rcx,4), %r10
+ leaq (%r10,%rcx,4), %r11
+ leaq (%r11,%rcx,4), %r12
+ leaq (%r12,%rcx,4), %r13
+ leaq (%r13,%rcx,4), %r14
+ leaq (%r14,%rcx,4), %r15
+X8_soft_loop:
+ movaps (%rsi), %xmm9
movaps (%r10,%rax,4), %xmm6
- movaps %xmm9, %xmm11
+ movaps %xmm9, %xmm11
movaps (%r11,%rax,4), %xmm7
- movaps 16(%rsi), %xmm8
- mulps %xmm6, %xmm11
- mulps %xmm7, %xmm9
- shufps $177, %xmm6, %xmm6
- mulps %xmm8, %xmm6
- shufps $177, %xmm7, %xmm7
- subps %xmm6, %xmm11
- mulps %xmm7, %xmm8
- movaps %xmm11, %xmm10
- addps %xmm8, %xmm9
- movaps 32(%rsi), %xmm15
- addps %xmm9, %xmm10
- subps %xmm9, %xmm11
- movaps (%rbx,%rax,4), %xmm5
- movaps %xmm15, %xmm6
+ movaps 16(%rsi), %xmm8
+ mulps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ shufps $177, %xmm6, %xmm6
+ mulps %xmm8, %xmm6
+ shufps $177, %xmm7, %xmm7
+ subps %xmm6, %xmm11
+ mulps %xmm7, %xmm8
+ movaps %xmm11, %xmm10
+ addps %xmm8, %xmm9
+ movaps 32(%rsi), %xmm15
+ addps %xmm9, %xmm10
+ subps %xmm9, %xmm11
+ movaps (%rbx,%rax,4), %xmm5
+ movaps %xmm15, %xmm6
movaps (%r12,%rax,4), %xmm12
- movaps %xmm5, %xmm2
+ movaps %xmm5, %xmm2
movaps (%r14,%rax,4), %xmm13
- xorps %xmm3, %xmm11 #const
- movaps 48(%rsi), %xmm14
- subps %xmm10, %xmm2
- mulps %xmm12, %xmm6
- addps %xmm10, %xmm5
- mulps %xmm13, %xmm15
- movaps 64(%rsi), %xmm10
- movaps %xmm5, %xmm0
- shufps $177, %xmm12, %xmm12
- shufps $177, %xmm13, %xmm13
- mulps %xmm14, %xmm12
- mulps %xmm13, %xmm14
- subps %xmm12, %xmm6
- addps %xmm14, %xmm15
- movaps (%r13,%rax,4), %xmm7
- movaps %xmm10, %xmm13
- movaps (%r15,%rax,4), %xmm8
- movaps %xmm6, %xmm12
- movaps 80(%rsi), %xmm9
- addq $96, %rsi
- mulps %xmm7, %xmm13
- subps %xmm15, %xmm6
- addps %xmm15, %xmm12
- mulps %xmm8, %xmm10
- subps %xmm12, %xmm0
- addps %xmm12, %xmm5
- shufps $177, %xmm7, %xmm7
- xorps %xmm3, %xmm6 #const
- shufps $177, %xmm8, %xmm8
- movaps %xmm2, %xmm12
- mulps %xmm9, %xmm7
- mulps %xmm8, %xmm9
- subps %xmm7, %xmm13
- addps %xmm9, %xmm10
- movaps (%r9,%rax,4), %xmm4
- shufps $177, %xmm11, %xmm11
- movaps %xmm4, %xmm1
- shufps $177, %xmm6, %xmm6
- addps %xmm11, %xmm1
- subps %xmm11, %xmm4
- addps %xmm6, %xmm12
- subps %xmm6, %xmm2
- movaps %xmm13, %xmm11
- movaps %xmm4, %xmm14
- movaps %xmm1, %xmm6
- subps %xmm10, %xmm13
- addps %xmm10, %xmm11
- xorps %xmm3, %xmm13 #const
- addps %xmm11, %xmm4
- subps %xmm11, %xmm14
- shufps $177, %xmm13, %xmm13
- movaps %xmm5, (%rbx,%rax,4)
- movaps %xmm4, (%r9,%rax,4)
- movaps %xmm2, (%r10,%rax,4)
- subps %xmm13, %xmm1
- addps %xmm13, %xmm6
- movaps %xmm1, (%r11,%rax,4)
- movaps %xmm0, (%r12,%rax,4)
- movaps %xmm14, (%r13,%rax,4)
- movaps %xmm12, (%r14,%rax,4)
- movaps %xmm6, (%r15,%rax,4)
- addq $4, %rax
- cmpq %rcx, %rax
+ xorps %xmm3, %xmm11 #const
+ movaps 48(%rsi), %xmm14
+ subps %xmm10, %xmm2
+ mulps %xmm12, %xmm6
+ addps %xmm10, %xmm5
+ mulps %xmm13, %xmm15
+ movaps 64(%rsi), %xmm10
+ movaps %xmm5, %xmm0
+ shufps $177, %xmm12, %xmm12
+ shufps $177, %xmm13, %xmm13
+ mulps %xmm14, %xmm12
+ mulps %xmm13, %xmm14
+ subps %xmm12, %xmm6
+ addps %xmm14, %xmm15
+ movaps (%r13,%rax,4), %xmm7
+ movaps %xmm10, %xmm13
+ movaps (%r15,%rax,4), %xmm8
+ movaps %xmm6, %xmm12
+ movaps 80(%rsi), %xmm9
+ addq $96, %rsi
+ mulps %xmm7, %xmm13
+ subps %xmm15, %xmm6
+ addps %xmm15, %xmm12
+ mulps %xmm8, %xmm10
+ subps %xmm12, %xmm0
+ addps %xmm12, %xmm5
+ shufps $177, %xmm7, %xmm7
+ xorps %xmm3, %xmm6 #const
+ shufps $177, %xmm8, %xmm8
+ movaps %xmm2, %xmm12
+ mulps %xmm9, %xmm7
+ mulps %xmm8, %xmm9
+ subps %xmm7, %xmm13
+ addps %xmm9, %xmm10
+ movaps (%r9,%rax,4), %xmm4
+ shufps $177, %xmm11, %xmm11
+ movaps %xmm4, %xmm1
+ shufps $177, %xmm6, %xmm6
+ addps %xmm11, %xmm1
+ subps %xmm11, %xmm4
+ addps %xmm6, %xmm12
+ subps %xmm6, %xmm2
+ movaps %xmm13, %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm1, %xmm6
+ subps %xmm10, %xmm13
+ addps %xmm10, %xmm11
+ xorps %xmm3, %xmm13 #const
+ addps %xmm11, %xmm4
+ subps %xmm11, %xmm14
+ shufps $177, %xmm13, %xmm13
+ movaps %xmm5, (%rbx,%rax,4)
+ movaps %xmm4, (%r9,%rax,4)
+ movaps %xmm2, (%r10,%rax,4)
+ subps %xmm13, %xmm1
+ addps %xmm13, %xmm6
+ movaps %xmm1, (%r11,%rax,4)
+ movaps %xmm0, (%r12,%rax,4)
+ movaps %xmm14, (%r13,%rax,4)
+ movaps %xmm12, (%r14,%rax,4)
+ movaps %xmm6, (%r15,%rax,4)
+ addq $4, %rax
+ cmpq %rcx, %rax
jne X8_soft_loop
- ret
+ ret
#ifdef __APPLE__
- .globl _x8_hard
+ .globl _x8_hard
_x8_hard:
#else
- .globl x8_hard
+ .globl x8_hard
x8_hard:
#endif
- movaps (%r9), %xmm5
-X8_loop:
- movaps (%r8), %xmm9
+ movaps (%r9), %xmm5
+X8_loop:
+ movaps (%r8), %xmm9
X8_const_2:
- movaps 0xFECA(%rdx,%rax,4), %xmm6
- movaps %xmm9, %xmm11
+ movaps 0xFECA(%rdx,%rax,4), %xmm6
+ movaps %xmm9, %xmm11
X8_const_3:
- movaps 0xFECA(%rdx,%rax,4), %xmm7
- movaps 16(%r8), %xmm8
- mulps %xmm6, %xmm11
- mulps %xmm7, %xmm9
- shufps $177, %xmm6, %xmm6
- mulps %xmm8, %xmm6
- shufps $177, %xmm7, %xmm7
- subps %xmm6, %xmm11
- mulps %xmm7, %xmm8
- movaps %xmm11, %xmm10
- addps %xmm8, %xmm9
- movaps 32(%r8), %xmm15
- addps %xmm9, %xmm10
- subps %xmm9, %xmm11
+ movaps 0xFECA(%rdx,%rax,4), %xmm7
+ movaps 16(%r8), %xmm8
+ mulps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ shufps $177, %xmm6, %xmm6
+ mulps %xmm8, %xmm6
+ shufps $177, %xmm7, %xmm7
+ subps %xmm6, %xmm11
+ mulps %xmm7, %xmm8
+ movaps %xmm11, %xmm10
+ addps %xmm8, %xmm9
+ movaps 32(%r8), %xmm15
+ addps %xmm9, %xmm10
+ subps %xmm9, %xmm11
X8_const_0:
- movaps 0xFECA(%rdx,%rax,4), %xmm3
- movaps %xmm15, %xmm6
+ movaps 0xFECA(%rdx,%rax,4), %xmm3
+ movaps %xmm15, %xmm6
X8_const_4:
movaps 0xFECA(%rdx,%rax,4), %xmm12
- movaps %xmm3, %xmm2
+ movaps %xmm3, %xmm2
X8_const_6:
movaps 0xFECA(%rdx,%rax,4), %xmm13
- xorps %xmm5, %xmm11
- movaps 48(%r8), %xmm14
- subps %xmm10, %xmm2
- mulps %xmm12, %xmm6
- addps %xmm10, %xmm3
- mulps %xmm13, %xmm15
- movaps 64(%r8), %xmm10
- movaps %xmm3, %xmm0
- shufps $177, %xmm12, %xmm12
- shufps $177, %xmm13, %xmm13
- mulps %xmm14, %xmm12
- mulps %xmm13, %xmm14
- subps %xmm12, %xmm6
- addps %xmm14, %xmm15
+ xorps %xmm5, %xmm11
+ movaps 48(%r8), %xmm14
+ subps %xmm10, %xmm2
+ mulps %xmm12, %xmm6
+ addps %xmm10, %xmm3
+ mulps %xmm13, %xmm15
+ movaps 64(%r8), %xmm10
+ movaps %xmm3, %xmm0
+ shufps $177, %xmm12, %xmm12
+ shufps $177, %xmm13, %xmm13
+ mulps %xmm14, %xmm12
+ mulps %xmm13, %xmm14
+ subps %xmm12, %xmm6
+ addps %xmm14, %xmm15
X8_const_5:
movaps 0xFECA(%rdx,%rax,4), %xmm7
- movaps %xmm10, %xmm13
+ movaps %xmm10, %xmm13
X8_const_7:
movaps 0xFECA(%rdx,%rax,4), %xmm8
- movaps %xmm6, %xmm12
- movaps 80(%r8), %xmm9
- addq $96, %r8
- mulps %xmm7, %xmm13
- subps %xmm15, %xmm6
- addps %xmm15, %xmm12
- mulps %xmm8, %xmm10
- subps %xmm12, %xmm0
- addps %xmm12, %xmm3
- shufps $177, %xmm7, %xmm7
- xorps %xmm5, %xmm6
- shufps $177, %xmm8, %xmm8
- movaps %xmm2, %xmm12
- mulps %xmm9, %xmm7
- mulps %xmm8, %xmm9
- subps %xmm7, %xmm13
- addps %xmm9, %xmm10
+ movaps %xmm6, %xmm12
+ movaps 80(%r8), %xmm9
+ addq $96, %r8
+ mulps %xmm7, %xmm13
+ subps %xmm15, %xmm6
+ addps %xmm15, %xmm12
+ mulps %xmm8, %xmm10
+ subps %xmm12, %xmm0
+ addps %xmm12, %xmm3
+ shufps $177, %xmm7, %xmm7
+ xorps %xmm5, %xmm6
+ shufps $177, %xmm8, %xmm8
+ movaps %xmm2, %xmm12
+ mulps %xmm9, %xmm7
+ mulps %xmm8, %xmm9
+ subps %xmm7, %xmm13
+ addps %xmm9, %xmm10
X8_const_1:
- movaps 0xFECA(%rdx,%rax,4), %xmm4
- shufps $177, %xmm11, %xmm11
- movaps %xmm4, %xmm1
- shufps $177, %xmm6, %xmm6
- addps %xmm11, %xmm1
- subps %xmm11, %xmm4
- addps %xmm6, %xmm12
- subps %xmm6, %xmm2
- movaps %xmm13, %xmm11
- movaps %xmm4, %xmm14
- movaps %xmm1, %xmm6
- subps %xmm10, %xmm13
- addps %xmm10, %xmm11
- xorps %xmm5, %xmm13
- addps %xmm11, %xmm4
- subps %xmm11, %xmm14
- shufps $177, %xmm13, %xmm13
+ movaps 0xFECA(%rdx,%rax,4), %xmm4
+ shufps $177, %xmm11, %xmm11
+ movaps %xmm4, %xmm1
+ shufps $177, %xmm6, %xmm6
+ addps %xmm11, %xmm1
+ subps %xmm11, %xmm4
+ addps %xmm6, %xmm12
+ subps %xmm6, %xmm2
+ movaps %xmm13, %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm1, %xmm6
+ subps %xmm10, %xmm13
+ addps %xmm10, %xmm11
+ xorps %xmm5, %xmm13
+ addps %xmm11, %xmm4
+ subps %xmm11, %xmm14
+ shufps $177, %xmm13, %xmm13
X8_const1_0:
movaps %xmm3, 0xFECA(%rdx,%rax,4)
X8_const1_1:
movaps %xmm4, 0xFECA(%rdx,%rax,4)
X8_const1_2:
- movaps %xmm2, 0xFECA(%rdx,%rax,4)
- subps %xmm13, %xmm1
- addps %xmm13, %xmm6
+ movaps %xmm2, 0xFECA(%rdx,%rax,4)
+ subps %xmm13, %xmm1
+ addps %xmm13, %xmm6
X8_const1_3:
- movaps %xmm1, 0xFECA(%rdx,%rax,4)
+ movaps %xmm1, 0xFECA(%rdx,%rax,4)
X8_const1_4:
movaps %xmm0, 0xFECA(%rdx,%rax,4)
X8_const1_5:
movaps %xmm14, 0xFECA(%rdx,%rax,4)
X8_const1_6:
- movaps %xmm12, 0xFECA(%rdx,%rax,4)
+ movaps %xmm12, 0xFECA(%rdx,%rax,4)
X8_const1_7:
movaps %xmm6, 0xFECA(%rdx,%rax,4)
- addq $4, %rax
- cmpq %rcx, %rax
+ addq $4, %rax
+ cmpq %rcx, %rax
jne X8_loop
-#ifdef __APPLE__
- .globl _sse_leaf_ee_offsets
- .globl _sse_leaf_oo_offsets
- .globl _sse_leaf_eo_offsets
- .globl _sse_leaf_oe_offsets
- .align 4
+#ifdef __APPLE__
+ .globl _sse_leaf_ee_offsets
+ .globl _sse_leaf_oo_offsets
+ .globl _sse_leaf_eo_offsets
+ .globl _sse_leaf_oe_offsets
+ .align 4
_sse_leaf_ee_offsets:
- .long LEAF_EE_const_0-_leaf_ee+0x4
- .long LEAF_EE_const_1-_leaf_ee+0x5
- .long LEAF_EE_const_2-_leaf_ee+0x5
- .long LEAF_EE_const_3-_leaf_ee+0x5
- .long LEAF_EE_const_4-_leaf_ee+0x5
- .long LEAF_EE_const_5-_leaf_ee+0x5
- .long LEAF_EE_const_6-_leaf_ee+0x4
- .long LEAF_EE_const_7-_leaf_ee+0x5
+ .long LEAF_EE_const_0-_leaf_ee+0x4
+ .long LEAF_EE_const_1-_leaf_ee+0x5
+ .long LEAF_EE_const_2-_leaf_ee+0x5
+ .long LEAF_EE_const_3-_leaf_ee+0x5
+ .long LEAF_EE_const_4-_leaf_ee+0x5
+ .long LEAF_EE_const_5-_leaf_ee+0x5
+ .long LEAF_EE_const_6-_leaf_ee+0x4
+ .long LEAF_EE_const_7-_leaf_ee+0x5
_sse_leaf_oo_offsets:
- .long LEAF_OO_const_0-_leaf_oo+0x4
- .long LEAF_OO_const_1-_leaf_oo+0x4
- .long LEAF_OO_const_2-_leaf_oo+0x5
- .long LEAF_OO_const_3-_leaf_oo+0x5
- .long LEAF_OO_const_4-_leaf_oo+0x4
- .long LEAF_OO_const_5-_leaf_oo+0x5
- .long LEAF_OO_const_6-_leaf_oo+0x5
- .long LEAF_OO_const_7-_leaf_oo+0x5
+ .long LEAF_OO_const_0-_leaf_oo+0x4
+ .long LEAF_OO_const_1-_leaf_oo+0x4
+ .long LEAF_OO_const_2-_leaf_oo+0x5
+ .long LEAF_OO_const_3-_leaf_oo+0x5
+ .long LEAF_OO_const_4-_leaf_oo+0x4
+ .long LEAF_OO_const_5-_leaf_oo+0x5
+ .long LEAF_OO_const_6-_leaf_oo+0x5
+ .long LEAF_OO_const_7-_leaf_oo+0x5
_sse_leaf_eo_offsets:
- .long LEAF_EO_const_0-_leaf_eo+0x5
- .long LEAF_EO_const_1-_leaf_eo+0x4
- .long LEAF_EO_const_2-_leaf_eo+0x4
- .long LEAF_EO_const_3-_leaf_eo+0x4
- .long LEAF_EO_const_4-_leaf_eo+0x5
- .long LEAF_EO_const_5-_leaf_eo+0x5
- .long LEAF_EO_const_6-_leaf_eo+0x4
- .long LEAF_EO_const_7-_leaf_eo+0x5
+ .long LEAF_EO_const_0-_leaf_eo+0x5
+ .long LEAF_EO_const_1-_leaf_eo+0x4
+ .long LEAF_EO_const_2-_leaf_eo+0x4
+ .long LEAF_EO_const_3-_leaf_eo+0x4
+ .long LEAF_EO_const_4-_leaf_eo+0x5
+ .long LEAF_EO_const_5-_leaf_eo+0x5
+ .long LEAF_EO_const_6-_leaf_eo+0x4
+ .long LEAF_EO_const_7-_leaf_eo+0x5
_sse_leaf_oe_offsets:
- .long LEAF_OE_const_0-_leaf_oe+0x5
- .long LEAF_OE_const_1-_leaf_oe+0x4
- .long LEAF_OE_const_2-_leaf_oe+0x4
- .long LEAF_OE_const_3-_leaf_oe+0x5
- .long LEAF_OE_const_4-_leaf_oe+0x5
- .long LEAF_OE_const_5-_leaf_oe+0x5
- .long LEAF_OE_const_6-_leaf_oe+0x4
- .long LEAF_OE_const_7-_leaf_oe+0x4
+ .long LEAF_OE_const_0-_leaf_oe+0x5
+ .long LEAF_OE_const_1-_leaf_oe+0x4
+ .long LEAF_OE_const_2-_leaf_oe+0x4
+ .long LEAF_OE_const_3-_leaf_oe+0x5
+ .long LEAF_OE_const_4-_leaf_oe+0x5
+ .long LEAF_OE_const_5-_leaf_oe+0x5
+ .long LEAF_OE_const_6-_leaf_oe+0x4
+ .long LEAF_OE_const_7-_leaf_oe+0x4
#else
- .globl sse_leaf_ee_offsets
- .globl sse_leaf_oo_offsets
- .globl sse_leaf_eo_offsets
- .globl sse_leaf_oe_offsets
- .align 4
+ .globl sse_leaf_ee_offsets
+ .globl sse_leaf_oo_offsets
+ .globl sse_leaf_eo_offsets
+ .globl sse_leaf_oe_offsets
+ .align 4
sse_leaf_ee_offsets:
- .long LEAF_EE_const_0-leaf_ee+0x4
- .long LEAF_EE_const_1-leaf_ee+0x5
- .long LEAF_EE_const_2-leaf_ee+0x5
- .long LEAF_EE_const_3-leaf_ee+0x5
- .long LEAF_EE_const_4-leaf_ee+0x5
- .long LEAF_EE_const_5-leaf_ee+0x5
- .long LEAF_EE_const_6-leaf_ee+0x4
- .long LEAF_EE_const_7-leaf_ee+0x5
+ .long LEAF_EE_const_0-leaf_ee+0x4
+ .long LEAF_EE_const_1-leaf_ee+0x5
+ .long LEAF_EE_const_2-leaf_ee+0x5
+ .long LEAF_EE_const_3-leaf_ee+0x5
+ .long LEAF_EE_const_4-leaf_ee+0x5
+ .long LEAF_EE_const_5-leaf_ee+0x5
+ .long LEAF_EE_const_6-leaf_ee+0x4
+ .long LEAF_EE_const_7-leaf_ee+0x5
sse_leaf_oo_offsets:
- .long LEAF_OO_const_0-leaf_oo+0x4
- .long LEAF_OO_const_1-leaf_oo+0x4
- .long LEAF_OO_const_2-leaf_oo+0x5
- .long LEAF_OO_const_3-leaf_oo+0x5
- .long LEAF_OO_const_4-leaf_oo+0x4
- .long LEAF_OO_const_5-leaf_oo+0x5
- .long LEAF_OO_const_6-leaf_oo+0x5
- .long LEAF_OO_const_7-leaf_oo+0x5
+ .long LEAF_OO_const_0-leaf_oo+0x4
+ .long LEAF_OO_const_1-leaf_oo+0x4
+ .long LEAF_OO_const_2-leaf_oo+0x5
+ .long LEAF_OO_const_3-leaf_oo+0x5
+ .long LEAF_OO_const_4-leaf_oo+0x4
+ .long LEAF_OO_const_5-leaf_oo+0x5
+ .long LEAF_OO_const_6-leaf_oo+0x5
+ .long LEAF_OO_const_7-leaf_oo+0x5
sse_leaf_eo_offsets:
- .long LEAF_EO_const_0-leaf_eo+0x5
- .long LEAF_EO_const_1-leaf_eo+0x4
- .long LEAF_EO_const_2-leaf_eo+0x4
- .long LEAF_EO_const_3-leaf_eo+0x4
- .long LEAF_EO_const_4-leaf_eo+0x5
- .long LEAF_EO_const_5-leaf_eo+0x5
- .long LEAF_EO_const_6-leaf_eo+0x4
- .long LEAF_EO_const_7-leaf_eo+0x5
+ .long LEAF_EO_const_0-leaf_eo+0x5
+ .long LEAF_EO_const_1-leaf_eo+0x4
+ .long LEAF_EO_const_2-leaf_eo+0x4
+ .long LEAF_EO_const_3-leaf_eo+0x4
+ .long LEAF_EO_const_4-leaf_eo+0x5
+ .long LEAF_EO_const_5-leaf_eo+0x5
+ .long LEAF_EO_const_6-leaf_eo+0x4
+ .long LEAF_EO_const_7-leaf_eo+0x5
sse_leaf_oe_offsets:
- .long LEAF_OE_const_0-leaf_oe+0x5
- .long LEAF_OE_const_1-leaf_oe+0x4
- .long LEAF_OE_const_2-leaf_oe+0x4
- .long LEAF_OE_const_3-leaf_oe+0x5
- .long LEAF_OE_const_4-leaf_oe+0x5
- .long LEAF_OE_const_5-leaf_oe+0x5
- .long LEAF_OE_const_6-leaf_oe+0x4
- .long LEAF_OE_const_7-leaf_oe+0x4
+ .long LEAF_OE_const_0-leaf_oe+0x5
+ .long LEAF_OE_const_1-leaf_oe+0x4
+ .long LEAF_OE_const_2-leaf_oe+0x4
+ .long LEAF_OE_const_3-leaf_oe+0x5
+ .long LEAF_OE_const_4-leaf_oe+0x5
+ .long LEAF_OE_const_5-leaf_oe+0x5
+ .long LEAF_OE_const_6-leaf_oe+0x4
+ .long LEAF_OE_const_7-leaf_oe+0x4
#endif
#ifdef __APPLE__
- .data
+ .data
#else
- .section .data
+ .section .data
#endif
- .p2align 4
-#ifdef __APPLE__
- .globl _sse_constants
+ .p2align 4
+#ifdef __APPLE__
+ .globl _sse_constants
_sse_constants:
#else
- .globl sse_constants
+ .globl sse_constants
sse_constants:
#endif
- .long 0x00000000,0x80000000,0x00000000,0x80000000
- .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
- .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
- .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
- .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
-#ifdef __APPLE__
- .globl _sse_constants_inv
+ .long 0x00000000,0x80000000,0x00000000,0x80000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
+#ifdef __APPLE__
+ .globl _sse_constants_inv
_sse_constants_inv:
#else
- .globl sse_constants_inv
+ .globl sse_constants_inv
sse_constants_inv:
#endif
- .long 0x80000000,0x00000000,0x80000000,0x00000000
- .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
- .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
- .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
- .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3
+ .long 0x80000000,0x00000000,0x80000000,0x00000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3