Diffstat (limited to 'src')
-rw-r--r--  src/Makefile.am  |   2
-rw-r--r--  src/Makefile.in  |  10
-rw-r--r--  src/codegen.c    |  88
-rw-r--r--  src/ffts.c       |  74
-rw-r--r--  src/ffts.h       |   1
-rw-r--r--  src/vfp.h        |  45
-rw-r--r--  src/vfp.s        | 483
7 files changed, 666 insertions, 37 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 5f6db47..69a8559 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -18,7 +18,7 @@ if HAVE_NEON
if DYNAMIC_DISABLED
libffts_la_SOURCES += neon_static_f.s neon_static_i.s
else
-libffts_la_SOURCES += neon.s
+libffts_la_SOURCES += neon.s vfp.s
endif
else
diff --git a/src/Makefile.in b/src/Makefile.in
index 1de271a..4e8661e 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -54,7 +54,7 @@ host_triplet = @host@
@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__append_3 = neon_static_f.s neon_static_i.s
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s vfp.s
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__append_5 = sse.s
subdir = src
DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \
@@ -100,13 +100,13 @@ LTLIBRARIES = $(lib_LTLIBRARIES)
libffts_la_LIBADD =
am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \
ffts_real_nd.c patterns.c ffts_static.c codegen.c \
- neon_static_f.s neon_static_i.s neon.s sse.s
+ neon_static_f.s neon_static_i.s neon.s vfp.s sse.s
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__objects_3 = \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@ neon_static_f.lo \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@ neon_static_i.lo
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo vfp.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__objects_5 = sse.lo
am_libffts_la_OBJECTS = ffts.lo ffts_nd.lo ffts_real.lo \
ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
@@ -279,9 +279,9 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
exit 1;; \
esac; \
done; \
- echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
$(am__cd) $(top_srcdir) && \
- $(AUTOMAKE) --foreign src/Makefile
+ $(AUTOMAKE) --gnu src/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
diff --git a/src/codegen.c b/src/codegen.c
index 4e0b633..21f8be0 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -46,6 +46,7 @@
#include "codegen_neon.h"
// #include "neon_float.h"
#include "neon.h"
+ #include "vfp.h"
#else
#include "codegen_sse.h"
#include "sse_float.h"
@@ -201,6 +202,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
insns_t *x_8_addr = fp;
+#ifdef __arm__
#ifdef __ARM_NEON__
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
if(sign < 0) {
@@ -210,6 +212,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
fp += (neon_x8_t - neon_x8) / 4;
#else
+ memcpy(fp, vfp_x8, vfp_end - vfp_x8);
+ fp += (vfp_end - vfp_x8) / 4;
+#endif
+#else
align_mem16(&fp, 0);
x_8_addr = fp;
align_mem16(&fp, 5);
@@ -221,6 +227,8 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
insns_t *x_4_addr = fp;
+#ifdef __arm__
+
#ifdef __ARM_NEON__
memcpy(fp, neon_x4, neon_x8 - neon_x4);
if(sign < 0) {
@@ -228,6 +236,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
fp += (neon_x8 - neon_x4) / 4;
#else
+ memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
+ fp += (vfp_x8 - vfp_x4) / 4;
+#endif
+#else
align_mem16(&fp, 0);
x_4_addr = fp;
memcpy(fp, x4, x8_soft - x4);
@@ -257,9 +269,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
#endif
-#ifdef __ARM_NEON__
- *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
- MOVI(&fp, 11, p->i0);
+#ifdef __arm__
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+ #ifdef __ARM_NEON__
+ MOVI(&fp, 11, p->i0);
+ #else
+ MOVI(&fp, 11, p->i0);
+ #endif
+
#else
align_mem16(&fp, 0);
start = fp;
@@ -273,15 +290,20 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
#endif
//fp++;
+#ifdef __arm__
#ifdef __ARM_NEON__
memcpy(fp, neon_ee, neon_oo - neon_ee);
- if(sign < 0) {
- fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
- fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
- fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
- }
+ if(sign < 0) {
+ fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
+ fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
+ fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+ }
fp += (neon_oo - neon_ee) / 4;
#else
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+ fp += (vfp_o - vfp_e) / 4;
+#endif
+#else
//fprintf(stderr, "Body start address = %016p\n", start);
PUSH(&fp, RBP);
@@ -403,14 +425,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
if(pps[0] == 2*leafN) {
- CALL(&fp, x_4_addr);
+ // CALL(&fp, x_4_addr);
// }else if(!pps[2]){
// //uint32_t *x_8_t_addr = fp;
// memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
// fp += (neon_ee - neon_x8_t) / 4;
// //*fp++ = BL(fp+2, x_8_t_addr);
}else{
- CALL(&fp, x_8_addr);
+ // CALL(&fp, x_8_addr);
}
pAddr = pps[1] * 4;
@@ -422,6 +444,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
pps += 2;
}
#endif
+#ifdef __arm__
#ifdef __ARM_NEON__
if(__builtin_ctzl(N) & 1){
ADDI(&fp, 2, 7, 0);
@@ -519,7 +542,45 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
fp += (neon_oo - neon_ee) / 4;
}
+#else
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+ MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
+ memcpy(fp, vfp_o, vfp_x4 - vfp_o);
+ fp += (vfp_x4 - vfp_o) / 4;
+
+ ADDI(&fp, 2, 3, 0);
+ ADDI(&fp, 3, 7, 0);
+ ADDI(&fp, 7, 2, 0);
+
+ ADDI(&fp, 2, 4, 0);
+ ADDI(&fp, 4, 8, 0);
+ ADDI(&fp, 8, 2, 0);
+
+ ADDI(&fp, 2, 5, 0);
+ ADDI(&fp, 5, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 6, 0);
+ ADDI(&fp, 6, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ ADDI(&fp, 2, 9, 0);
+ ADDI(&fp, 9, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+ MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+ fp += (vfp_o - vfp_e) / 4;
+
+#endif
*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
//ADDI(&fp, 2, 1, 0);
MOVI(&fp, 1, 0);
@@ -551,6 +612,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
*fp = BL(fp+2, x_4_addr); fp++;
}else if(!pps[2]){
//uint32_t *x_8_t_addr = fp;
+#ifdef __ARM_NEON__
memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
@@ -559,6 +621,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
fp += (neon_ee - neon_x8_t) / 4;
//*fp++ = BL(fp+2, x_8_t_addr);
+
+#else
+ *fp = BL(fp+2, x_8_addr); fp++;
+#endif
}else{
*fp = BL(fp+2, x_8_addr); fp++;
}
@@ -612,7 +678,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
exit(1);
}
#ifdef __APPLE__
-// sys_icache_invalidate(func, p->transform_size);
+ sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
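
The hunk above re-enables sys_icache_invalidate() on Apple targets: once the generated instructions have been written through the data cache, the instruction cache must be invalidated before the buffer is executed. A minimal sketch of the same step, assuming a GCC/Clang toolchain (flush_icache is a hypothetical helper, not FFTS API):

    #include <stddef.h>
    #ifdef __APPLE__
    #include <libkern/OSCacheControl.h>
    #endif

    /* Make freshly written JIT code visible to instruction fetch. */
    static void flush_icache(void *begin, size_t len)
    {
    #ifdef __APPLE__
        sys_icache_invalidate(begin, len);
    #else
        /* GCC/Clang builtin; lowers to the platform's cache flush. */
        __builtin___clear_cache((char *)begin, (char *)begin + len);
    #endif
    }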
diff --git a/src/ffts.c b/src/ffts.c
index 320db2c..481a66e 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -104,13 +104,30 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
if(N >= 32) {
ffts_init_offsets(p, N, leafN);
- ffts_init_is(p, N, leafN, 2);
+#ifdef __arm__
+#ifdef __ARM_NEON__
+ ffts_init_is(p, N, leafN, 1);
+#else
+ ffts_init_is(p, N, leafN, 1);
+#endif
+#else
+ ffts_init_is(p, N, leafN, 1);
+#endif
p->i0 = N/leafN/3+1;
p->i1 = N/leafN/3;
if((N/leafN) % 3 > 1) p->i1++;
- p->i0/=2;
- p->i1/=2;
+ p->i2 = N/leafN/3;
+
+ #ifdef __arm__
+ #ifdef __ARM_NEON__
+ p->i0/=2;
+ p->i1/=2;
+ #endif
+ #else
+ p->i0/=2;
+ p->i1/=2;
+ #endif
}else{
p->transforms = malloc(2 * sizeof(transform_index_t));
@@ -198,7 +215,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
float *fw0 = (float *)w0;
- #ifdef __ARM_NEON__
+ #ifdef __arm__
if(N < 32) {
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
@@ -217,11 +234,18 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
float *fw = (float *)w;
VS temp0, temp1, temp2;
+ #ifdef __ARM_NEON__
for(j=0;j<n/4;j+=4) {
- temp0 = VLD2(fw0 + j*2);
- temp0.val[1] = VXOR(temp0.val[1], neg);
- STORESPR(fw + j*2, temp0);
+ temp0 = VLD2(fw0 + j*2);
+ temp0.val[1] = VXOR(temp0.val[1], neg);
+ STORESPR(fw + j*2, temp0);
+ }
+ #else
+ for(j=0;j<n/4;j+=1) {
+ fw[j*2] = fw0[j*2];
+ fw[j*2+1] = fw0[j*2+1];
}
+ #endif
w += n/4;
}
#else
@@ -261,22 +285,32 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
float *fw0 = (float *)w0;
float *fw1 = (float *)w1;
float *fw2 = (float *)w2;
- #ifdef __ARM_NEON__
+ #ifdef __arm__
//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
float *fw = (float *)w;
VS temp0, temp1, temp2;
-
- for(j=0;j<n/8;j+=4) {
- temp0 = VLD2(fw0 + j*2);
- temp0.val[1] = VXOR(temp0.val[1], neg);
- STORESPR(fw + j*2*3, temp0);
- temp1 = VLD2(fw1 + j*2);
- temp1.val[1] = VXOR(temp1.val[1], neg);
- STORESPR(fw + j*2*3 + 8, temp1);
- temp2 = VLD2(fw2 + j*2);
- temp2.val[1] = VXOR(temp2.val[1], neg);
- STORESPR(fw + j*2*3 + 16, temp2);
- }
+ #ifdef __ARM_NEON__
+ for(j=0;j<n/8;j+=4) {
+ temp0 = VLD2(fw0 + j*2);
+ temp0.val[1] = VXOR(temp0.val[1], neg);
+ STORESPR(fw + j*2*3, temp0);
+ temp1 = VLD2(fw1 + j*2);
+ temp1.val[1] = VXOR(temp1.val[1], neg);
+ STORESPR(fw + j*2*3 + 8, temp1);
+ temp2 = VLD2(fw2 + j*2);
+ temp2.val[1] = VXOR(temp2.val[1], neg);
+ STORESPR(fw + j*2*3 + 16, temp2);
+ }
+ #else
+ for(j=0;j<n/8;j+=1) {
+ fw[j*6] = fw0[j*2];
+ fw[j*6+1] = fw0[j*2+1];
+ fw[j*6+2] = fw1[j*2+0];
+ fw[j*6+3] = fw1[j*2+1];
+ fw[j*6+4] = fw2[j*2+0];
+ fw[j*6+5] = fw2[j*2+1];
+ }
+ #endif
w += n/8 * 3;
#else
//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
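
The scalar branches added above pack the twiddle factors into one interleaved stream so the VFP loops can walk a single pointer with fixed offsets; the NEON branch does the equivalent packing with VLD2/STORESPR. A standalone sketch of the three-stream layout (function name hypothetical):

    #include <stddef.h>

    /* Pack three complex twiddle streams as six floats per index:
       [w0.re w0.im w1.re w1.im w2.re w2.im] ... (n8 = n/8 entries). */
    static void pack_twiddles3(float *fw, const float *fw0,
                               const float *fw1, const float *fw2,
                               size_t n8)
    {
        for (size_t j = 0; j < n8; j++) {
            fw[j*6+0] = fw0[j*2+0];  fw[j*6+1] = fw0[j*2+1];
            fw[j*6+2] = fw1[j*2+0];  fw[j*6+3] = fw1[j*2+1];
            fw[j*6+4] = fw2[j*2+0];  fw[j*6+5] = fw2[j*2+1];
        }
    }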
diff --git a/src/ffts.h b/src/ffts.h
index 4ebb66f..26c2550 100644
--- a/src/ffts.h
+++ b/src/ffts.h
@@ -94,6 +94,7 @@ struct _ffts_plan_t {
void (*destroy)(ffts_plan_t *);
float *A, *B;
+ size_t i2;
};
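
The new i2 field holds the iteration count for the second even-leaf pass that the VFP path emits (loaded by MOVI(&fp, 11, p->i2 ...) in codegen.c above). A worked instance of the counter arithmetic from ffts_init_1d, assuming N = 64 and leafN = 8; on the NEON path i0 and i1 are then halved, while the scalar path keeps them as-is:

    size_t N = 64, leafN = 8;            /* N/leafN = 8 leaf transforms */
    size_t i0 = N/leafN/3 + 1;           /* 8/3 + 1 = 3                 */
    size_t i1 = N/leafN/3;               /* 8/3     = 2 ...             */
    if ((N/leafN) % 3 > 1) i1++;         /* 8 % 3 = 2 > 1, so i1 = 3    */
    size_t i2 = N/leafN/3;               /* 8/3     = 2                 */
    /* i0 + i1 + i2 == 8: every leaf is visited exactly once. */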
diff --git a/src/vfp.h b/src/vfp.h
new file mode 100644
index 0000000..f733a3f
--- /dev/null
+++ b/src/vfp.h
@@ -0,0 +1,45 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, 2013 The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __VFP_H__
+#define __VFP_H__
+
+#include "ffts.h"
+
+void vfp_e();
+void vfp_o();
+void vfp_x4();
+void vfp_x8();
+void vfp_end();
+
+#endif
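
These five declarations are address markers rather than callable functions: the routines sit back-to-back in vfp.s, so codegen.c measures each routine as the distance between consecutive labels and copies it into the JIT buffer. A hedged sketch of that pattern (helper name hypothetical):

    #include <stdint.h>
    #include <string.h>
    #include "vfp.h"

    /* fp counts 32-bit ARM instruction words, as in codegen.c. */
    static uint32_t *emit_vfp_x8(uint32_t *fp)
    {
        size_t bytes = (uintptr_t)vfp_end - (uintptr_t)vfp_x8;
        memcpy(fp, (const void *)vfp_x8, bytes);
        return fp + bytes / 4;
    }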
diff --git a/src/vfp.s b/src/vfp.s
new file mode 100644
index 0000000..cd865b8
--- /dev/null
+++ b/src/vfp.s
@@ -0,0 +1,483 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, 2013 The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+@ assumes r0 = out
+@ r1 = in ?
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 = const pointer
+@ & lr = temps
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_e
+_vfp_e:
+#else
+ .globl vfp_e
+vfp_e:
+#endif
+_vfp_e_loop:
+ vldr s9, [r8] @ x5
+ vldr s18, [r4] @ x1
+ vldr s12, [r8, #4]
+ vldr s7, [r2]
+ vldr s4, [r9] @ x6
+ vldr s14, [r9, #4]
+ vldr s16, [r2, #8]
+ vldr s6, [r10, #4]
+ vldr s0, [r10] @ x7
+ vldr s20, [r3] @ x0
+ vldr s11, [r7] @ x4
+ vldr s5, [r7, #4]
+ vsub.f32 s8, s4, s0
+ vsub.f32 s1, s14, s6
+ vsub.f32 s10, s5, s12
+ vsub.f32 s3, s11, s9
+ vldr s30, [r5, #4]
+ vldr s26, [r6, #4]
+ vadd.f32 s0, s4, s0
+ vadd.f32 s4, s11, s9
+ vldr s24, [r6] @ x3
+ vldr s28, [r5] @ x2
+ vadd.f32 s14, s14, s6
+ vadd.f32 s5, s5, s12
+ vadd.f32 s2, s20, s18
+ vsub.f32 s6, s20, s18
+ vmul.f32 s9, s3, s7
+ vmul.f32 s11, s3, s16
+ vmul.f32 s15, s8, s7
+ vmul.f32 s13, s10, s16
+ vmul.f32 s20, s10, s7
+ vmul.f32 s22, s1, s16
+ vadd.f32 s12, s30, s26
+ vsub.f32 s3, s30, s26
+ vmul.f32 s26, s8, s16
+ vldr s16, [r4, #4]
+ vmul.f32 s30, s1, s7
+ vldr s7, [r3, #4]
+ add r3, r3, #8
+ add r4, r4, #8
+ add r5, r5, #8
+ add r6, r6, #8
+ add r7, r7, #8
+ add r8, r8, #8
+ add r9, r9, #8
+ add r10, r10, #8
+ vsub.f32 s10, s9, s13
+ vadd.f32 s18, s28, s24
+ vadd.f32 s8, s15, s22
+ vadd.f32 s22, s7, s16
+ vsub.f32 s13, s28, s24
+ vsub.f32 s16, s7, s16
+ vadd.f32 s28, s20, s11
+ vsub.f32 s26, s30, s26
+ vadd.f32 s1, s4, s0
+ vadd.f32 s9, s2, s18
+ vsub.f32 s11, s2, s18
+ vadd.f32 s15, s22, s12
+ vadd.f32 s7, s5, s14
+ vsub.f32 s18, s5, s14
+ vsub.f32 s5, s4, s0
+ vsub.f32 s22, s22, s12
+ vadd.f32 s12, s10, s8
+ vadd.f32 s2, s6, s3
+ vsub.f32 s0, s6, s3
+ vsub.f32 s8, s10, s8
+ vsub.f32 s4, s16, s13
+ vadd.f32 s14, s16, s13
+ vadd.f32 s3, s28, s26
+ vsub.f32 s6, s28, s26
+ vadd.f32 s20, s9, s1
+ vsub.f32 s16, s9, s1
+ vadd.f32 s10, s2, s12
+ vadd.f32 s24, s15, s7
+ vsub.f32 s13, s15, s7
+ vadd.f32 s28, s11, s18
+ vsub.f32 s7, s11, s18
+ vadd.f32 s9, s0, s6
+ldr lr, [r12], #4
+add lr, r0, lr, lsl #2
+subs r11, r11, #1
+ vstr s20, [lr]
+ vsub.f32 s6, s0, s6
+ vsub.f32 s26, s22, s5
+ vadd.f32 s1, s22, s5
+ vstr s24, [lr, #4]
+ vadd.f32 s5, s4, s3
+ vsub.f32 s11, s14, s8
+ vstr s28, [lr, #16]
+ vsub.f32 s2, s2, s12
+ vsub.f32 s4, s4, s3
+ vadd.f32 s0, s14, s8
+ vstr s26, [lr, #20]
+ vstr s16, [lr, #32]
+ vstr s13, [lr, #36]
+ vstr s7, [lr, #48]
+ vstr s1, [lr, #52]
+ vstr s10, [lr, #8]
+ vstr s5, [lr, #12]
+ vstr s9, [lr, #24]
+ vstr s11, [lr, #28]
+ vstr s2, [lr, #40]
+ vstr s4, [lr, #44]
+ vstr s6, [lr, #56]
+ vstr s0, [lr, #60]
+ bne _vfp_e_loop
+
+@ assumes r0 = out
+@ r1 = in ?
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 & lr = temps
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_o
+_vfp_o:
+#else
+ .globl vfp_o
+vfp_o:
+#endif
+ _vfp_o_loop:
+ vldr s2, [r4]
+ vldr s0, [r4, #4]
+ vldr s12, [r5]
+ vldr s4, [r6]
+ vldr s14, [r6, #4]
+ vldr s3, [r5, #4]
+ vadd.f32 s6, s12, s4
+ vldr s7, [r3]
+ vldr s5, [r3, #4]
+ subs r11, r11, #1
+ ldr r2, [r12], #4
+ add r2, r0, r2, lsl #2
+ vadd.f32 s8, s3, s14
+ vadd.f32 s10, s7, s2
+ vadd.f32 s1, s5, s0
+ vsub.f32 s14, s3, s14
+ vsub.f32 s3, s7, s2
+ vsub.f32 s2, s12, s4
+ vsub.f32 s0, s5, s0
+ vadd.f32 s9, s10, s6
+ vsub.f32 s10, s10, s6
+ vadd.f32 s7, s1, s8
+ vadd.f32 s5, s3, s14
+ vsub.f32 s4, s3, s14
+ vldr s3, [r9, #4]
+ vsub.f32 s12, s0, s2
+ vadd.f32 s0, s0, s2
+ vldr s2, [r8]
+ vsub.f32 s6, s1, s8
+ vstr s9, [r2]
+ vstr s7, [r2, #4]
+ vstr s5, [r2, #8]
+ vstr s12, [r2, #12]
+ vstr s10, [r2, #16]
+ vstr s6, [r2, #20]
+ vstr s4, [r2, #24]
+ vstr s0, [r2, #28]
+ vldr s0, [r8, #4]
+ vldr s12, [r9]
+ vldr s4, [r10]
+ vldr s14, [r10, #4]
+ vldr s7, [r7]
+ vadd.f32 s6, s12, s4
+ vldr s5, [r7, #4]
+ add r3, r3, #8
+ add r4, r4, #8
+ add r5, r5, #8
+ add r6, r6, #8
+ add r7, r7, #8
+ add r8, r8, #8
+ add r9, r9, #8
+ add r10, r10, #8
+ vadd.f32 s8, s3, s14
+ vadd.f32 s10, s7, s2
+ vadd.f32 s1, s5, s0
+ vsub.f32 s14, s3, s14
+ vsub.f32 s3, s7, s2
+ vsub.f32 s2, s12, s4
+ vsub.f32 s0, s5, s0
+ vadd.f32 s9, s10, s6
+ vsub.f32 s10, s10, s6
+ vadd.f32 s7, s1, s8
+ vadd.f32 s5, s3, s14
+ vsub.f32 s6, s1, s8
+ vsub.f32 s12, s0, s2
+ vsub.f32 s4, s3, s14
+ vadd.f32 s0, s0, s2
+ vstr s9, [r2, #32]
+ vstr s7, [r2, #36]
+ vstr s5, [r2, #40]
+ vstr s12, [r2, #44]
+ vstr s10, [r2, #48]
+ vstr s6, [r2, #52]
+ vstr s4, [r2, #56]
+ vstr s0, [r2, #60]
+ bne _vfp_o_loop
+
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_x4
+_vfp_x4:
+#else
+ .globl vfp_x4
+vfp_x4:
+#endif
+ add r3, r0, #0
+ add r7, r2, #0
+ add r4, r0, r1, lsl #1
+ add r5, r0, r1, lsl #2
+ add r6, r4, r1, lsl #2
+ mov r11, #4
+_vfp_x4_loop:
+
+ vldr s8, [r3, #0]
+ vldr s9, [r3, #4]
+ vldr s10, [r4, #0]
+ vldr s11, [r4, #4]
+ vldr s12, [r5, #0]
+ vldr s13, [r5, #4]
+ vldr s14, [r6, #0]
+ vldr s15, [r6, #4]
+ vldr s2, [r7, #0]
+ vldr s3, [r7, #4]
+ add r7, r7, #8
+ subs r11, r11, #1
+ vmul.f32 s0, s13, s3
+ vmul.f32 s5, s12, s2
+ vmul.f32 s1, s14, s2
+ vmul.f32 s4, s14, s3
+ vmul.f32 s14, s12, s3
+ vmul.f32 s13, s13, s2
+ vmul.f32 s12, s15, s3
+ vmul.f32 s2, s15, s2
+ vsub.f32 s0, s5, s0
+ vadd.f32 s13, s13, s14
+ vadd.f32 s12, s12, s1
+ vsub.f32 s1, s2, s4
+ vadd.f32 s15, s0, s12
+ vsub.f32 s12, s0, s12
+ vadd.f32 s14, s13, s1
+ vsub.f32 s13, s13, s1
+ vadd.f32 s0, s8, s15
+ vadd.f32 s1, s9, s14
+ vadd.f32 s2, s10, s13 @
+ vsub.f32 s4, s8, s15
+ vsub.f32 s3, s11, s12 @
+ vstr s0, [r3, #0]
+ vstr s1, [r3, #4]
+ add r3, r3, #8
+ vsub.f32 s5, s9, s14
+ vsub.f32 s6, s10, s13 @
+ vadd.f32 s7, s11, s12 @
+ vstr s2, [r4, #0]
+ vstr s3, [r4, #4]
+ add r4, r4, #8
+ vstr s4, [r5, #0]
+ vstr s5, [r5, #4]
+ add r5, r5, #8
+ vstr s6, [r6, #0]
+ vstr s7, [r6, #4]
+ add r6, r6, #8
+ bne _vfp_x4_loop
+ bx lr
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_x8
+_vfp_x8:
+#else
+ .globl vfp_x8
+vfp_x8:
+#endif
+ mov r11, #0
+ add r3, r0, #0 @ data0
+ add r5, r0, r1, lsl #1 @ data2
+ add r4, r0, r1 @ data1
+ add r7, r5, r1, lsl #1 @ data4
+ add r6, r5, r1 @ data3
+ add r9, r7, r1, lsl #1 @ data6
+ add r8, r7, r1 @ data5
+ add r10, r9, r1 @ data7
+ add r12, r2, #0 @ LUT
+
+ sub r11, r11, r1, lsr #3
+_vfp_x8_loop:
+ vldr s2, [r12, #0]
+ vldr s3, [r12, #4]
+ add r12, r12, #8
+ vldr s14, [r6, #0]
+ vldr s15, [r6, #4]
+ vldr s10, [r5, #0]
+ vldr s11, [r5, #4]
+ adds r11, r11, #1
+ vmul.f32 s12, s15, s2
+ vmul.f32 s8, s14, s3
+ vmul.f32 s13, s14, s2
+ vmul.f32 s9, s10, s3
+ vmul.f32 s1, s10, s2
+ vmul.f32 s0, s11, s2
+ vmul.f32 s14, s11, s3
+ vmul.f32 s15, s15, s3
+ vldr s2, [r12, #0]
+ vldr s3, [r12, #4]
+ add r12, r12, #8
+ vsub.f32 s10, s12, s8
+ vadd.f32 s11, s0, s9
+ vadd.f32 s8, s15, s13
+ vldr s12, [r4, #0]
+ vldr s13, [r4, #4]
+ vsub.f32 s9, s1, s14
+ vsub.f32 s15, s11, s10
+ vsub.f32 s14, s9, s8
+ vadd.f32 s4, s12, s15 @
+ vsub.f32 s6, s12, s15 @
+ vsub.f32 s5, s13, s14 @
+ vadd.f32 s7, s13, s14 @
+ vldr s14, [r9, #0]
+ vldr s15, [r9, #4]
+ vldr s12, [r7, #0]
+ vldr s13, [r7, #4]
+ vmul.f32 s1, s14, s2
+ vmul.f32 s0, s14, s3
+ vstr s4, [r4, #0]
+ vstr s5, [r4, #4]
+ vmul.f32 s14, s15, s3
+ vmul.f32 s4, s15, s2
+ vadd.f32 s15, s9, s8
+ vstr s6, [r6, #0]
+ vstr s7, [r6, #4]
+ vmul.f32 s8, s12, s3
+ vmul.f32 s5, s13, s3
+ vmul.f32 s12, s12, s2
+ vmul.f32 s9, s13, s2
+ vadd.f32 s14, s14, s1
+ vsub.f32 s13, s4, s0
+ vadd.f32 s0, s9, s8
+ vldr s8, [r3, #0]
+ vldr s9, [r3, #4]
+ vadd.f32 s1, s11, s10
+ vsub.f32 s12, s12, s5
+ vadd.f32 s11, s8, s15
+ vsub.f32 s8, s8, s15
+ vadd.f32 s2, s12, s14
+ vsub.f32 s10, s0, s13
+ vadd.f32 s15, s0, s13
+ vadd.f32 s13, s9, s1
+ vsub.f32 s9, s9, s1
+ vsub.f32 s12, s12, s14
+ vadd.f32 s0, s11, s2
+ vadd.f32 s1, s13, s15
+ vsub.f32 s4, s11, s2
+ vadd.f32 s2, s8, s10 @
+ vsub.f32 s3, s9, s12 @
+ vstr s0, [r3, #0]
+ vstr s1, [r3, #4]
+ add r3, r3, #8
+ vsub.f32 s5, s13, s15
+ vldr s14, [r10, #0]
+ vldr s15, [r10, #4]
+ vadd.f32 s7, s9, s12 @
+ vldr s12, [r8, #0]
+ vldr s13, [r8, #4]
+ vstr s2, [r5, #0]
+ vstr s3, [r5, #4]
+ add r5, r5, #8
+ vldr s2, [r12, #0]
+ vldr s3, [r12, #4]
+ add r12, r12, #8
+ vsub.f32 s6, s8, s10 @
+ vmul.f32 s8, s14, s2
+ vstr s4, [r7, #0]
+ vstr s5, [r7, #4]
+ add r7, r7, #8
+ vmul.f32 s10, s15, s3
+ vmul.f32 s9, s13, s3
+ vmul.f32 s11, s12, s2
+ vmul.f32 s14, s14, s3
+ vstr s6, [r9, #0]
+ vstr s7, [r9, #4]
+ add r9, r9, #8
+ vmul.f32 s15, s15, s2
+ vmul.f32 s12, s12, s3
+ vmul.f32 s13, s13, s2
+ vadd.f32 s10, s10, s8
+ vsub.f32 s11, s11, s9
+ vldr s8, [r4, #0]
+ vldr s9, [r4, #4]
+ vsub.f32 s14, s15, s14
+ vadd.f32 s15, s13, s12
+ vadd.f32 s13, s11, s10
+ vadd.f32 s12, s15, s14
+ vsub.f32 s15, s15, s14
+ vsub.f32 s14, s11, s10
+ vldr s10, [r6, #0]
+ vldr s11, [r6, #4]
+ vadd.f32 s0, s8, s13
+ vadd.f32 s1, s9, s12
+ vadd.f32 s2, s10, s15 @
+ vsub.f32 s3, s11, s14 @
+ vsub.f32 s4, s8, s13
+ vstr s0, [r4, #0]
+ vstr s1, [r4, #4]
+ add r4, r4, #8
+ vsub.f32 s5, s9, s12
+ vsub.f32 s6, s10, s15 @
+ vstr s2, [r6, #0]
+ vstr s3, [r6, #4]
+ add r6, r6, #8
+ vadd.f32 s7, s11, s14 @
+ vstr s4, [r8, #0]
+ vstr s5, [r8, #4]
+ add r8, r8, #8
+ vstr s6, [r10, #0]
+ vstr s7, [r10, #4]
+ add r10, r10, #8
+ bne _vfp_x8_loop
+ bx lr
+
+ .align 4
+#ifdef __APPLE__
+ .globl _vfp_end
+_vfp_end:
+#else
+ .globl vfp_end
+vfp_end:
+#endif
+ bx lr
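
For reference, a hypothetical scalar transcription of one vfp_x4 loop iteration, derived by reading the assembly above and not verified against the library: d2 is rotated by the twiddle w and d3 by conj(w) (conjugate-pair twiddles), and the terms produced by the @-marked, sign-sensitive vadd/vsub lines land on the d1/d3 outputs here.

    typedef struct { float re, im; } cf;   /* illustrative complex type */

    static void x4_butterfly(cf *d0, cf *d1, cf *d2, cf *d3, cf w)
    {
        cf t2 = { d2->re*w.re - d2->im*w.im,     /* t2 = d2 * w       */
                  d2->im*w.re + d2->re*w.im };
        cf t3 = { d3->re*w.re + d3->im*w.im,     /* t3 = d3 * conj(w) */
                  d3->im*w.re - d3->re*w.im };
        float sre = t2.re + t3.re, dre = t2.re - t3.re;
        float sim = t2.im + t3.im, dim = t2.im - t3.im;
        cf a = *d0, b = *d1;
        d0->re = a.re + sre;  d0->im = a.im + sim;
        d1->re = b.re + dim;  d1->im = b.im - dre;
        d2->re = a.re - sre;  d2->im = a.im - sim;
        d3->re = b.re - dim;  d3->im = b.im + dre;
    }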