From 17aaf9d6216cd7c608824a2b4fd0e735004e612e Mon Sep 17 00:00:00 2001
From: Anthony Blake
Date: Wed, 3 Apr 2013 14:57:43 +1300
Subject: Added VFP support for ARM

---
 src/Makefile.am |   2 +-
 src/Makefile.in |  10 +-
 src/codegen.c   |  88 +++++++++--
 src/ffts.c      |  74 ++++++---
 src/ffts.h      |   1 +
 src/vfp.h       |  45 ++++++
 src/vfp.s       | 483 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 666 insertions(+), 37 deletions(-)
 create mode 100644 src/vfp.h
 create mode 100644 src/vfp.s

(limited to 'src')

diff --git a/src/Makefile.am b/src/Makefile.am
index 5f6db47..69a8559 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -18,7 +18,7 @@ if HAVE_NEON
 if DYNAMIC_DISABLED
 libffts_la_SOURCES += neon_static_f.s neon_static_i.s
 else
-libffts_la_SOURCES += neon.s
+libffts_la_SOURCES += neon.s vfp.s
 endif

 else
diff --git a/src/Makefile.in b/src/Makefile.in
index 1de271a..4e8661e 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -54,7 +54,7 @@ host_triplet = @host@
 @DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
 @DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__append_3 = neon_static_f.s neon_static_i.s
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s vfp.s
 @HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__append_5 = sse.s
 subdir = src
 DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \
@@ -100,13 +100,13 @@ LTLIBRARIES = $(lib_LTLIBRARIES)
 libffts_la_LIBADD =
 am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \
 	ffts_real_nd.c patterns.c ffts_static.c codegen.c \
-	neon_static_f.s neon_static_i.s neon.s sse.s
+	neon_static_f.s neon_static_i.s neon.s vfp.s sse.s
 @DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
 @DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__objects_3 = \
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@	neon_static_f.lo \
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@	neon_static_i.lo
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo vfp.lo
 @HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__objects_5 = sse.lo
 am_libffts_la_OBJECTS = ffts.lo ffts_nd.lo ffts_real.lo \
 	ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
@@ -279,9 +279,9 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
 	        exit 1;; \
 	      esac; \
 	    done; \
-	    echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \
+	    echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
 	    $(am__cd) $(top_srcdir) && \
-	      $(AUTOMAKE) --foreign src/Makefile
+	      $(AUTOMAKE) --gnu src/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
diff --git a/src/codegen.c b/src/codegen.c
index 4e0b633..21f8be0 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -46,6 +46,7 @@
 #include "codegen_neon.h"
 // #include "neon_float.h"
 #include "neon.h"
+#include "vfp.h"
 #else
 #include "codegen_sse.h"
 #include "sse_float.h"
@@ -201,6 +202,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 	}

 	insns_t *x_8_addr = fp;
+#ifdef __arm__
 #ifdef __ARM_NEON__
 	memcpy(fp, neon_x8, neon_x8_t - neon_x8);
 	if(sign < 0) {
@@ -210,6 +212,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 	}
 	fp += (neon_x8_t - neon_x8) / 4;
 #else
+	memcpy(fp, vfp_x8, vfp_end - vfp_x8);
+	fp += (vfp_end - vfp_x8) / 4;
+#endif
+#else
 	align_mem16(&fp, 0);
 	x_8_addr = fp;
 	align_mem16(&fp, 5);
@@ -221,6 +227,8 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 	//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
 	//fp += (neon_end - neon_x8_t) / 4;
 	insns_t *x_4_addr = fp;
+#ifdef __arm__
+
 #ifdef __ARM_NEON__
 	memcpy(fp, neon_x4, neon_x8 - neon_x4);
 	if(sign < 0) {
@@ -228,6 +236,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 	}
 	fp += (neon_x8 - neon_x4) / 4;
 #else
+	memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
+	fp += (vfp_x8 - vfp_x4) / 4;
+#endif
+#else
 	align_mem16(&fp, 0);
 	x_4_addr = fp;
 	memcpy(fp, x4, x8_soft - x4);
@@ -257,9 +269,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 #endif

-#ifdef __ARM_NEON__
-	*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
-	MOVI(&fp, 11, p->i0);
+#ifdef __arm__
+	*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+  #ifdef __ARM_NEON__
+	MOVI(&fp, 11, p->i0);
+  #else
+	MOVI(&fp, 11, p->i0);
+  #endif
+
 #else
 	align_mem16(&fp, 0);
 	start = fp;
@@ -273,15 +290,20 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 	//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
 #endif
 	//fp++;
+#ifdef __arm__
 #ifdef __ARM_NEON__
 	memcpy(fp, neon_ee, neon_oo - neon_ee);
-	if(sign < 0) {
-		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
-		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
-		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
-	}
+	if(sign < 0) {
+		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
+		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
+		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+	}
 	fp += (neon_oo - neon_ee) / 4;
 #else
+	memcpy(fp, vfp_e, vfp_o - vfp_e);
+	fp += (vfp_o - vfp_e) / 4;
+#endif
+#else
 	//fprintf(stderr, "Body start address = %016p\n", start);

 	PUSH(&fp, RBP);
@@ -403,14 +425,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {

 		if(pps[0] == 2*leafN) {
-			CALL(&fp, x_4_addr);
+		//	CALL(&fp, x_4_addr);
 	//	}else if(!pps[2]){
 	//		//uint32_t *x_8_t_addr = fp;
 	//		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
 	//		fp += (neon_ee - neon_x8_t) / 4;
 	//		//*fp++ = BL(fp+2, x_8_t_addr);
 		}else{
-			CALL(&fp, x_8_addr);
+		//	CALL(&fp, x_8_addr);
 		}

 		pAddr = pps[1] * 4;
@@ -422,6 +444,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 		pps += 2;
 	}
 #endif
+#ifdef __arm__
 #ifdef __ARM_NEON__
 	if(__builtin_ctzl(N) & 1){
 		ADDI(&fp, 2, 7, 0);
@@ -519,7 +542,45 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 		fp += (neon_oo - neon_ee) / 4;
 	}

+#else
+	ADDI(&fp, 2, 7, 0);
+	ADDI(&fp, 7, 9, 0);
+	ADDI(&fp, 9, 2, 0);
+
+	ADDI(&fp, 2, 8, 0);
+	ADDI(&fp, 8, 10, 0);
+	ADDI(&fp, 10, 2, 0);
+	MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
+	memcpy(fp, vfp_o, vfp_x4 - vfp_o);
+	fp += (vfp_x4 - vfp_o) / 4;
+
+	ADDI(&fp, 2, 3, 0);
+	ADDI(&fp, 3, 7, 0);
+	ADDI(&fp, 7, 2, 0);
+
+	ADDI(&fp, 2, 4, 0);
+	ADDI(&fp, 4, 8, 0);
+	ADDI(&fp, 8, 2, 0);
+
+	ADDI(&fp, 2, 5, 0);
+	ADDI(&fp, 5, 9, 0);
+	ADDI(&fp, 9, 2, 0);
+
+	ADDI(&fp, 2, 6, 0);
+	ADDI(&fp, 6, 10, 0);
+	ADDI(&fp, 10, 2, 0);
+
+	ADDI(&fp, 2, 9, 0);
+	ADDI(&fp, 9, 10, 0);
+	ADDI(&fp, 10, 2, 0);
+
+	*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+	MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
+	memcpy(fp, vfp_e, vfp_o - vfp_e);
+	fp += (vfp_o - vfp_e) / 4;
+
+#endif
 	*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
 	//ADDI(&fp, 2, 1, 0);
 	MOVI(&fp, 1, 0);
@@ -551,6 +612,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 			*fp = BL(fp+2, x_4_addr); fp++;
 		}else if(!pps[2]){
 			//uint32_t *x_8_t_addr = fp;
+#ifdef __ARM_NEON__
 			memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
 			if(sign < 0) {
 				fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
@@ -559,6 +621,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 			}
 			fp += (neon_ee - neon_x8_t) / 4;
 			//*fp++ = BL(fp+2, x_8_t_addr);
+
+#else
+			*fp = BL(fp+2, x_8_addr); fp++;
+#endif
 		}else{
 			*fp = BL(fp+2, x_8_addr); fp++;
 		}
@@ -612,7 +678,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 		exit(1);
 	}
 #ifdef __APPLE__
-//	sys_icache_invalidate(func, p->transform_size);
+	sys_icache_invalidate(func, p->transform_size);
 #elif __ANDROID__
 	cacheflush((long)(func), (long)(func) + p->transform_size, 0);
 #elif __linux__
diff --git a/src/ffts.c b/src/ffts.c
index 320db2c..481a66e 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -104,13 +104,30 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {

 	if(N >= 32) {
 		ffts_init_offsets(p, N, leafN);
-		ffts_init_is(p, N, leafN, 2);
+#ifdef __arm__
+#ifdef __ARM_NEON__
+		ffts_init_is(p, N, leafN, 1);
+#else
+		ffts_init_is(p, N, leafN, 1);
+#endif
+#else
+		ffts_init_is(p, N, leafN, 1);
+#endif

 		p->i0 = N/leafN/3+1;
 		p->i1 = N/leafN/3;
 		if((N/leafN) % 3 > 1) p->i1++;
-		p->i0/=2;
-		p->i1/=2;
+		p->i2 = N/leafN/3;
+
+  #ifdef __arm__
+  #ifdef __ARM_NEON__
+		p->i0/=2;
+		p->i1/=2;
+  #endif
+  #else
+		p->i0/=2;
+		p->i1/=2;
+  #endif

 	}else{
 		p->transforms = malloc(2 * sizeof(transform_index_t));
@@ -198,7 +215,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {

 	float *fw0 = (float *)w0;
-	#ifdef __ARM_NEON__
+	#ifdef __arm__
 	if(N < 32) {
 		//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
 		float *fw = (float *)w;
@@ -217,11 +234,18 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
 		//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
 		float *fw = (float *)w;
 		VS temp0, temp1, temp2;
+	#ifdef __ARM_NEON__
 		for(j=0;j
+  Copyright (c) 2012, 2013 The University of Waikato
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+  	* Redistributions of source code must retain the above copyright
+  		notice, this list of conditions and the following disclaimer.
+  	* Redistributions in binary form must reproduce the above copyright
+  		notice, this list of conditions and the following disclaimer in the
+  		documentation and/or other materials provided with the distribution.
+  	* Neither the name of the organization nor the
+  	  names of its contributors may be used to endorse or promote products
+  		derived from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __VFP_H__
+#define __VFP_H__
+
+#include "ffts.h"
+
+void vfp_e();
+void vfp_o();
+void vfp_x4();
+void vfp_x8();
+void vfp_end();
+
+#endif
diff --git a/src/vfp.s b/src/vfp.s
new file mode 100644
index 0000000..cd865b8
--- /dev/null
+++ b/src/vfp.s
@@ -0,0 +1,483 @@
+/*
+
+  This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+  Copyright (c) 2012, 2013 Anthony M. Blake
+  Copyright (c) 2012, 2013 The University of Waikato
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+  	* Redistributions of source code must retain the above copyright
+  		notice, this list of conditions and the following disclaimer.
+  	* Redistributions in binary form must reproduce the above copyright
+  		notice, this list of conditions and the following disclaimer in the
+  		documentation and/or other materials provided with the distribution.
+  	* Neither the name of the organization nor the
+  	  names of its contributors may be used to endorse or promote products
+  		derived from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+@ assumes r0 = out
+@ r1 = in ?
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 = const pointer
+@ & lr = temps
+
+	.align 4
+#ifdef __APPLE__
+	.globl _vfp_e
+_vfp_e:
+#else
+	.globl vfp_e
+vfp_e:
+#endif
+_vfp_e_loop:
+	vldr s9, [r8]		@ x5
+	vldr s18, [r4]		@ x1
+	vldr s12, [r8, #4]
+	vldr s7, [r2]
+	vldr s4, [r9]		@ x6
+	vldr s14, [r9, #4]
+	vldr s16, [r2, #8]
+	vldr s6, [r10, #4]
+	vldr s0, [r10]		@ x7
+	vldr s20, [r3]		@ x0
+	vldr s11, [r7]		@ x4
+	vldr s5, [r7, #4]
+	vsub.f32 s8, s4, s0
+	vsub.f32 s1, s14, s6
+	vsub.f32 s10, s5, s12
+	vsub.f32 s3, s11, s9
+	vldr s30, [r5, #4]
+	vldr s26, [r6, #4]
+	vadd.f32 s0, s4, s0
+	vadd.f32 s4, s11, s9
+	vldr s24, [r6]		@ x3
+	vldr s28, [r5]		@ x2
+	vadd.f32 s14, s14, s6
+	vadd.f32 s5, s5, s12
+	vadd.f32 s2, s20, s18
+	vsub.f32 s6, s20, s18
+	vmul.f32 s9, s3, s7
+	vmul.f32 s11, s3, s16
+	vmul.f32 s15, s8, s7
+	vmul.f32 s13, s10, s16
+	vmul.f32 s20, s10, s7
+	vmul.f32 s22, s1, s16
+	vadd.f32 s12, s30, s26
+	vsub.f32 s3, s30, s26
+	vmul.f32 s26, s8, s16
+	vldr s16, [r4, #4]
+	vmul.f32 s30, s1, s7
+	vldr s7, [r3, #4]
+	add r3, r3, #8
+	add r4, r4, #8
+	add r5, r5, #8
+	add r6, r6, #8
+	add r7, r7, #8
+	add r8, r8, #8
+	add r9, r9, #8
+	add r10, r10, #8
+	vsub.f32 s10, s9, s13
+	vadd.f32 s18, s28, s24
+	vadd.f32 s8, s15, s22
+	vadd.f32 s22, s7, s16
+	vsub.f32 s13, s28, s24
+	vsub.f32 s16, s7, s16
+	vadd.f32 s28, s20, s11
+	vsub.f32 s26, s30, s26
+	vadd.f32 s1, s4, s0
+	vadd.f32 s9, s2, s18
+	vsub.f32 s11, s2, s18
+	vadd.f32 s15, s22, s12
+	vadd.f32 s7, s5, s14
+	vsub.f32 s18, s5, s14
+	vsub.f32 s5, s4, s0
+	vsub.f32 s22, s22, s12
+	vadd.f32 s12, s10, s8
+	vadd.f32 s2, s6, s3
+	vsub.f32 s0, s6, s3
+	vsub.f32 s8, s10, s8
+	vsub.f32 s4, s16, s13
+	vadd.f32 s14, s16, s13
+	vadd.f32 s3, s28, s26
+	vsub.f32 s6, s28, s26
+	vadd.f32 s20, s9, s1
+	vsub.f32 s16, s9, s1
+	vadd.f32 s10, s2, s12
+	vadd.f32 s24, s15, s7
+	vsub.f32 s13, s15, s7
+	vadd.f32 s28, s11, s18
+	vsub.f32 s7, s11, s18
+	vadd.f32 s9, s0, s6
+ldr lr, [r12], #4
+add lr, r0, lr, lsl #2
+subs r11, r11, #1
+	vstr s20, [lr]
+	vsub.f32 s6, s0, s6
+	vsub.f32 s26, s22, s5
+	vadd.f32 s1, s22, s5
+	vstr s24, [lr, #4]
+	vadd.f32 s5, s4, s3
+	vsub.f32 s11, s14, s8
+	vstr s28, [lr, #16]
+	vsub.f32 s2, s2, s12
+	vsub.f32 s4, s4, s3
+	vadd.f32 s0, s14, s8
+	vstr s26, [lr, #20]
+	vstr s16, [lr, #32]
+	vstr s13, [lr, #36]
+	vstr s7, [lr, #48]
+	vstr s1, [lr, #52]
+	vstr s10, [lr, #8]
+	vstr s5, [lr, #12]
+	vstr s9, [lr, #24]
+	vstr s11, [lr, #28]
+	vstr s2, [lr, #40]
+	vstr s4, [lr, #44]
+	vstr s6, [lr, #56]
+	vstr s0, [lr, #60]
+	bne _vfp_e_loop
+
+@ assumes r0 = out
+@ r1 = in ?
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 & lr = temps
+
+	.align 4
+#ifdef __APPLE__
+	.globl _vfp_o
+_vfp_o:
+#else
+	.globl vfp_o
+vfp_o:
+#endif
+_vfp_o_loop:
+	vldr s2, [r4]
+	vldr s0, [r4, #4]
+	vldr s12, [r5]
+	vldr s4, [r6]
+	vldr s14, [r6, #4]
+	vldr s3, [r5, #4]
+	vadd.f32 s6, s12, s4
+	vldr s7, [r3]
+	vldr s5, [r3, #4]
+	subs r11, r11, #1
+	ldr r2, [r12], #4
+	add r2, r0, r2, lsl #2
+	vadd.f32 s8, s3, s14
+	vadd.f32 s10, s7, s2
+	vadd.f32 s1, s5, s0
+	vsub.f32 s14, s3, s14
+	vsub.f32 s3, s7, s2
+	vsub.f32 s2, s12, s4
+	vsub.f32 s0, s5, s0
+	vadd.f32 s9, s10, s6
+	vsub.f32 s10, s10, s6
+	vadd.f32 s7, s1, s8
+	vadd.f32 s5, s3, s14
+	vsub.f32 s4, s3, s14
+	vldr s3, [r9, #4]
+	vsub.f32 s12, s0, s2
+	vadd.f32 s0, s0, s2
+	vldr s2, [r8]
+	vsub.f32 s6, s1, s8
+	vstr s9, [r2]
+	vstr s7, [r2, #4]
+	vstr s5, [r2, #8]
+	vstr s12, [r2, #12]
+	vstr s10, [r2, #16]
+	vstr s6, [r2, #20]
+	vstr s4, [r2, #24]
+	vstr s0, [r2, #28]
+	vldr s0, [r8, #4]
+	vldr s12, [r9]
+	vldr s4, [r10]
+	vldr s14, [r10, #4]
+	vldr s7, [r7]
+	vadd.f32 s6, s12, s4
+	vldr s5, [r7, #4]
+	add r3, r3, #8
+	add r4, r4, #8
+	add r5, r5, #8
+	add r6, r6, #8
+	add r7, r7, #8
+	add r8, r8, #8
+	add r9, r9, #8
+	add r10, r10, #8
+	vadd.f32 s8, s3, s14
+	vadd.f32 s10, s7, s2
+	vadd.f32 s1, s5, s0
+	vsub.f32 s14, s3, s14
+	vsub.f32 s3, s7, s2
+	vsub.f32 s2, s12, s4
+	vsub.f32 s0, s5, s0
+	vadd.f32 s9, s10, s6
+	vsub.f32 s10, s10, s6
+	vadd.f32 s7, s1, s8
+	vadd.f32 s5, s3, s14
+	vsub.f32 s6, s1, s8
+	vsub.f32 s12, s0, s2
+	vsub.f32 s4, s3, s14
+	vadd.f32 s0, s0, s2
+	vstr s9, [r2, #32]
+	vstr s7, [r2, #36]
+	vstr s5, [r2, #40]
+	vstr s12, [r2, #44]
+	vstr s10, [r2, #48]
+	vstr s6, [r2, #52]
+	vstr s4, [r2, #56]
+	vstr s0, [r2, #60]
+	bne _vfp_o_loop
+
+
+	.align 4
+#ifdef __APPLE__
+	.globl _vfp_x4
+_vfp_x4:
+#else
+	.globl vfp_x4
+vfp_x4:
+#endif
+	add r3, r0, #0
+	add r7, r2, #0
+	add r4, r0, r1, lsl #1
+	add r5, r0, r1, lsl #2
+	add r6, r4, r1, lsl #2
+	mov r11, #4
+_vfp_x4_loop:
+
+	vldr s8, [r3, #0]
+	vldr s9, [r3, #4]
+	vldr s10, [r4, #0]
+	vldr s11, [r4, #4]
+	vldr s12, [r5, #0]
+	vldr s13, [r5, #4]
+	vldr s14, [r6, #0]
+	vldr s15, [r6, #4]
+	vldr s2, [r7, #0]
+	vldr s3, [r7, #4]
+	add r7, r7, #8
+	subs r11, r11, #1
+	vmul.f32 s0, s13, s3
+	vmul.f32 s5, s12, s2
+	vmul.f32 s1, s14, s2
+	vmul.f32 s4, s14, s3
+	vmul.f32 s14, s12, s3
+	vmul.f32 s13, s13, s2
+	vmul.f32 s12, s15, s3
+	vmul.f32 s2, s15, s2
+	vsub.f32 s0, s5, s0
+	vadd.f32 s13, s13, s14
+	vadd.f32 s12, s12, s1
+	vsub.f32 s1, s2, s4
+	vadd.f32 s15, s0, s12
+	vsub.f32 s12, s0, s12
+	vadd.f32 s14, s13, s1
+	vsub.f32 s13, s13, s1
+	vadd.f32 s0, s8, s15
+	vadd.f32 s1, s9, s14
+	vadd.f32 s2, s10, s13	@
+	vsub.f32 s4, s8, s15
+	vsub.f32 s3, s11, s12	@
+	vstr s0, [r3, #0]
+	vstr s1, [r3, #4]
+	add r3, r3, #8
+	vsub.f32 s5, s9, s14
+	vsub.f32 s6, s10, s13	@
+	vadd.f32 s7, s11, s12	@
+	vstr s2, [r4, #0]
+	vstr s3, [r4, #4]
+	add r4, r4, #8
+	vstr s4, [r5, #0]
+	vstr s5, [r5, #4]
+	add r5, r5, #8
+	vstr s6, [r6, #0]
+	vstr s7, [r6, #4]
+	add r6, r6, #8
+	bne _vfp_x4_loop
+	bx lr
+
+	.align 4
+#ifdef __APPLE__
+	.globl _vfp_x8
+_vfp_x8:
+#else
+	.globl vfp_x8
+vfp_x8:
+#endif
+	mov r11, #0
+	add r3, r0, #0			@ data0
+	add r5, r0, r1, lsl #1	@ data2
+	add r4, r0, r1			@ data1
+	add r7, r5, r1, lsl #1	@ data4
+	add r6, r5, r1			@ data3
+	add r9, r7, r1, lsl #1	@ data6
+	add r8, r7, r1			@ data5
+	add r10, r9, r1			@ data7
+	add r12, r2, #0			@ LUT
+
+	sub r11, r11, r1, lsr #3
+_vfp_x8_loop:
+	vldr s2, [r12, #0]
+	vldr s3, [r12, #4]
+	add r12, r12, #8
+	vldr s14, [r6, #0]
+	vldr s15, [r6, #4]
+	vldr s10, [r5, #0]
+	vldr s11, [r5, #4]
+	adds r11, r11, #1
+	vmul.f32 s12, s15, s2
+	vmul.f32 s8, s14, s3
+	vmul.f32 s13, s14, s2
+	vmul.f32 s9, s10, s3
+	vmul.f32 s1, s10, s2
+	vmul.f32 s0, s11, s2
+	vmul.f32 s14, s11, s3
+	vmul.f32 s15, s15, s3
+	vldr s2, [r12, #0]
+	vldr s3, [r12, #4]
+	add r12, r12, #8
+	vsub.f32 s10, s12, s8
+	vadd.f32 s11, s0, s9
+	vadd.f32 s8, s15, s13
+	vldr s12, [r4, #0]
+	vldr s13, [r4, #4]
+	vsub.f32 s9, s1, s14
+	vsub.f32 s15, s11, s10
+	vsub.f32 s14, s9, s8
+	vadd.f32 s4, s12, s15	@
+	vsub.f32 s6, s12, s15	@
+	vsub.f32 s5, s13, s14	@
+	vadd.f32 s7, s13, s14	@
+	vldr s14, [r9, #0]
+	vldr s15, [r9, #4]
+	vldr s12, [r7, #0]
+	vldr s13, [r7, #4]
+	vmul.f32 s1, s14, s2
+	vmul.f32 s0, s14, s3
+	vstr s4, [r4, #0]
+	vstr s5, [r4, #4]
+	vmul.f32 s14, s15, s3
+	vmul.f32 s4, s15, s2
+	vadd.f32 s15, s9, s8
+	vstr s6, [r6, #0]
+	vstr s7, [r6, #4]
+	vmul.f32 s8, s12, s3
+	vmul.f32 s5, s13, s3
+	vmul.f32 s12, s12, s2
+	vmul.f32 s9, s13, s2
+	vadd.f32 s14, s14, s1
+	vsub.f32 s13, s4, s0
+	vadd.f32 s0, s9, s8
+	vldr s8, [r3, #0]
+	vldr s9, [r3, #4]
+	vadd.f32 s1, s11, s10
+	vsub.f32 s12, s12, s5
+	vadd.f32 s11, s8, s15
+	vsub.f32 s8, s8, s15
+	vadd.f32 s2, s12, s14
+	vsub.f32 s10, s0, s13
+	vadd.f32 s15, s0, s13
+	vadd.f32 s13, s9, s1
+	vsub.f32 s9, s9, s1
+	vsub.f32 s12, s12, s14
+	vadd.f32 s0, s11, s2
+	vadd.f32 s1, s13, s15
+	vsub.f32 s4, s11, s2
+	vadd.f32 s2, s8, s10	@
+	vsub.f32 s3, s9, s12	@
+	vstr s0, [r3, #0]
+	vstr s1, [r3, #4]
+	add r3, r3, #8
+	vsub.f32 s5, s13, s15
+	vldr s14, [r10, #0]
+	vldr s15, [r10, #4]
+	vadd.f32 s7, s9, s12	@
+	vldr s12, [r8, #0]
+	vldr s13, [r8, #4]
+	vstr s2, [r5, #0]
+	vstr s3, [r5, #4]
+	add r5, r5, #8
+	vldr s2, [r12, #0]
+	vldr s3, [r12, #4]
+	add r12, r12, #8
+	vsub.f32 s6, s8, s10	@
+	vmul.f32 s8, s14, s2
+	vstr s4, [r7, #0]
+	vstr s5, [r7, #4]
+	add r7, r7, #8
+	vmul.f32 s10, s15, s3
+	vmul.f32 s9, s13, s3
+	vmul.f32 s11, s12, s2
+	vmul.f32 s14, s14, s3
+	vstr s6, [r9, #0]
+	vstr s7, [r9, #4]
+	add r9, r9, #8
+	vmul.f32 s15, s15, s2
+	vmul.f32 s12, s12, s3
+	vmul.f32 s13, s13, s2
+	vadd.f32 s10, s10, s8
+	vsub.f32 s11, s11, s9
+	vldr s8, [r4, #0]
+	vldr s9, [r4, #4]
+	vsub.f32 s14, s15, s14
+	vadd.f32 s15, s13, s12
+	vadd.f32 s13, s11, s10
+	vadd.f32 s12, s15, s14
+	vsub.f32 s15, s15, s14
+	vsub.f32 s14, s11, s10
+	vldr s10, [r6, #0]
+	vldr s11, [r6, #4]
+	vadd.f32 s0, s8, s13
+	vadd.f32 s1, s9, s12
+	vadd.f32 s2, s10, s15	@
+	vsub.f32 s3, s11, s14	@
+	vsub.f32 s4, s8, s13
+	vstr s0, [r4, #0]
+	vstr s1, [r4, #4]
+	add r4, r4, #8
+	vsub.f32 s5, s9, s12
+	vsub.f32 s6, s10, s15	@
+	vstr s2, [r6, #0]
+	vstr s3, [r6, #4]
+	add r6, r6, #8
+	vadd.f32 s7, s11, s14	@
+	vstr s4, [r8, #0]
+	vstr s5, [r8, #4]
+	add r8, r8, #8
+	vstr s6, [r10, #0]
+	vstr s7, [r10, #4]
+	add r10, r10, #8
+	bne _vfp_x8_loop
+	bx lr
+
+	.align 4
+#ifdef __APPLE__
+	.globl _vfp_end
+_vfp_end:
+#else
+	.globl vfp_end
+vfp_end:
+#endif
+	bx lr
--
cgit v1.1