diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2016-03-11 14:32:22 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2016-03-11 14:32:22 +0200 |
commit | e667ca5e4304b31cd7093eaead481b032092b985 (patch) | |
tree | df057e6fa4502d1924eddb9bf496e5e9d338a417 | |
parent | 2051c214d591be08e40fdba623ccefabbba11b29 (diff) | |
download | ffts-e667ca5e4304b31cd7093eaead481b032092b985.zip ffts-e667ca5e4304b31cd7093eaead481b032092b985.tar.gz |
Restore ARM NEON optimized recursive version
-rw-r--r-- | src/ffts.c | 14 |
-rw-r--r-- | src/ffts_static.c | 84 |
2 files changed, 85 insertions, 13 deletions
@@ -55,7 +55,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__arm__) && !defined(DYNAMIC_DISABLED) +#if defined(HAVE_NEON) static const FFTS_ALIGN(64) float w_data[16] = { 0.70710678118654757273731092936941f, 0.70710678118654746171500846685376f, @@ -227,7 +227,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) if (n_luts) { size_t lut_size; -#if defined(__arm__) && !defined(DYNAMIC_DISABLED) +#if defined(__arm__) && !defined(HAVE_NEON) lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f) / 2; #else lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f); @@ -272,7 +272,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) w0[j][1] = tmp[j * stride][1]; } -#if defined(__arm__) && !defined(DYNAMIC_DISABLED) +#if defined(__arm__) #ifdef HAVE_NEON for (j = 0; j < n/4; j += 4) { V4SF2 temp0 = V4SF2_LD(fw0 + j*2); @@ -323,7 +323,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) w2[j][1] = tmp[(j + (n/8)) * stride][1]; } -#if defined(__arm__) && !defined(DYNAMIC_DISABLED) +#if defined(__arm__) #ifdef HAVE_NEON for (j = 0; j < n/8; j += 4) { V4SF2 temp0, temp1, temp2; @@ -389,11 +389,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) stride >>= 1; } -#if defined(__arm__) && !defined(DYNAMIC_DISABLED) +#if defined(HAVE_NEON) if (sign < 0) { - p->oe_ws = (void*)(&w_data[4]); + p->oe_ws = (void*)(w_data + 4); p->ee_ws = (void*)(w_data); - p->eo_ws = (void*)(&w_data[4]); + p->eo_ws = (void*)(w_data + 4); } else { p->oe_ws = (void*)(w_data + 12); p->ee_ws = (void*)(w_data + 8); diff --git a/src/ffts_static.c b/src/ffts_static.c index 701cca8..7747de0 100644 --- a/src/ffts_static.c +++ b/src/ffts_static.c @@ -36,6 +36,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ffts_internal.h" #include "macros.h" +#if defined(HAVE_NEON) +#include "neon.h" +#endif + #include <assert.h> static const FFTS_ALIGN(16) float ffts_constants_small_32f[24] = { @@ -945,6 +949,28 @@ ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out, static void ffts_static_rec_f_32f(ffts_plan_t *p, float *data, size_t N) { +#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) + if (N > 16) { + size_t N1 = N >> 1; + size_t N2 = N >> 2; + size_t N3 = N >> 3; + float *ws = ((float *)(p->ws)) + (p->ws_is[ffts_ctzl(N)-4] << 1); + + ffts_static_rec_f_32f(p, data, N2); + ffts_static_rec_f_32f(p, data + N1, N3); + ffts_static_rec_f_32f(p, data + N1 + N2, N3); + ffts_static_rec_f_32f(p, data + N, N2); + ffts_static_rec_f_32f(p, data + N + N1, N2); + + if (N == p->N) { + neon_static_x8_t_f(data, N, ws); + } else { + neon_static_x8_f(data, N, ws); + } + } else if (N == 16) { + neon_static_x4_f(data, N, p->ws); + } +#else const float *ws = (float*) p->ws; if (N > 128) { @@ -983,11 +1009,34 @@ ffts_static_rec_f_32f(ffts_plan_t *p, float *data, size_t N) assert(N == 16); V4SF_X_4(0, data, N, ws); } +#endif } static void ffts_static_rec_i_32f(ffts_plan_t *p, float *data, size_t N) { +#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) + if (N > 16) { + size_t N1 = N >> 1; + size_t N2 = N >> 2; + size_t N3 = N >> 3; + float *ws = ((float *)(p->ws)) + (p->ws_is[ffts_ctzl(N)-4] << 1); + + ffts_static_rec_i_32f(p, data, N2); + ffts_static_rec_i_32f(p, data + N1, N3); + ffts_static_rec_i_32f(p, data + N1 + N2, N3); + ffts_static_rec_i_32f(p, data + N, N2); + ffts_static_rec_i_32f(p, data + N + N1, N2); + + if (N == p->N) { + neon_static_x8_t_i(data, N, ws); + } else { + neon_static_x8_i(data, N, ws); + } + } else if(N==16) { + neon_static_x4_i(data, N, p->ws); + } +#else float *ws = (float*) p->ws; if (N > 128) { @@ -1026,28 +1075,51 @@ ffts_static_rec_i_32f(ffts_plan_t *p, float *data, size_t N) assert(N == 16); V4SF_X_4(1, data, N, ws); } +#endif } void 
ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) { + const float *din = (const float*) in; + float *dout = (float*) out; + +#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) if (ffts_ctzl(p->N) & 1) { - ffts_static_firstpass_odd_32f((float*) out, (const float*) in, p, 0); + neon_static_o_f(p, din, dout); } else { - ffts_static_firstpass_even_32f((float*) out, (const float*) in, p, 0); + neon_static_e_f(p, din, dout); } +#else + if (ffts_ctzl(p->N) & 1) { + ffts_static_firstpass_odd_32f(dout, din, p, 0); + } else { + ffts_static_firstpass_even_32f(dout, din, p, 0); + } +#endif - ffts_static_rec_f_32f(p, (float*) out, p->N); + ffts_static_rec_f_32f(p, dout, p->N); } void ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) { + const float *din = (const float*) in; + float *dout = (float*) out; + +#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) + if (ffts_ctzl(p->N) & 1) { + neon_static_o_i(p, din, dout); + } else { + neon_static_e_i(p, din, dout); + } +#else if (ffts_ctzl(p->N) & 1) { - ffts_static_firstpass_odd_32f((float*) out, (const float*) in, p, 1); + ffts_static_firstpass_odd_32f(dout, din, p, 1); } else { - ffts_static_firstpass_even_32f((float*) out, (const float*) in, p, 1); + ffts_static_firstpass_even_32f(dout, din, p, 1); } +#endif - ffts_static_rec_i_32f(p, (float*) out, p->N); + ffts_static_rec_i_32f(p, dout, p->N); }
\ No newline at end of file |