diff options
Diffstat (limited to 'src/ffts_static.c')
-rw-r--r-- | src/ffts_static.c | 86 |
1 file changed, 52 insertions, 34 deletions
diff --git a/src/ffts_static.c b/src/ffts_static.c index 00cc96f..e1b2f6b 100644 --- a/src/ffts_static.c +++ b/src/ffts_static.c @@ -967,17 +967,19 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N) } else if (N == 128) { const float *ws1 = ws + (p->ws_is[1] << 1); - neon_static_x8_f(data , 32, ws1); - neon_static_x4_f(data + 64, 16, ws); - neon_static_x4_f(data + 96, 16, ws); + neon_static_x8_f(data, 32, ws1); + + neon_static_x4_f(data + 64, ws); + neon_static_x4_f(data + 96, ws); + neon_static_x8_f(data + 128, 32, ws1); neon_static_x8_f(data + 192, 32, ws1); neon_static_x8_f(data, 128, ws + (p->ws_is[3] << 1)); } else if (N == 64) { - neon_static_x4_f(data , 16, ws); - neon_static_x4_f(data + 64, 16, ws); - neon_static_x4_f(data + 96, 16, ws); + neon_static_x4_f(data , ws); + neon_static_x4_f(data + 64, ws); + neon_static_x4_f(data + 96, ws); neon_static_x8_f(data, 64, ws + (p->ws_is[2] << 1)); } else { @@ -1041,17 +1043,19 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N) } else if (N == 128) { const float *ws1 = ws + (p->ws_is[1] << 1); - neon_static_x8_i(data , 32, ws1); - neon_static_x4_i(data + 64, 16, ws); - neon_static_x4_i(data + 96, 16, ws); + neon_static_x8_i(data, 32, ws1); + + neon_static_x4_i(data + 64, ws); + neon_static_x4_i(data + 96, ws); + neon_static_x8_i(data + 128, 32, ws1); neon_static_x8_i(data + 192, 32, ws1); neon_static_x8_i(data, 128, ws + (p->ws_is[3] << 1)); } else if (N == 64) { - neon_static_x4_i(data , 16, ws); - neon_static_x4_i(data + 64, 16, ws); - neon_static_x4_i(data + 96, 16, ws); + neon_static_x4_i(data , ws); + neon_static_x4_i(data + 64, ws); + neon_static_x4_i(data + 96, ws); neon_static_x8_i(data, 64, ws + (p->ws_is[2] << 1)); } else { @@ -1122,21 +1126,28 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) ffts_static_rec_f_32f(p, dout + N1 + N2, N3); ffts_static_rec_f_32f(p, dout + N , N2); ffts_static_rec_f_32f(p, dout + N + N1 , N2); + + 
neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); } else if (N == 128) { - const float *ws1 = ws + (p->ws_is[1] << 1); + neon_static_x8_f(dout, 32, ws + 8); + + neon_static_x4_f(dout + 64, ws); + neon_static_x4_f(dout + 96, ws); + + neon_static_x8_f(dout + 128, 32, ws + 8); + neon_static_x8_f(dout + 192, 32, ws + 8); - neon_static_x8_f(dout , 32, ws1); - neon_static_x4_f(dout + 64, 16, ws); - neon_static_x4_f(dout + 96, 16, ws); - neon_static_x8_f(dout + 128, 32, ws1); - neon_static_x8_f(dout + 192, 32, ws1); + neon_static_x8_t_f(dout, 128, ws + 80); } else if (N == 64) { - neon_static_x4_f(dout , 16, ws); - neon_static_x4_f(dout + 64, 16, ws); - neon_static_x4_f(dout + 96, 16, ws); - } + neon_static_x4_f(dout , ws); + neon_static_x4_f(dout + 64, ws); + neon_static_x4_f(dout + 96, ws); - neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); + neon_static_x8_t_f(dout, 64, ws + 32); + } else { + assert(N == 32); + neon_static_x8_t_f(dout, 32, ws + 8); + } #else if (N_log_2 & 1) { ffts_static_firstpass_odd_32f(dout, din, p, 0); @@ -1176,21 +1187,28 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) ffts_static_rec_i_32f(p, dout + N1 + N2, N3); ffts_static_rec_i_32f(p, dout + N , N2); ffts_static_rec_i_32f(p, dout + N + N1 , N2); + + neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); } else if (N == 128) { - const float *ws1 = ws + (p->ws_is[1] << 1); + neon_static_x8_i(dout, 32, ws + 8); + + neon_static_x4_i(dout + 64, ws); + neon_static_x4_i(dout + 96, ws); - neon_static_x8_i(dout , 32, ws1); - neon_static_x4_i(dout + 64, 16, ws); - neon_static_x4_i(dout + 96, 16, ws); - neon_static_x8_i(dout + 128, 32, ws1); - neon_static_x8_i(dout + 192, 32, ws1); + neon_static_x8_i(dout + 128, 32, ws + 8); + neon_static_x8_i(dout + 192, 32, ws + 8); + + neon_static_x8_t_i(dout, 128, ws + 80); } else if (N == 64) { - neon_static_x4_i(dout , 16, ws); - neon_static_x4_i(dout + 64, 16, ws); - neon_static_x4_i(dout + 96, 16, 
ws); - } + neon_static_x4_i(dout , ws); + neon_static_x4_i(dout + 64, ws); + neon_static_x4_i(dout + 96, ws); - neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); + neon_static_x8_t_i(dout, 64, ws + 32); + } else { + assert(N == 32); + neon_static_x8_t_i(dout, 32, ws + 8); + } #else if (N_log_2 & 1) { ffts_static_firstpass_odd_32f(dout, din, p, 1); |