summary | refs | log | tree | commit | diff | stats
path: root/src/ffts_static.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/ffts_static.c')
-rw-r--r-- src/ffts_static.c | 86
1 files changed, 52 insertions, 34 deletions
diff --git a/src/ffts_static.c b/src/ffts_static.c
index 00cc96f..e1b2f6b 100644
--- a/src/ffts_static.c
+++ b/src/ffts_static.c
@@ -967,17 +967,19 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
} else if (N == 128) {
const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_f(data , 32, ws1);
- neon_static_x4_f(data + 64, 16, ws);
- neon_static_x4_f(data + 96, 16, ws);
+ neon_static_x8_f(data, 32, ws1);
+
+ neon_static_x4_f(data + 64, ws);
+ neon_static_x4_f(data + 96, ws);
+
neon_static_x8_f(data + 128, 32, ws1);
neon_static_x8_f(data + 192, 32, ws1);
neon_static_x8_f(data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
- neon_static_x4_f(data , 16, ws);
- neon_static_x4_f(data + 64, 16, ws);
- neon_static_x4_f(data + 96, 16, ws);
+ neon_static_x4_f(data , ws);
+ neon_static_x4_f(data + 64, ws);
+ neon_static_x4_f(data + 96, ws);
neon_static_x8_f(data, 64, ws + (p->ws_is[2] << 1));
} else {
@@ -1041,17 +1043,19 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
} else if (N == 128) {
const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_i(data , 32, ws1);
- neon_static_x4_i(data + 64, 16, ws);
- neon_static_x4_i(data + 96, 16, ws);
+ neon_static_x8_i(data, 32, ws1);
+
+ neon_static_x4_i(data + 64, ws);
+ neon_static_x4_i(data + 96, ws);
+
neon_static_x8_i(data + 128, 32, ws1);
neon_static_x8_i(data + 192, 32, ws1);
neon_static_x8_i(data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
- neon_static_x4_i(data , 16, ws);
- neon_static_x4_i(data + 64, 16, ws);
- neon_static_x4_i(data + 96, 16, ws);
+ neon_static_x4_i(data , ws);
+ neon_static_x4_i(data + 64, ws);
+ neon_static_x4_i(data + 96, ws);
neon_static_x8_i(data, 64, ws + (p->ws_is[2] << 1));
} else {
@@ -1122,21 +1126,28 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_f_32f(p, dout + N1 + N2, N3);
ffts_static_rec_f_32f(p, dout + N , N2);
ffts_static_rec_f_32f(p, dout + N + N1 , N2);
+
+ neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
} else if (N == 128) {
- const float *ws1 = ws + (p->ws_is[1] << 1);
+ neon_static_x8_f(dout, 32, ws + 8);
+
+ neon_static_x4_f(dout + 64, ws);
+ neon_static_x4_f(dout + 96, ws);
+
+ neon_static_x8_f(dout + 128, 32, ws + 8);
+ neon_static_x8_f(dout + 192, 32, ws + 8);
- neon_static_x8_f(dout , 32, ws1);
- neon_static_x4_f(dout + 64, 16, ws);
- neon_static_x4_f(dout + 96, 16, ws);
- neon_static_x8_f(dout + 128, 32, ws1);
- neon_static_x8_f(dout + 192, 32, ws1);
+ neon_static_x8_t_f(dout, 128, ws + 80);
} else if (N == 64) {
- neon_static_x4_f(dout , 16, ws);
- neon_static_x4_f(dout + 64, 16, ws);
- neon_static_x4_f(dout + 96, 16, ws);
- }
+ neon_static_x4_f(dout , ws);
+ neon_static_x4_f(dout + 64, ws);
+ neon_static_x4_f(dout + 96, ws);
- neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
+ neon_static_x8_t_f(dout, 64, ws + 32);
+ } else {
+ assert(N == 32);
+ neon_static_x8_t_f(dout, 32, ws + 8);
+ }
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 0);
@@ -1176,21 +1187,28 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_i_32f(p, dout + N1 + N2, N3);
ffts_static_rec_i_32f(p, dout + N , N2);
ffts_static_rec_i_32f(p, dout + N + N1 , N2);
+
+ neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
} else if (N == 128) {
- const float *ws1 = ws + (p->ws_is[1] << 1);
+ neon_static_x8_i(dout, 32, ws + 8);
+
+ neon_static_x4_i(dout + 64, ws);
+ neon_static_x4_i(dout + 96, ws);
- neon_static_x8_i(dout , 32, ws1);
- neon_static_x4_i(dout + 64, 16, ws);
- neon_static_x4_i(dout + 96, 16, ws);
- neon_static_x8_i(dout + 128, 32, ws1);
- neon_static_x8_i(dout + 192, 32, ws1);
+ neon_static_x8_i(dout + 128, 32, ws + 8);
+ neon_static_x8_i(dout + 192, 32, ws + 8);
+
+ neon_static_x8_t_i(dout, 128, ws + 80);
} else if (N == 64) {
- neon_static_x4_i(dout , 16, ws);
- neon_static_x4_i(dout + 64, 16, ws);
- neon_static_x4_i(dout + 96, 16, ws);
- }
+ neon_static_x4_i(dout , ws);
+ neon_static_x4_i(dout + 64, ws);
+ neon_static_x4_i(dout + 96, ws);
- neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
+ neon_static_x8_t_i(dout, 64, ws + 32);
+ } else {
+ assert(N == 32);
+ neon_static_x8_t_i(dout, 32, ws + 8);
+ }
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 1);
OpenPOWER on IntegriCloud