summary | refs | log | tree | commit | diff | stats
path: root/src/ffts_static.c
diff options
context:
space:
mode:
author: Jukka Ojanen <jukka.ojanen@linkotec.net>, 2016-03-15 11:08:23 +0200
committer: Jukka Ojanen <jukka.ojanen@linkotec.net>, 2016-03-15 11:08:23 +0200
commit: f64d89a9385e5981a3e175a205ee3fdf69773e61 (patch)
tree: 7e5045c78afec2da80b70fde3bc58e22a4484024 /src/ffts_static.c
parent: 85a7167fdb139dc249330d34120080109878ea8f (diff)
download: ffts-f64d89a9385e5981a3e175a205ee3fdf69773e61.zip
download: ffts-f64d89a9385e5981a3e175a205ee3fdf69773e61.tar.gz
neon_static_x4_f and neon_static_x4_i do not use the second argument passed to them, so drop it from their callers; also reschedule instructions for possible dual issue
Diffstat (limited to 'src/ffts_static.c')
-rw-r--r-- src/ffts_static.c | 86
1 file changed, 52 insertions, 34 deletions
diff --git a/src/ffts_static.c b/src/ffts_static.c
index 00cc96f..e1b2f6b 100644
--- a/src/ffts_static.c
+++ b/src/ffts_static.c
@@ -967,17 +967,19 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
} else if (N == 128) {
const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_f(data , 32, ws1);
- neon_static_x4_f(data + 64, 16, ws);
- neon_static_x4_f(data + 96, 16, ws);
+ neon_static_x8_f(data, 32, ws1);
+
+ neon_static_x4_f(data + 64, ws);
+ neon_static_x4_f(data + 96, ws);
+
neon_static_x8_f(data + 128, 32, ws1);
neon_static_x8_f(data + 192, 32, ws1);
neon_static_x8_f(data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
- neon_static_x4_f(data , 16, ws);
- neon_static_x4_f(data + 64, 16, ws);
- neon_static_x4_f(data + 96, 16, ws);
+ neon_static_x4_f(data , ws);
+ neon_static_x4_f(data + 64, ws);
+ neon_static_x4_f(data + 96, ws);
neon_static_x8_f(data, 64, ws + (p->ws_is[2] << 1));
} else {
@@ -1041,17 +1043,19 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
} else if (N == 128) {
const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_i(data , 32, ws1);
- neon_static_x4_i(data + 64, 16, ws);
- neon_static_x4_i(data + 96, 16, ws);
+ neon_static_x8_i(data, 32, ws1);
+
+ neon_static_x4_i(data + 64, ws);
+ neon_static_x4_i(data + 96, ws);
+
neon_static_x8_i(data + 128, 32, ws1);
neon_static_x8_i(data + 192, 32, ws1);
neon_static_x8_i(data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
- neon_static_x4_i(data , 16, ws);
- neon_static_x4_i(data + 64, 16, ws);
- neon_static_x4_i(data + 96, 16, ws);
+ neon_static_x4_i(data , ws);
+ neon_static_x4_i(data + 64, ws);
+ neon_static_x4_i(data + 96, ws);
neon_static_x8_i(data, 64, ws + (p->ws_is[2] << 1));
} else {
@@ -1122,21 +1126,28 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_f_32f(p, dout + N1 + N2, N3);
ffts_static_rec_f_32f(p, dout + N , N2);
ffts_static_rec_f_32f(p, dout + N + N1 , N2);
+
+ neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
} else if (N == 128) {
- const float *ws1 = ws + (p->ws_is[1] << 1);
+ neon_static_x8_f(dout, 32, ws + 8);
+
+ neon_static_x4_f(dout + 64, ws);
+ neon_static_x4_f(dout + 96, ws);
+
+ neon_static_x8_f(dout + 128, 32, ws + 8);
+ neon_static_x8_f(dout + 192, 32, ws + 8);
- neon_static_x8_f(dout , 32, ws1);
- neon_static_x4_f(dout + 64, 16, ws);
- neon_static_x4_f(dout + 96, 16, ws);
- neon_static_x8_f(dout + 128, 32, ws1);
- neon_static_x8_f(dout + 192, 32, ws1);
+ neon_static_x8_t_f(dout, 128, ws + 80);
} else if (N == 64) {
- neon_static_x4_f(dout , 16, ws);
- neon_static_x4_f(dout + 64, 16, ws);
- neon_static_x4_f(dout + 96, 16, ws);
- }
+ neon_static_x4_f(dout , ws);
+ neon_static_x4_f(dout + 64, ws);
+ neon_static_x4_f(dout + 96, ws);
- neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
+ neon_static_x8_t_f(dout, 64, ws + 32);
+ } else {
+ assert(N == 32);
+ neon_static_x8_t_f(dout, 32, ws + 8);
+ }
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 0);
@@ -1176,21 +1187,28 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_i_32f(p, dout + N1 + N2, N3);
ffts_static_rec_i_32f(p, dout + N , N2);
ffts_static_rec_i_32f(p, dout + N + N1 , N2);
+
+ neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
} else if (N == 128) {
- const float *ws1 = ws + (p->ws_is[1] << 1);
+ neon_static_x8_i(dout, 32, ws + 8);
+
+ neon_static_x4_i(dout + 64, ws);
+ neon_static_x4_i(dout + 96, ws);
- neon_static_x8_i(dout , 32, ws1);
- neon_static_x4_i(dout + 64, 16, ws);
- neon_static_x4_i(dout + 96, 16, ws);
- neon_static_x8_i(dout + 128, 32, ws1);
- neon_static_x8_i(dout + 192, 32, ws1);
+ neon_static_x8_i(dout + 128, 32, ws + 8);
+ neon_static_x8_i(dout + 192, 32, ws + 8);
+
+ neon_static_x8_t_i(dout, 128, ws + 80);
} else if (N == 64) {
- neon_static_x4_i(dout , 16, ws);
- neon_static_x4_i(dout + 64, 16, ws);
- neon_static_x4_i(dout + 96, 16, ws);
- }
+ neon_static_x4_i(dout , ws);
+ neon_static_x4_i(dout + 64, ws);
+ neon_static_x4_i(dout + 96, ws);
- neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
+ neon_static_x8_t_i(dout, 64, ws + 32);
+ } else {
+ assert(N == 32);
+ neon_static_x8_t_i(dout, 32, ws + 8);
+ }
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 1);
OpenPOWER on IntegriCloud