summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Jukka Ojanen <jukka.ojanen@linkotec.net> 2016-03-14 14:54:43 +0200
committer Jukka Ojanen <jukka.ojanen@linkotec.net> 2016-03-14 14:54:43 +0200
commit 10d4d45b19639c2e5ee9b9289b262285954969c6 (patch)
tree 9663adfb3c44d4c3b30da3cf73be4c4e2e878b59
parent 61166019c3aa54a26e6e9baeb5af769402e0b616 (diff)
download ffts-10d4d45b19639c2e5ee9b9289b262285954969c6.zip
ffts-10d4d45b19639c2e5ee9b9289b262285954969c6.tar.gz
Unroll to minimize recursive function call depth (overhead)
-rw-r--r-- src/ffts_static.c 136
1 file changed, 91 insertions, 45 deletions
diff --git a/src/ffts_static.c b/src/ffts_static.c
index 483b5e2..bf52732 100644
--- a/src/ffts_static.c
+++ b/src/ffts_static.c
@@ -952,7 +952,7 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
const float *ws = (const float*) p->ws;
#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
- if (N > 16) {
+ if (N > 128) {
const size_t N1 = N >> 1;
const size_t N2 = N >> 2;
const size_t N3 = N >> 3;
@@ -964,8 +964,27 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
ffts_static_rec_f_32f(p, data + N + N1 , N2);
neon_static_x8_f(data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
- } else if (N == 16) {
- neon_static_x4_f(data, N, ws);
+ } else if (N == 128) {
+ const float *ws1 = ws + (p->ws_is[1] << 1);
+
+ neon_static_x8_f(data , 32, ws1);
+ neon_static_x4_f(data + 64, 16, ws);
+ neon_static_x4_f(data + 96, 16, ws);
+ neon_static_x8_f(data + 128, 32, ws1);
+ neon_static_x8_f(data + 192, 32, ws1);
+
+ neon_static_x8_f(data, 128, ws + (p->ws_is[3] << 1));
+ } else if (N == 64) {
+ neon_static_x4_f(data , 16, ws);
+ neon_static_x4_f(data + 64, 16, ws);
+ neon_static_x4_f(data + 96, 16, ws);
+
+ neon_static_x8_f(data, 64, ws + (p->ws_is[2] << 1));
+ } else if (N == 32) {
+ neon_static_x8_f(data, 32, ws + (p->ws_is[1] << 1));
+ } else {
+ assert(N == 16);
+ neon_static_x4_f(data, 16, ws);
}
#else
if (N > 128) {
@@ -983,26 +1002,24 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
} else if (N == 128) {
const float *ws1 = ws + (p->ws_is[1] << 1);
- V4SF_X_8(0, data + 0, 32, ws1);
+ V4SF_X_8(0, data + 0, 32, ws1);
+ V4SF_X_4(0, data + 64, 16, ws);
+ V4SF_X_4(0, data + 96, 16, ws);
+ V4SF_X_8(0, data + 128, 32, ws1);
+ V4SF_X_8(0, data + 192, 32, ws1);
- V4SF_X_4(0, data + 64, 16, ws);
- V4SF_X_4(0, data + 96, 16, ws);
-
- V4SF_X_8(0, data + 128, 32, ws1);
- V4SF_X_8(0, data + 192, 32, ws1);
-
- V4SF_X_8(0, data, N, ws + (p->ws_is[3] << 1));
+ V4SF_X_8(0, data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
V4SF_X_4(0, data + 0, 16, ws);
V4SF_X_4(0, data + 64, 16, ws);
V4SF_X_4(0, data + 96, 16, ws);
- V4SF_X_8(0, data + 0, N, ws + (p->ws_is[2] << 1));
+ V4SF_X_8(0, data, 64, ws + (p->ws_is[2] << 1));
} else if (N == 32) {
- V4SF_X_8(0, data, N, ws + (p->ws_is[1] << 1));
+ V4SF_X_8(0, data, 32, ws + (p->ws_is[1] << 1));
} else {
assert(N == 16);
- V4SF_X_4(0, data, N, ws);
+ V4SF_X_4(0, data, 16, ws);
}
#endif
}
@@ -1013,7 +1030,7 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
const float *ws = (const float*) p->ws;
#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
- if (N > 16) {
+ if (N > 128) {
const size_t N1 = N >> 1;
const size_t N2 = N >> 2;
const size_t N3 = N >> 3;
@@ -1025,8 +1042,27 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
ffts_static_rec_i_32f(p, data + N + N1 , N2);
neon_static_x8_i(data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
- } else if (N == 16) {
- neon_static_x4_i(data, N, ws);
+ } else if (N == 128) {
+ const float *ws1 = ws + (p->ws_is[1] << 1);
+
+ neon_static_x8_i(data , 32, ws1);
+ neon_static_x4_i(data + 64, 16, ws);
+ neon_static_x4_i(data + 96, 16, ws);
+ neon_static_x8_i(data + 128, 32, ws1);
+ neon_static_x8_i(data + 192, 32, ws1);
+
+ neon_static_x8_i(data, 128, ws + (p->ws_is[3] << 1));
+ } else if (N == 64) {
+ neon_static_x4_i(data , 16, ws);
+ neon_static_x4_i(data + 64, 16, ws);
+ neon_static_x4_i(data + 96, 16, ws);
+
+ neon_static_x8_i(data, 64, ws + (p->ws_is[2] << 1));
+ } else if (N == 32) {
+ neon_static_x8_i(data, 32, ws + (p->ws_is[1] << 1));
+ } else {
+ assert(N == 16);
+ neon_static_x4_i(data, 16, ws);
}
#else
if (N > 128) {
@@ -1045,25 +1081,23 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
const float *ws1 = ws + (p->ws_is[1] << 1);
V4SF_X_8(1, data + 0, 32, ws1);
-
V4SF_X_4(1, data + 64, 16, ws);
V4SF_X_4(1, data + 96, 16, ws);
-
V4SF_X_8(1, data + 128, 32, ws1);
V4SF_X_8(1, data + 192, 32, ws1);
- V4SF_X_8(1, data, N, ws + (p->ws_is[3] << 1));
+ V4SF_X_8(1, data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
V4SF_X_4(1, data + 0, 16, ws);
V4SF_X_4(1, data + 64, 16, ws);
V4SF_X_4(1, data + 96, 16, ws);
- V4SF_X_8(1, data + 0, N, ws + (p->ws_is[2] << 1));
+ V4SF_X_8(1, data, 64, ws + (p->ws_is[2] << 1));
} else if (N == 32) {
- V4SF_X_8(1, data, N, ws + (p->ws_is[1] << 1));
+ V4SF_X_8(1, data, 32, ws + (p->ws_is[1] << 1));
} else {
assert(N == 16);
- V4SF_X_4(1, data, N, ws);
+ V4SF_X_4(1, data, 16, ws);
}
#endif
}
@@ -1078,11 +1112,7 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
const int N_log_2 = ffts_ctzl(N);
#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
- const size_t N1 = N >> 1;
- const size_t N2 = N >> 2;
- const size_t N3 = N >> 3;
-
- const float *ws = ((const float*) p->ws) + (p->ws_is[N_log_2 - 4] << 1);
+ const float *ws = (const float*) p->ws;
if (N_log_2 & 1) {
neon_static_o_f(p, din, dout);
@@ -1090,13 +1120,23 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
neon_static_e_f(p, din, dout);
}
- ffts_static_rec_f_32f(p, dout , N2);
- ffts_static_rec_f_32f(p, dout + N1 , N3);
- ffts_static_rec_f_32f(p, dout + N1 + N2, N3);
- ffts_static_rec_f_32f(p, dout + N , N2);
- ffts_static_rec_f_32f(p, dout + N + N1 , N2);
+ if (N > 64) {
+ const size_t N1 = N >> 1;
+ const size_t N2 = N >> 2;
+ const size_t N3 = N >> 3;
- neon_static_x8_t_f(dout, N, ws);
+ ffts_static_rec_f_32f(p, dout , N2);
+ ffts_static_rec_f_32f(p, dout + N1 , N3);
+ ffts_static_rec_f_32f(p, dout + N1 + N2, N3);
+ ffts_static_rec_f_32f(p, dout + N , N2);
+ ffts_static_rec_f_32f(p, dout + N + N1 , N2);
+ } else if (N == 64) {
+ neon_static_x4_f(dout , 16, ws);
+ neon_static_x4_f(dout + 64, 16, ws);
+ neon_static_x4_f(dout + 96, 16, ws);
+ }
+
+ neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 0);
@@ -1118,11 +1158,7 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
const int N_log_2 = ffts_ctzl(N);
#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
- const size_t N1 = N >> 1;
- const size_t N2 = N >> 2;
- const size_t N3 = N >> 3;
-
- const float *ws = ((const float*) p->ws) + (p->ws_is[N_log_2 - 4] << 1);
+ const float *ws = (const float*) p->ws;
if (N_log_2 & 1) {
neon_static_o_i(p, din, dout);
@@ -1130,13 +1166,23 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
neon_static_e_i(p, din, dout);
}
- ffts_static_rec_i_32f(p, dout , N2);
- ffts_static_rec_i_32f(p, dout + N1 , N3);
- ffts_static_rec_i_32f(p, dout + N1 + N2, N3);
- ffts_static_rec_i_32f(p, dout + N , N2);
- ffts_static_rec_i_32f(p, dout + N + N1 , N2);
+ if (N > 64) {
+ const size_t N1 = N >> 1;
+ const size_t N2 = N >> 2;
+ const size_t N3 = N >> 3;
+
+ ffts_static_rec_i_32f(p, dout , N2);
+ ffts_static_rec_i_32f(p, dout + N1 , N3);
+ ffts_static_rec_i_32f(p, dout + N1 + N2, N3);
+ ffts_static_rec_i_32f(p, dout + N , N2);
+ ffts_static_rec_i_32f(p, dout + N + N1 , N2);
+ } else if (N == 64) {
+ neon_static_x4_i(dout , 16, ws);
+ neon_static_x4_i(dout + 64, 16, ws);
+ neon_static_x4_i(dout + 96, 16, ws);
+ }
- neon_static_x8_t_i(dout, N, ws);
+ neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 1);
OpenPOWER on IntegriCloud