From 10d4d45b19639c2e5ee9b9289b262285954969c6 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Mon, 14 Mar 2016 14:54:43 +0200 Subject: Unroll to minimize recursive function call depth (overhead) --- src/ffts_static.c | 136 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 45 deletions(-) diff --git a/src/ffts_static.c b/src/ffts_static.c index 483b5e2..bf52732 100644 --- a/src/ffts_static.c +++ b/src/ffts_static.c @@ -952,7 +952,7 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N) const float *ws = (const float*) p->ws; #if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) - if (N > 16) { + if (N > 128) { const size_t N1 = N >> 1; const size_t N2 = N >> 2; const size_t N3 = N >> 3; @@ -964,8 +964,27 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N) ffts_static_rec_f_32f(p, data + N + N1 , N2); neon_static_x8_f(data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); - } else if (N == 16) { - neon_static_x4_f(data, N, ws); + } else if (N == 128) { + const float *ws1 = ws + (p->ws_is[1] << 1); + + neon_static_x8_f(data , 32, ws1); + neon_static_x4_f(data + 64, 16, ws); + neon_static_x4_f(data + 96, 16, ws); + neon_static_x8_f(data + 128, 32, ws1); + neon_static_x8_f(data + 192, 32, ws1); + + neon_static_x8_f(data, 128, ws + (p->ws_is[3] << 1)); + } else if (N == 64) { + neon_static_x4_f(data , 16, ws); + neon_static_x4_f(data + 64, 16, ws); + neon_static_x4_f(data + 96, 16, ws); + + neon_static_x8_f(data, 64, ws + (p->ws_is[2] << 1)); + } else if (N == 32) { + neon_static_x8_f(data, 32, ws + (p->ws_is[1] << 1)); + } else { + assert(N == 16); + neon_static_x4_f(data, 16, ws); } #else if (N > 128) { @@ -983,26 +1002,24 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N) } else if (N == 128) { const float *ws1 = ws + (p->ws_is[1] << 1); - V4SF_X_8(0, data + 0, 32, ws1); + V4SF_X_8(0, data + 0, 32, ws1); + V4SF_X_4(0, data + 64, 16, ws); + V4SF_X_4(0, data + 96, 16, ws); + V4SF_X_8(0, data + 128, 32, ws1); + V4SF_X_8(0, data + 192, 32, ws1); - V4SF_X_4(0, data + 64, 16, ws); - V4SF_X_4(0, data + 96, 16, ws); - - V4SF_X_8(0, data + 128, 32, ws1); - V4SF_X_8(0, data + 192, 32, ws1); - - V4SF_X_8(0, data, N, ws + (p->ws_is[3] << 1)); + V4SF_X_8(0, data, 128, ws + (p->ws_is[3] << 1)); } else if (N == 64) { V4SF_X_4(0, data + 0, 16, ws); V4SF_X_4(0, data + 64, 16, ws); V4SF_X_4(0, data + 96, 16, ws); - V4SF_X_8(0, data + 0, N, ws + (p->ws_is[2] << 1)); + V4SF_X_8(0, data, 64, ws + (p->ws_is[2] << 1)); } else if (N == 32) { - V4SF_X_8(0, data, N, ws + (p->ws_is[1] << 1)); + V4SF_X_8(0, data, 32, ws + (p->ws_is[1] << 1)); } else { assert(N == 16); - V4SF_X_4(0, data, N, ws); + V4SF_X_4(0, data, 16, ws); } #endif } @@ -1013,7 +1030,7 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N) const float *ws = (const float*) p->ws; #if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) - if (N > 16) { + if (N > 128) { const size_t N1 = N >> 1; const size_t N2 = N >> 2; const size_t N3 = N >> 3; @@ -1025,8 +1042,27 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N) ffts_static_rec_i_32f(p, data + N + N1 , N2); neon_static_x8_i(data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); - } else if (N == 16) { - neon_static_x4_i(data, N, ws); + } else if (N == 128) { + const float *ws1 = ws + (p->ws_is[1] << 1); + + neon_static_x8_i(data , 32, ws1); + neon_static_x4_i(data + 64, 16, ws); + neon_static_x4_i(data + 96, 16, ws); + neon_static_x8_i(data + 128, 32, ws1); + neon_static_x8_i(data + 192, 32, ws1); + + neon_static_x8_i(data, 128, ws + (p->ws_is[3] << 1)); + } else if (N == 64) { + neon_static_x4_i(data , 16, ws); + neon_static_x4_i(data + 64, 16, ws); + neon_static_x4_i(data + 96, 16, ws); + + neon_static_x8_i(data, 64, ws + (p->ws_is[2] << 1)); + } else if (N == 32) { + neon_static_x8_i(data, 32, ws + (p->ws_is[1] << 1)); + } else { + assert(N == 16); + neon_static_x4_i(data, 16, ws); } #else if (N > 128) { @@ -1045,25 +1081,23 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N) const float *ws1 = ws + (p->ws_is[1] << 1); V4SF_X_8(1, data + 0, 32, ws1); - V4SF_X_4(1, data + 64, 16, ws); V4SF_X_4(1, data + 96, 16, ws); - V4SF_X_8(1, data + 128, 32, ws1); V4SF_X_8(1, data + 192, 32, ws1); - V4SF_X_8(1, data, N, ws + (p->ws_is[3] << 1)); + V4SF_X_8(1, data, 128, ws + (p->ws_is[3] << 1)); } else if (N == 64) { V4SF_X_4(1, data + 0, 16, ws); V4SF_X_4(1, data + 64, 16, ws); V4SF_X_4(1, data + 96, 16, ws); - V4SF_X_8(1, data + 0, N, ws + (p->ws_is[2] << 1)); + V4SF_X_8(1, data, 64, ws + (p->ws_is[2] << 1)); } else if (N == 32) { - V4SF_X_8(1, data, N, ws + (p->ws_is[1] << 1)); + V4SF_X_8(1, data, 32, ws + (p->ws_is[1] << 1)); } else { assert(N == 16); - V4SF_X_4(1, data, N, ws); + V4SF_X_4(1, data, 16, ws); } #endif } @@ -1078,11 +1112,7 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) const int N_log_2 = ffts_ctzl(N); #if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) - const size_t N1 = N >> 1; - const size_t N2 = N >> 2; - const size_t N3 = N >> 3; - - const float *ws = ((const float*) p->ws) + (p->ws_is[N_log_2 - 4] << 1); + const float *ws = (const float*) p->ws; if (N_log_2 & 1) { neon_static_o_f(p, din, dout); @@ -1090,13 +1120,23 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) neon_static_e_f(p, din, dout); } - ffts_static_rec_f_32f(p, dout , N2); - ffts_static_rec_f_32f(p, dout + N1 , N3); - ffts_static_rec_f_32f(p, dout + N1 + N2, N3); - ffts_static_rec_f_32f(p, dout + N , N2); - ffts_static_rec_f_32f(p, dout + N + N1 , N2); + if (N > 64) { + const size_t N1 = N >> 1; + const size_t N2 = N >> 2; + const size_t N3 = N >> 3; - neon_static_x8_t_f(dout, N, ws); + ffts_static_rec_f_32f(p, dout , N2); + ffts_static_rec_f_32f(p, dout + N1 , N3); + ffts_static_rec_f_32f(p, dout + N1 + N2, N3); + ffts_static_rec_f_32f(p, dout + N , N2); + ffts_static_rec_f_32f(p, dout + N + N1 , N2); + } else if (N == 64) { + neon_static_x4_f(dout , 16, ws); + neon_static_x4_f(dout + 64, 16, ws); + neon_static_x4_f(dout + 96, 16, ws); + } + + neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); #else if (N_log_2 & 1) { ffts_static_firstpass_odd_32f(dout, din, p, 0); @@ -1118,11 +1158,7 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) const int N_log_2 = ffts_ctzl(N); #if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED) - const size_t N1 = N >> 1; - const size_t N2 = N >> 2; - const size_t N3 = N >> 3; - - const float *ws = ((const float*) p->ws) + (p->ws_is[N_log_2 - 4] << 1); + const float *ws = (const float*) p->ws; if (N_log_2 & 1) { neon_static_o_i(p, din, dout); @@ -1130,13 +1166,23 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) neon_static_e_i(p, din, dout); } - ffts_static_rec_i_32f(p, dout , N2); - ffts_static_rec_i_32f(p, dout + N1 , N3); - ffts_static_rec_i_32f(p, dout + N1 + N2, N3); - ffts_static_rec_i_32f(p, dout + N , N2); - ffts_static_rec_i_32f(p, dout + N + N1 , N2); + if (N > 64) { + const size_t N1 = N >> 1; + const size_t N2 = N >> 2; + const size_t N3 = N >> 3; + + ffts_static_rec_i_32f(p, dout , N2); + ffts_static_rec_i_32f(p, dout + N1 , N3); + ffts_static_rec_i_32f(p, dout + N1 + N2, N3); + ffts_static_rec_i_32f(p, dout + N , N2); + ffts_static_rec_i_32f(p, dout + N + N1 , N2); + } else if (N == 64) { + neon_static_x4_i(dout , 16, ws); + neon_static_x4_i(dout + 64, 16, ws); + neon_static_x4_i(dout + 96, 16, ws); + } - neon_static_x8_t_i(dout, N, ws); + neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); #else if (N_log_2 & 1) { ffts_static_firstpass_odd_32f(dout, din, p, 1); -- cgit v1.1