summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Jukka Ojanen <jukka.ojanen@linkotec.net> 2016-03-11 14:32:22 +0200
committer Jukka Ojanen <jukka.ojanen@linkotec.net> 2016-03-11 14:32:22 +0200
commit e667ca5e4304b31cd7093eaead481b032092b985 (patch)
tree df057e6fa4502d1924eddb9bf496e5e9d338a417
parent 2051c214d591be08e40fdba623ccefabbba11b29 (diff)
download ffts-e667ca5e4304b31cd7093eaead481b032092b985.zip
         ffts-e667ca5e4304b31cd7093eaead481b032092b985.tar.gz
Restore ARM NEON optimized recursive version
-rw-r--r-- src/ffts.c 14
-rw-r--r-- src/ffts_static.c 84
2 files changed, 85 insertions, 13 deletions
diff --git a/src/ffts.c b/src/ffts.c
index a22a1c8..5d72a52 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -55,7 +55,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
-#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
+#if defined(HAVE_NEON)
static const FFTS_ALIGN(64) float w_data[16] = {
0.70710678118654757273731092936941f,
0.70710678118654746171500846685376f,
@@ -227,7 +227,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
if (n_luts) {
size_t lut_size;
-#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
+#if defined(__arm__) && !defined(HAVE_NEON)
lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f) / 2;
#else
lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f);
@@ -272,7 +272,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
w0[j][1] = tmp[j * stride][1];
}
-#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
+#if defined(__arm__)
#ifdef HAVE_NEON
for (j = 0; j < n/4; j += 4) {
V4SF2 temp0 = V4SF2_LD(fw0 + j*2);
@@ -323,7 +323,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
w2[j][1] = tmp[(j + (n/8)) * stride][1];
}
-#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
+#if defined(__arm__)
#ifdef HAVE_NEON
for (j = 0; j < n/8; j += 4) {
V4SF2 temp0, temp1, temp2;
@@ -389,11 +389,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
stride >>= 1;
}
-#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
+#if defined(HAVE_NEON)
if (sign < 0) {
- p->oe_ws = (void*)(&w_data[4]);
+ p->oe_ws = (void*)(w_data + 4);
p->ee_ws = (void*)(w_data);
- p->eo_ws = (void*)(&w_data[4]);
+ p->eo_ws = (void*)(w_data + 4);
} else {
p->oe_ws = (void*)(w_data + 12);
p->ee_ws = (void*)(w_data + 8);
diff --git a/src/ffts_static.c b/src/ffts_static.c
index 701cca8..7747de0 100644
--- a/src/ffts_static.c
+++ b/src/ffts_static.c
@@ -36,6 +36,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts_internal.h"
#include "macros.h"
+#if defined(HAVE_NEON)
+#include "neon.h"
+#endif
+
#include <assert.h>
static const FFTS_ALIGN(16) float ffts_constants_small_32f[24] = {
@@ -945,6 +949,28 @@ ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
static void
ffts_static_rec_f_32f(ffts_plan_t *p, float *data, size_t N)
{
+#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
+ if (N > 16) {
+ size_t N1 = N >> 1;
+ size_t N2 = N >> 2;
+ size_t N3 = N >> 3;
+ float *ws = ((float *)(p->ws)) + (p->ws_is[ffts_ctzl(N)-4] << 1);
+
+ ffts_static_rec_f_32f(p, data, N2);
+ ffts_static_rec_f_32f(p, data + N1, N3);
+ ffts_static_rec_f_32f(p, data + N1 + N2, N3);
+ ffts_static_rec_f_32f(p, data + N, N2);
+ ffts_static_rec_f_32f(p, data + N + N1, N2);
+
+ if (N == p->N) {
+ neon_static_x8_t_f(data, N, ws);
+ } else {
+ neon_static_x8_f(data, N, ws);
+ }
+ } else if (N == 16) {
+ neon_static_x4_f(data, N, p->ws);
+ }
+#else
const float *ws = (float*) p->ws;
if (N > 128) {
@@ -983,11 +1009,34 @@ ffts_static_rec_f_32f(ffts_plan_t *p, float *data, size_t N)
assert(N == 16);
V4SF_X_4(0, data, N, ws);
}
+#endif
}
static void
ffts_static_rec_i_32f(ffts_plan_t *p, float *data, size_t N)
{
+#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
+ if (N > 16) {
+ size_t N1 = N >> 1;
+ size_t N2 = N >> 2;
+ size_t N3 = N >> 3;
+ float *ws = ((float *)(p->ws)) + (p->ws_is[ffts_ctzl(N)-4] << 1);
+
+ ffts_static_rec_i_32f(p, data, N2);
+ ffts_static_rec_i_32f(p, data + N1, N3);
+ ffts_static_rec_i_32f(p, data + N1 + N2, N3);
+ ffts_static_rec_i_32f(p, data + N, N2);
+ ffts_static_rec_i_32f(p, data + N + N1, N2);
+
+ if (N == p->N) {
+ neon_static_x8_t_i(data, N, ws);
+ } else {
+ neon_static_x8_i(data, N, ws);
+ }
+ } else if(N==16) {
+ neon_static_x4_i(data, N, p->ws);
+ }
+#else
float *ws = (float*) p->ws;
if (N > 128) {
@@ -1026,28 +1075,51 @@ ffts_static_rec_i_32f(ffts_plan_t *p, float *data, size_t N)
assert(N == 16);
V4SF_X_4(1, data, N, ws);
}
+#endif
}
void
ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
{
+ const float *din = (const float*) in;
+ float *dout = (float*) out;
+
+#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
if (ffts_ctzl(p->N) & 1) {
- ffts_static_firstpass_odd_32f((float*) out, (const float*) in, p, 0);
+ neon_static_o_f(p, din, dout);
} else {
- ffts_static_firstpass_even_32f((float*) out, (const float*) in, p, 0);
+ neon_static_e_f(p, din, dout);
}
+#else
+ if (ffts_ctzl(p->N) & 1) {
+ ffts_static_firstpass_odd_32f(dout, din, p, 0);
+ } else {
+ ffts_static_firstpass_even_32f(dout, din, p, 0);
+ }
+#endif
- ffts_static_rec_f_32f(p, (float*) out, p->N);
+ ffts_static_rec_f_32f(p, dout, p->N);
}
void
ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
{
+ const float *din = (const float*) in;
+ float *dout = (float*) out;
+
+#if defined(HAVE_NEON) && defined(DYNAMIC_DISABLED)
+ if (ffts_ctzl(p->N) & 1) {
+ neon_static_o_i(p, din, dout);
+ } else {
+ neon_static_e_i(p, din, dout);
+ }
+#else
if (ffts_ctzl(p->N) & 1) {
- ffts_static_firstpass_odd_32f((float*) out, (const float*) in, p, 1);
+ ffts_static_firstpass_odd_32f(dout, din, p, 1);
} else {
- ffts_static_firstpass_even_32f((float*) out, (const float*) in, p, 1);
+ ffts_static_firstpass_even_32f(dout, din, p, 1);
}
+#endif
- ffts_static_rec_i_32f(p, (float*) out, p->N);
+ ffts_static_rec_i_32f(p, dout, p->N);
}
\ No newline at end of file
OpenPOWER on IntegriCloud