author     Jukka Ojanen <jukka.ojanen@linkotec.net>  2016-03-15 11:08:23 +0200
committer  Jukka Ojanen <jukka.ojanen@linkotec.net>  2016-03-15 11:08:23 +0200
commit     f64d89a9385e5981a3e175a205ee3fdf69773e61 (patch)
tree       7e5045c78afec2da80b70fde3bc58e22a4484024
parent     85a7167fdb139dc249330d34120080109878ea8f (diff)
download   ffts-f64d89a9385e5981a3e175a205ee3fdf69773e61.zip
           ffts-f64d89a9385e5981a3e175a205ee3fdf69773e61.tar.gz
neon_static_x4_f and neon_static_x4_i don't use the second argument passed to them; drop it, and reschedule instructions for possible dual issue
-rw-r--r--  src/ffts_static.c    86
-rw-r--r--  src/neon.h            4
-rw-r--r--  src/neon_static_f.s  88
-rw-r--r--  src/neon_static_i.s  88

4 files changed, 140 insertions(+), 126 deletions(-)
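
For orientation, a minimal sketch of the interface change (prototypes follow the src/neon.h hunk below, call shapes follow the src/ffts_static.c hunks; the _old suffix is hypothetical, added here only to show before and after side by side). Every call site passed the constant 16 as the second argument and the kernel never read it, so the argument can be dropped and the block layout hardcoded:

    #include <stddef.h>

    /* Before: the radix-4 kernel carried a size argument it never used. */
    void neon_static_x4_f_old(float *data, size_t N, const float *ws);

    /* After: only the data block and the twiddle pointer remain. */
    void neon_static_x4_f(float *data, const float *ws);

    static void example(float *data, const float *ws)
    {
        neon_static_x4_f_old(data + 64, 16, ws); /* old call shape */
        neon_static_x4_f(data + 64, ws);         /* new call shape */
    }

The second half of the commit message refers to the kernel bodies in the .s files, where vld1.32 loads are now interleaved with vmul.f32 instructions so that a load and an arithmetic op can potentially dual-issue, instead of the pipeline stalling behind a block of back-to-back loads.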
diff --git a/src/ffts_static.c b/src/ffts_static.c
index 00cc96f..e1b2f6b 100644
--- a/src/ffts_static.c
+++ b/src/ffts_static.c
@@ -967,17 +967,19 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
} else if (N == 128) {
const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_f(data , 32, ws1);
- neon_static_x4_f(data + 64, 16, ws);
- neon_static_x4_f(data + 96, 16, ws);
+ neon_static_x8_f(data, 32, ws1);
+
+ neon_static_x4_f(data + 64, ws);
+ neon_static_x4_f(data + 96, ws);
+
neon_static_x8_f(data + 128, 32, ws1);
neon_static_x8_f(data + 192, 32, ws1);
neon_static_x8_f(data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
- neon_static_x4_f(data , 16, ws);
- neon_static_x4_f(data + 64, 16, ws);
- neon_static_x4_f(data + 96, 16, ws);
+ neon_static_x4_f(data , ws);
+ neon_static_x4_f(data + 64, ws);
+ neon_static_x4_f(data + 96, ws);
neon_static_x8_f(data, 64, ws + (p->ws_is[2] << 1));
} else {
@@ -1041,17 +1043,19 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
} else if (N == 128) {
const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_i(data , 32, ws1);
- neon_static_x4_i(data + 64, 16, ws);
- neon_static_x4_i(data + 96, 16, ws);
+ neon_static_x8_i(data, 32, ws1);
+
+ neon_static_x4_i(data + 64, ws);
+ neon_static_x4_i(data + 96, ws);
+
neon_static_x8_i(data + 128, 32, ws1);
neon_static_x8_i(data + 192, 32, ws1);
neon_static_x8_i(data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
- neon_static_x4_i(data , 16, ws);
- neon_static_x4_i(data + 64, 16, ws);
- neon_static_x4_i(data + 96, 16, ws);
+ neon_static_x4_i(data , ws);
+ neon_static_x4_i(data + 64, ws);
+ neon_static_x4_i(data + 96, ws);
neon_static_x8_i(data, 64, ws + (p->ws_is[2] << 1));
} else {
@@ -1122,21 +1126,28 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_f_32f(p, dout + N1 + N2, N3);
ffts_static_rec_f_32f(p, dout + N , N2);
ffts_static_rec_f_32f(p, dout + N + N1 , N2);
+
+ neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
} else if (N == 128) {
- const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_f(dout , 32, ws1);
- neon_static_x4_f(dout + 64, 16, ws);
- neon_static_x4_f(dout + 96, 16, ws);
- neon_static_x8_f(dout + 128, 32, ws1);
- neon_static_x8_f(dout + 192, 32, ws1);
+ neon_static_x8_f(dout, 32, ws + 8);
+
+ neon_static_x4_f(dout + 64, ws);
+ neon_static_x4_f(dout + 96, ws);
+
+ neon_static_x8_f(dout + 128, 32, ws + 8);
+ neon_static_x8_f(dout + 192, 32, ws + 8);
+
+ neon_static_x8_t_f(dout, 128, ws + 80);
} else if (N == 64) {
- neon_static_x4_f(dout , 16, ws);
- neon_static_x4_f(dout + 64, 16, ws);
- neon_static_x4_f(dout + 96, 16, ws);
- }
- neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
+ neon_static_x4_f(dout , ws);
+ neon_static_x4_f(dout + 64, ws);
+ neon_static_x4_f(dout + 96, ws);
+ neon_static_x8_t_f(dout, 64, ws + 32);
+ } else {
+ assert(N == 32);
+ neon_static_x8_t_f(dout, 32, ws + 8);
+ }
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 0);
@@ -1176,21 +1187,28 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_i_32f(p, dout + N1 + N2, N3);
ffts_static_rec_i_32f(p, dout + N , N2);
ffts_static_rec_i_32f(p, dout + N + N1 , N2);
+
+ neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
} else if (N == 128) {
- const float *ws1 = ws + (p->ws_is[1] << 1);
- neon_static_x8_i(dout , 32, ws1);
- neon_static_x4_i(dout + 64, 16, ws);
- neon_static_x4_i(dout + 96, 16, ws);
- neon_static_x8_i(dout + 128, 32, ws1);
- neon_static_x8_i(dout + 192, 32, ws1);
+ neon_static_x8_i(dout, 32, ws + 8);
+
+ neon_static_x4_i(dout + 64, ws);
+ neon_static_x4_i(dout + 96, ws);
+
+ neon_static_x8_i(dout + 128, 32, ws + 8);
+ neon_static_x8_i(dout + 192, 32, ws + 8);
+
+ neon_static_x8_t_i(dout, 128, ws + 80);
} else if (N == 64) {
- neon_static_x4_i(dout , 16, ws);
- neon_static_x4_i(dout + 64, 16, ws);
- neon_static_x4_i(dout + 96, 16, ws);
- }
- neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1));
+ neon_static_x4_i(dout , ws);
+ neon_static_x4_i(dout + 64, ws);
+ neon_static_x4_i(dout + 96, ws);
+ neon_static_x8_t_i(dout, 64, ws + 32);
+ } else {
+ assert(N == 32);
+ neon_static_x8_t_i(dout, 32, ws + 8);
+ }
#else
if (N_log_2 & 1) {
ffts_static_firstpass_odd_32f(dout, din, p, 1);
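
A note on the literal twiddle offsets introduced above (ws + 8, ws + 32, ws + 80): they come from folding the p->ws_is lookups, which are constant for these fixed transform sizes. Equating the deleted and added pointer expressions implies ws_is[1] == 4, ws_is[2] == 16 and ws_is[3] == 40. These values are inferred from the diff itself, not quoted from the ffts sources; a hypothetical sanity check under that assumption:

    #include <assert.h>

    /* ws_is values assumed from the old/new equivalence; index 0 unused. */
    static void check_folded_offsets(const float *ws)
    {
        const int ws_is[4] = { 0, 4, 16, 40 };

        assert(ws + (ws_is[1] << 1) == ws + 8);  /* ws1 in the N == 128 branches */
        assert(ws + (ws_is[2] << 1) == ws + 32); /* final x8_t pass for N == 64  */
        assert(ws + (ws_is[3] << 1) == ws + 80); /* final x8_t pass for N == 128 */
    }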
diff --git a/src/neon.h b/src/neon.h
index b40623b..66dcd4b 100644
--- a/src/neon.h
+++ b/src/neon.h
@@ -50,13 +50,13 @@ void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w);
void neon_static_e_f(ffts_plan_t*, const void*, void*);
void neon_static_o_f(ffts_plan_t*, const void*, void*);
-void neon_static_x4_f(float*, size_t, const float*);
+void neon_static_x4_f(float*, const float*);
void neon_static_x8_f(float*, size_t, const float*);
void neon_static_x8_t_f(float*, size_t, const float*);
void neon_static_e_i(ffts_plan_t*, const void*, void*);
void neon_static_o_i(ffts_plan_t*, const void*, void*);
-void neon_static_x4_i(float*, size_t, const float*);
+void neon_static_x4_i(float*, const float*);
void neon_static_x8_i(float*, size_t, const float*);
void neon_static_x8_t_i(float*, size_t, const float*);
diff --git a/src/neon_static_f.s b/src/neon_static_f.s
index bb0d717..e1e9a4a 100644
--- a/src/neon_static_f.s
+++ b/src/neon_static_f.s
@@ -652,59 +652,57 @@ _neon_ee_o_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
- .align 4
+ .align 4
#ifdef __APPLE__
- .globl _neon_static_x4_f
+ .globl _neon_static_x4_f
_neon_static_x4_f:
#else
- .globl neon_static_x4_f
+ .globl neon_static_x4_f
neon_static_x4_f:
#endif
-@ add r3, r0, #0
- push {r4, r5, r6, lr}
- vstmdb sp!, {d8-d15}
+ add r3, r0, #64
+ vpush {q4-q7}
- vld1.32 {q8,q9}, [r0, :128]
- add r4, r0, r1, lsl #1
- vld1.32 {q10,q11}, [r4, :128]
- add r5, r0, r1, lsl #2
- vld1.32 {q12,q13}, [r5, :128]
- add r6, r4, r1, lsl #2
- vld1.32 {q14,q15}, [r6, :128]
- vld1.32 {q2,q3}, [r2, :128]
-
- vmul.f32 q0, q13, q3
- vmul.f32 q5, q12, q2
- vmul.f32 q1, q14, q2
- vmul.f32 q4, q14, q3
- vmul.f32 q14, q12, q3
- vmul.f32 q13, q13, q2
- vmul.f32 q12, q15, q3
- vmul.f32 q2, q15, q2
- vsub.f32 q0, q5, q0
- vadd.f32 q13, q13, q14
- vadd.f32 q12, q12, q1
- vsub.f32 q1, q2, q4
- vadd.f32 q15, q0, q12
- vsub.f32 q12, q0, q12
- vadd.f32 q14, q13, q1
- vsub.f32 q13, q13, q1
- vadd.f32 q0, q8, q15
- vadd.f32 q1, q9, q14
- vadd.f32 q2, q10, q13 @
- vsub.f32 q4, q8, q15
- vsub.f32 q3, q11, q12 @
- vst1.32 {q0,q1}, [r0, :128]
- vsub.f32 q5, q9, q14
- vsub.f32 q6, q10, q13 @
- vadd.f32 q7, q11, q12 @
- vst1.32 {q2,q3}, [r4, :128]
- vst1.32 {q4,q5}, [r5, :128]
- vst1.32 {q6,q7}, [r6, :128]
- vldmia sp!, {d8-d15}
- pop {r4, r5, r6, pc}
+ vld1.32 {q2, q3}, [r1, :128]
+ vld1.32 {q12, q13}, [r3, :128]!
+ mov r2, r0
+ vmul.f32 q0, q13, q3
+ vld1.32 {q14, q15}, [r3, :128]
+ vmul.f32 q5, q12, q2
+ vld1.32 {q8, q9}, [r0, :128]!
+ vmul.f32 q1, q14, q2
+ vld1.32 {q10, q11}, [r0, :128]
+ vmul.f32 q4, q14, q3
+ vmul.f32 q14, q12, q3
+ vmul.f32 q13, q13, q2
+ vmul.f32 q12, q15, q3
+ vmul.f32 q2, q15, q2
+ vsub.f32 q0, q5, q0
+ vadd.f32 q13, q13, q14
+ vadd.f32 q12, q12, q1
+ vsub.f32 q1, q2, q4
+ vadd.f32 q15, q0, q12
+ vsub.f32 q12, q0, q12
+ vadd.f32 q14, q13, q1
+ vsub.f32 q13, q13, q1
+ vadd.f32 q0, q8, q15
+ vadd.f32 q1, q9, q14
+ vadd.f32 q2, q10, q13
+ vsub.f32 q4, q8, q15
+ vsub.f32 q3, q11, q12
+
+ vst1.32 {q0, q1}, [r2, :128]!
+
+ vsub.f32 q5, q9, q14
+ vsub.f32 q6, q10, q13
+ vadd.f32 q7, q11, q12
+ vst1.32 {q2, q3}, [r2, :128]!
+ vst1.32 {q4, q5}, [r2, :128]!
+ vst1.32 {q6, q7}, [r2, :128]
+ vpop {q4-q7}
+ bx lr
.align 4
#ifdef __APPLE__
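
To make the rescheduled kernel easier to review, here is a hedged C model of what the two-argument neon_static_x4_f computes, reconstructed by hand from the assembly above. The register-to-array mapping and the row layout (four reals followed by four imaginaries per row) are inferences, so treat this as a reading aid rather than shipped ffts code. The kernel applies a radix-4 butterfly across four rows of four complex values, multiplying row 2 by the twiddles and row 3 by their conjugates:

    /* Sketch of the kernel's dataflow; layout assumptions noted above. */
    static void x4_f_model(float *data, const float *ws)
    {
        float *r0 = data, *r1 = data + 8, *r2 = data + 16, *r3 = data + 24;
        const float *wre = ws, *wim = ws + 4;
        int k;

        for (k = 0; k < 4; k++) {
            /* row2 * w and row3 * conj(w): the vmul/vadd/vsub chain */
            float t_re = r2[k] * wre[k] - r2[k + 4] * wim[k];
            float t_im = r2[k + 4] * wre[k] + r2[k] * wim[k];
            float u_re = r3[k] * wre[k] + r3[k + 4] * wim[k];
            float u_im = r3[k + 4] * wre[k] - r3[k] * wim[k];

            float sum_re = t_re + u_re, sum_im = t_im + u_im;
            float dif_re = t_re - u_re, dif_im = t_im - u_im;

            float a_re = r0[k], a_im = r0[k + 4];
            float b_re = r1[k], b_im = r1[k + 4];

            r0[k] = a_re + sum_re;  r0[k + 4] = a_im + sum_im;
            r1[k] = b_re + dif_im;  r1[k + 4] = b_im - dif_re;
            r2[k] = a_re - sum_re;  r2[k + 4] = a_im - sum_im;
            r3[k] = b_re - dif_im;  r3[k + 4] = b_im + dif_re;
        }
    }

Note how the new code also drops the stack traffic: with the size argument gone there is no address arithmetic through r4-r6, so the callee-saved integer registers, and with them the push/pop pair, disappear entirely (vpush {q4-q7} plus bx lr replace push/vstmdb/vldmia/pop).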
diff --git a/src/neon_static_i.s b/src/neon_static_i.s
index 5edc908..d8f8d9c 100644
--- a/src/neon_static_i.s
+++ b/src/neon_static_i.s
@@ -651,59 +651,57 @@ _neon_ee_o_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
- .align 4
+ .align 4
#ifdef __APPLE__
- .globl _neon_static_x4_i
+ .globl _neon_static_x4_i
_neon_static_x4_i:
#else
- .globl neon_static_x4_i
+ .globl neon_static_x4_i
neon_static_x4_i:
#endif
-@ add r3, r0, #0
- push {r4, r5, r6, lr}
- vstmdb sp!, {d8-d15}
+ add r3, r0, #64
+ vpush {q4-q7}
- vld1.32 {q8,q9}, [r0, :128]
- add r4, r0, r1, lsl #1
- vld1.32 {q10,q11}, [r4, :128]
- add r5, r0, r1, lsl #2
- vld1.32 {q12,q13}, [r5, :128]
- add r6, r4, r1, lsl #2
- vld1.32 {q14,q15}, [r6, :128]
- vld1.32 {q2,q3}, [r2, :128]
-
- vmul.f32 q0, q13, q3
- vmul.f32 q5, q12, q2
- vmul.f32 q1, q14, q2
- vmul.f32 q4, q14, q3
- vmul.f32 q14, q12, q3
- vmul.f32 q13, q13, q2
- vmul.f32 q12, q15, q3
- vmul.f32 q2, q15, q2
- vsub.f32 q0, q5, q0
- vadd.f32 q13, q13, q14
- vadd.f32 q12, q12, q1
- vsub.f32 q1, q2, q4
- vadd.f32 q15, q0, q12
- vsub.f32 q12, q0, q12
- vadd.f32 q14, q13, q1
- vsub.f32 q13, q13, q1
- vadd.f32 q0, q8, q15
- vadd.f32 q1, q9, q14
- vsub.f32 q2, q10, q13 @
- vsub.f32 q4, q8, q15
- vadd.f32 q3, q11, q12 @
- vst1.32 {q0,q1}, [r0, :128]
- vsub.f32 q5, q9, q14
- vadd.f32 q6, q10, q13 @
- vsub.f32 q7, q11, q12 @
- vst1.32 {q2,q3}, [r4, :128]
- vst1.32 {q4,q5}, [r5, :128]
- vst1.32 {q6,q7}, [r6, :128]
- vldmia sp!, {d8-d15}
- pop {r4, r5, r6, pc}
+ vld1.32 {q2, q3}, [r1, :128]
+ vld1.32 {q12, q13}, [r3, :128]!
+ mov r2, r0
+ vmul.f32 q0, q13, q3
+ vld1.32 {q14, q15}, [r3, :128]
+ vmul.f32 q5, q12, q2
+ vld1.32 {q8, q9}, [r0, :128]!
+ vmul.f32 q1, q14, q2
+ vld1.32 {q10, q11}, [r0, :128]
+ vmul.f32 q4, q14, q3
+ vmul.f32 q14, q12, q3
+ vmul.f32 q13, q13, q2
+ vmul.f32 q12, q15, q3
+ vmul.f32 q2, q15, q2
+ vsub.f32 q0, q5, q0
+ vadd.f32 q13, q13, q14
+ vadd.f32 q12, q12, q1
+ vsub.f32 q1, q2, q4
+ vadd.f32 q15, q0, q12
+ vsub.f32 q12, q0, q12
+ vadd.f32 q14, q13, q1
+ vsub.f32 q13, q13, q1
+ vadd.f32 q0, q8, q15
+ vadd.f32 q1, q9, q14
+ vsub.f32 q2, q10, q13
+ vsub.f32 q4, q8, q15
+ vadd.f32 q3, q11, q12
+
+ vst1.32 {q0, q1}, [r2, :128]!
+
+ vsub.f32 q5, q9, q14
+ vadd.f32 q6, q10, q13
+ vsub.f32 q7, q11, q12
+ vst1.32 {q2, q3}, [r2, :128]!
+ vst1.32 {q4, q5}, [r2, :128]!
+ vst1.32 {q6, q7}, [r2, :128]
+ vpop {q4-q7}
+ bx lr
.align 4
#ifdef __APPLE__
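
Relative to the forward model sketched after the neon_static_f.s hunk, the inverse kernel above differs only in the two odd output rows: the vadd/vsub pairs writing q2/q3 and q6/q7 swap, flipping the sign with which the twiddled difference enters rows 1 and 3. In the same hypothetical notation:

    /* Inverse variant of the odd-row combination (sketch; b_* are the
       row-1 inputs, dif_* the twiddled difference, k the column index). */
    static void combine_odd_rows_i(float *r1, float *r3, int k,
                                   float b_re, float b_im,
                                   float dif_re, float dif_im)
    {
        r1[k] = b_re - dif_im;  r1[k + 4] = b_im + dif_re;  /* forward: +, - */
        r3[k] = b_re + dif_im;  r3[k + 4] = b_im - dif_re;  /* forward: -, + */
    }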