From f64d89a9385e5981a3e175a205ee3fdf69773e61 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 15 Mar 2016 11:08:23 +0200 Subject: neon_static_x4_f and neon_static_x4_i don't use the second passed argument, and reschedule instructions for possible dual issue --- src/ffts_static.c | 86 ++++++++++++++++++++++++++++++--------------------- src/neon.h | 4 +-- src/neon_static_f.s | 88 ++++++++++++++++++++++++++--------------------------- src/neon_static_i.s | 88 ++++++++++++++++++++++++++--------------------------- 4 files changed, 140 insertions(+), 126 deletions(-) diff --git a/src/ffts_static.c b/src/ffts_static.c index 00cc96f..e1b2f6b 100644 --- a/src/ffts_static.c +++ b/src/ffts_static.c @@ -967,17 +967,19 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N) } else if (N == 128) { const float *ws1 = ws + (p->ws_is[1] << 1); - neon_static_x8_f(data , 32, ws1); - neon_static_x4_f(data + 64, 16, ws); - neon_static_x4_f(data + 96, 16, ws); + neon_static_x8_f(data, 32, ws1); + + neon_static_x4_f(data + 64, ws); + neon_static_x4_f(data + 96, ws); + neon_static_x8_f(data + 128, 32, ws1); neon_static_x8_f(data + 192, 32, ws1); neon_static_x8_f(data, 128, ws + (p->ws_is[3] << 1)); } else if (N == 64) { - neon_static_x4_f(data , 16, ws); - neon_static_x4_f(data + 64, 16, ws); - neon_static_x4_f(data + 96, 16, ws); + neon_static_x4_f(data , ws); + neon_static_x4_f(data + 64, ws); + neon_static_x4_f(data + 96, ws); neon_static_x8_f(data, 64, ws + (p->ws_is[2] << 1)); } else { @@ -1041,17 +1043,19 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N) } else if (N == 128) { const float *ws1 = ws + (p->ws_is[1] << 1); - neon_static_x8_i(data , 32, ws1); - neon_static_x4_i(data + 64, 16, ws); - neon_static_x4_i(data + 96, 16, ws); + neon_static_x8_i(data, 32, ws1); + + neon_static_x4_i(data + 64, ws); + neon_static_x4_i(data + 96, ws); + neon_static_x8_i(data + 128, 32, ws1); neon_static_x8_i(data + 192, 32, ws1); neon_static_x8_i(data, 128, ws + (p->ws_is[3] << 1)); } else if (N == 64) { - neon_static_x4_i(data , 16, ws); - neon_static_x4_i(data + 64, 16, ws); - neon_static_x4_i(data + 96, 16, ws); + neon_static_x4_i(data , ws); + neon_static_x4_i(data + 64, ws); + neon_static_x4_i(data + 96, ws); neon_static_x8_i(data, 64, ws + (p->ws_is[2] << 1)); } else { @@ -1122,21 +1126,28 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) ffts_static_rec_f_32f(p, dout + N1 + N2, N3); ffts_static_rec_f_32f(p, dout + N , N2); ffts_static_rec_f_32f(p, dout + N + N1 , N2); + + neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); } else if (N == 128) { - const float *ws1 = ws + (p->ws_is[1] << 1); + neon_static_x8_f(dout, 32, ws + 8); + + neon_static_x4_f(dout + 64, ws); + neon_static_x4_f(dout + 96, ws); + + neon_static_x8_f(dout + 128, 32, ws + 8); + neon_static_x8_f(dout + 192, 32, ws + 8); - neon_static_x8_f(dout , 32, ws1); - neon_static_x4_f(dout + 64, 16, ws); - neon_static_x4_f(dout + 96, 16, ws); - neon_static_x8_f(dout + 128, 32, ws1); - neon_static_x8_f(dout + 192, 32, ws1); + neon_static_x8_t_f(dout, 128, ws + 80); } else if (N == 64) { - neon_static_x4_f(dout , 16, ws); - neon_static_x4_f(dout + 64, 16, ws); - neon_static_x4_f(dout + 96, 16, ws); - } + neon_static_x4_f(dout , ws); + neon_static_x4_f(dout + 64, ws); + neon_static_x4_f(dout + 96, ws); - neon_static_x8_t_f(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); + neon_static_x8_t_f(dout, 64, ws + 32); + } else { + assert(N == 32); + neon_static_x8_t_f(dout, 32, ws + 8); + } #else if (N_log_2 & 1) { ffts_static_firstpass_odd_32f(dout, din, p, 0); @@ -1176,21 +1187,28 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) ffts_static_rec_i_32f(p, dout + N1 + N2, N3); ffts_static_rec_i_32f(p, dout + N , N2); ffts_static_rec_i_32f(p, dout + N + N1 , N2); + + neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); } else if (N == 128) { - const float *ws1 = ws + (p->ws_is[1] << 1); + neon_static_x8_i(dout, 32, ws + 8); + + neon_static_x4_i(dout + 64, ws); + neon_static_x4_i(dout + 96, ws); - neon_static_x8_i(dout , 32, ws1); - neon_static_x4_i(dout + 64, 16, ws); - neon_static_x4_i(dout + 96, 16, ws); - neon_static_x8_i(dout + 128, 32, ws1); - neon_static_x8_i(dout + 192, 32, ws1); + neon_static_x8_i(dout + 128, 32, ws + 8); + neon_static_x8_i(dout + 192, 32, ws + 8); + + neon_static_x8_t_i(dout, 128, ws + 80); } else if (N == 64) { - neon_static_x4_i(dout , 16, ws); - neon_static_x4_i(dout + 64, 16, ws); - neon_static_x4_i(dout + 96, 16, ws); - } + neon_static_x4_i(dout , ws); + neon_static_x4_i(dout + 64, ws); + neon_static_x4_i(dout + 96, ws); - neon_static_x8_t_i(dout, N, ws + (p->ws_is[N_log_2 - 4] << 1)); + neon_static_x8_t_i(dout, 64, ws + 32); + } else { + assert(N == 32); + neon_static_x8_t_i(dout, 32, ws + 8); + } #else if (N_log_2 & 1) { ffts_static_firstpass_odd_32f(dout, din, p, 1); diff --git a/src/neon.h b/src/neon.h index b40623b..66dcd4b 100644 --- a/src/neon.h +++ b/src/neon.h @@ -50,13 +50,13 @@ void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w); void neon_static_e_f(ffts_plan_t*, const void*, void*); void neon_static_o_f(ffts_plan_t*, const void*, void*); -void neon_static_x4_f(float*, size_t, const float*); +void neon_static_x4_f(float*, const float*); void neon_static_x8_f(float*, size_t, const float*); void neon_static_x8_t_f(float*, size_t, const float*); void neon_static_e_i(ffts_plan_t*, const void*, void*); void neon_static_o_i(ffts_plan_t*, const void*, void*); -void neon_static_x4_i(float*, size_t, const float*); +void neon_static_x4_i(float*, const float*); void neon_static_x8_i(float*, size_t, const float*); void neon_static_x8_t_i(float*, size_t, const float*); diff --git a/src/neon_static_f.s b/src/neon_static_f.s index bb0d717..e1e9a4a 100644 --- a/src/neon_static_f.s +++ b/src/neon_static_f.s @@ -652,59 +652,57 @@ _neon_ee_o_loop2_exit: vldmia sp!, {d8-d15} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .align 4 + .align 4 #ifdef __APPLE__ - .globl _neon_static_x4_f + .globl _neon_static_x4_f _neon_static_x4_f: #else - .globl neon_static_x4_f + .globl neon_static_x4_f neon_static_x4_f: #endif -@ add r3, r0, #0 - push {r4, r5, r6, lr} - vstmdb sp!, {d8-d15} + add r3, r0, #64 + vpush {q4-q7} - vld1.32 {q8,q9}, [r0, :128] - add r4, r0, r1, lsl #1 - vld1.32 {q10,q11}, [r4, :128] - add r5, r0, r1, lsl #2 - vld1.32 {q12,q13}, [r5, :128] - add r6, r4, r1, lsl #2 - vld1.32 {q14,q15}, [r6, :128] - vld1.32 {q2,q3}, [r2, :128] - - vmul.f32 q0, q13, q3 - vmul.f32 q5, q12, q2 - vmul.f32 q1, q14, q2 - vmul.f32 q4, q14, q3 - vmul.f32 q14, q12, q3 - vmul.f32 q13, q13, q2 - vmul.f32 q12, q15, q3 - vmul.f32 q2, q15, q2 - vsub.f32 q0, q5, q0 - vadd.f32 q13, q13, q14 - vadd.f32 q12, q12, q1 - vsub.f32 q1, q2, q4 - vadd.f32 q15, q0, q12 - vsub.f32 q12, q0, q12 - vadd.f32 q14, q13, q1 - vsub.f32 q13, q13, q1 - vadd.f32 q0, q8, q15 - vadd.f32 q1, q9, q14 - vadd.f32 q2, q10, q13 @ - vsub.f32 q4, q8, q15 - vsub.f32 q3, q11, q12 @ - vst1.32 {q0,q1}, [r0, :128] - vsub.f32 q5, q9, q14 - vsub.f32 q6, q10, q13 @ - vadd.f32 q7, q11, q12 @ - vst1.32 {q2,q3}, [r4, :128] - vst1.32 {q4,q5}, [r5, :128] - vst1.32 {q6,q7}, [r6, :128] - vldmia sp!, {d8-d15} - pop {r4, r5, r6, pc} + vld1.32 {q2, q3}, [r1, :128] + vld1.32 {q12, q13}, [r3, :128]! + mov r2, r0 + vmul.f32 q0, q13, q3 + vld1.32 {q14, q15}, [r3, :128] + vmul.f32 q5, q12, q2 + vld1.32 {q8, q9}, [r0, :128]! + vmul.f32 q1, q14, q2 + vld1.32 {q10, q11}, [r0, :128] + vmul.f32 q4, q14, q3 + vmul.f32 q14, q12, q3 + vmul.f32 q13, q13, q2 + vmul.f32 q12, q15, q3 + vmul.f32 q2, q15, q2 + vsub.f32 q0, q5, q0 + vadd.f32 q13, q13, q14 + vadd.f32 q12, q12, q1 + vsub.f32 q1, q2, q4 + vadd.f32 q15, q0, q12 + vsub.f32 q12, q0, q12 + vadd.f32 q14, q13, q1 + vsub.f32 q13, q13, q1 + vadd.f32 q0, q8, q15 + vadd.f32 q1, q9, q14 + vadd.f32 q2, q10, q13 + vsub.f32 q4, q8, q15 + vsub.f32 q3, q11, q12 + + vst1.32 {q0, q1}, [r2, :128]! + + vsub.f32 q5, q9, q14 + vsub.f32 q6, q10, q13 + vadd.f32 q7, q11, q12 + vst1.32 {q2, q3}, [r2, :128]! + vst1.32 {q4, q5}, [r2, :128]! + vst1.32 {q6, q7}, [r2, :128] + vpop {q4-q7} + bx lr .align 4 #ifdef __APPLE__ diff --git a/src/neon_static_i.s b/src/neon_static_i.s index 5edc908..d8f8d9c 100644 --- a/src/neon_static_i.s +++ b/src/neon_static_i.s @@ -651,59 +651,57 @@ _neon_ee_o_loop2_exit: vldmia sp!, {d8-d15} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .align 4 + .align 4 #ifdef __APPLE__ - .globl _neon_static_x4_i + .globl _neon_static_x4_i _neon_static_x4_i: #else - .globl neon_static_x4_i + .globl neon_static_x4_i neon_static_x4_i: #endif -@ add r3, r0, #0 - push {r4, r5, r6, lr} - vstmdb sp!, {d8-d15} + add r3, r0, #64 + vpush {q4-q7} - vld1.32 {q8,q9}, [r0, :128] - add r4, r0, r1, lsl #1 - vld1.32 {q10,q11}, [r4, :128] - add r5, r0, r1, lsl #2 - vld1.32 {q12,q13}, [r5, :128] - add r6, r4, r1, lsl #2 - vld1.32 {q14,q15}, [r6, :128] - vld1.32 {q2,q3}, [r2, :128] - - vmul.f32 q0, q13, q3 - vmul.f32 q5, q12, q2 - vmul.f32 q1, q14, q2 - vmul.f32 q4, q14, q3 - vmul.f32 q14, q12, q3 - vmul.f32 q13, q13, q2 - vmul.f32 q12, q15, q3 - vmul.f32 q2, q15, q2 - vsub.f32 q0, q5, q0 - vadd.f32 q13, q13, q14 - vadd.f32 q12, q12, q1 - vsub.f32 q1, q2, q4 - vadd.f32 q15, q0, q12 - vsub.f32 q12, q0, q12 - vadd.f32 q14, q13, q1 - vsub.f32 q13, q13, q1 - vadd.f32 q0, q8, q15 - vadd.f32 q1, q9, q14 - vsub.f32 q2, q10, q13 @ - vsub.f32 q4, q8, q15 - vadd.f32 q3, q11, q12 @ - vst1.32 {q0,q1}, [r0, :128] - vsub.f32 q5, q9, q14 - vadd.f32 q6, q10, q13 @ - vsub.f32 q7, q11, q12 @ - vst1.32 {q2,q3}, [r4, :128] - vst1.32 {q4,q5}, [r5, :128] - vst1.32 {q6,q7}, [r6, :128] - vldmia sp!, {d8-d15} - pop {r4, r5, r6, pc} + vld1.32 {q2, q3}, [r1, :128] + vld1.32 {q12, q13}, [r3, :128]! + mov r2, r0 + vmul.f32 q0, q13, q3 + vld1.32 {q14, q15}, [r3, :128] + vmul.f32 q5, q12, q2 + vld1.32 {q8, q9}, [r0, :128]! + vmul.f32 q1, q14, q2 + vld1.32 {q10, q11}, [r0, :128] + vmul.f32 q4, q14, q3 + vmul.f32 q14, q12, q3 + vmul.f32 q13, q13, q2 + vmul.f32 q12, q15, q3 + vmul.f32 q2, q15, q2 + vsub.f32 q0, q5, q0 + vadd.f32 q13, q13, q14 + vadd.f32 q12, q12, q1 + vsub.f32 q1, q2, q4 + vadd.f32 q15, q0, q12 + vsub.f32 q12, q0, q12 + vadd.f32 q14, q13, q1 + vsub.f32 q13, q13, q1 + vadd.f32 q0, q8, q15 + vadd.f32 q1, q9, q14 + vsub.f32 q2, q10, q13 + vsub.f32 q4, q8, q15 + vadd.f32 q3, q11, q12 + + vst1.32 {q0, q1}, [r2, :128]! + + vsub.f32 q5, q9, q14 + vadd.f32 q6, q10, q13 + vsub.f32 q7, q11, q12 + vst1.32 {q2, q3}, [r2, :128]! + vst1.32 {q4, q5}, [r2, :128]! + vst1.32 {q6, q7}, [r2, :128] + vpop {q4-q7} + bx lr .align 4 #ifdef __APPLE__ -- cgit v1.1