From e464bcb622d5ab1426b14a2314d852fc6e1539e1 Mon Sep 17 00:00:00 2001
From: Jukka Ojanen
Date: Tue, 29 Mar 2016 17:01:01 +0300
Subject: Fix neon_transpose8 for non-square matrices, move loops to assembly
 side, about 5% faster

---
 src/ffts_nd.c |   8 +-
 src/neon.s    | 248 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 194 insertions(+), 62 deletions(-)

diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index ebce101..5745cd5 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -94,13 +94,7 @@ ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
 #if 0
     neon_transpose4(in, out, w, h);
 #else
-    size_t i, j;
-
-    for (j = 0; j < h; j += 8) {
-        for (i = 0; i < w; i += 8) {
-            neon_transpose8(in + j*w + i, out + i*h + j, w, h);
-        }
-    }
+    neon_transpose8(in, out, w, h);
 #endif
 #else
 #ifdef HAVE_SSE
diff --git a/src/neon.s b/src/neon.s
index 9b6ccab..7486b63 100644
--- a/src/neon.s
+++ b/src/neon.s
@@ -682,111 +682,249 @@ _neon_transpose8:
 .globl neon_transpose8
 neon_transpose8:
 #endif
-    push    {r4-r8, lr}
+    push    {r4-r12, lr}
     vpush   {q4-q7}
 
-    @ initialize and preload (TODO: optimize)
+    @ initialize
+    lsl     r2, r2, #3
+    mul     lr, r2, r3
+    lsl     r3, r3, #5
+    add     r4, r0, r2
+    lsl     ip, r2, #1
+    add     r5, r1, r3, lsr #2
+    add     r6, r1, r3, lsr #1
+    add     r7, r5, r3, lsr #1
+    sub     lr, r3, lr
+    sub     ip, ip, #64
+    sub     r8, r3, #48
+    add     lr, lr, #16
+1:
+    @ process all but the last one
+    subs    r11, r2, #64
+
+    @ prefetch next rows 0-5
     pld     [r0]
-    add     r4, r0, r2, lsl #3
-    lsl     ip, r2, #4
     pld     [r4]
-    pld     [r0, ip]
-    add     r6, r1, r2, lsl #3
-    pld     [r4, ip]
-    add     r7, r6, r2, lsl #3
-    pld     [r0, ip, lsl #1]
-    pld     [r0, ip, lsl #2]
-    add     r8, r7, r2, lsl #3
-    pld     [r4, ip, lsl #1]
-    pld     [r4, ip, lsl #2]
-    sub     ip, ip, #32
-    lsl     r3, ip, #1
-    add     r3, r3, #16
+    pld     [r0, r2, lsl #1]
+    pld     [r4, r2, lsl #1]
+    pld     [r0, r2, lsl #2]
+    pld     [r4, r2, lsl #2]
 
+    @ if there is only the last one
+    beq     3f
+2:
     @ matrix 0&2 row 0-1
     vld1.32 {q0, q1}, [r0, :128]!
     vld1.32 {q2, q3}, [r4, :128]!
     vswp    d1, d4
     vswp    d3, d6
     vst1.32 {q0}, [r1, :128]!
-    vst1.32 {q2}, [r6, :128]!
-    vst1.32 {q1}, [r7, :128]!
-    vst1.32 {q3}, [r8, :128]!
+    vst1.32 {q2}, [r5, :128]!
+    vst1.32 {q1}, [r6, :128]!
+    vst1.32 {q3}, [r7, :128]!
 
     @ matrix 1&3 row 0-1
-    vld1.32 {q4, q5}, [r0, :128], ip
-    vld1.32 {q6, q7}, [r4, :128], ip
+    vld1.32 {q4, q5}, [r0, :128]!
+    vld1.32 {q6, q7}, [r4, :128]!
     vswp    d9, d12
     vswp    d11, d14
 
+    @ prefetch next rows 0-1
+    pld     [r0]
+    pld     [r4]
+    add     r9, r0, ip
+    add     r10, r4, ip
+
     @ matrix 0&2, row 2-3
-    vld1.32 {q0, q1}, [r0, :128]!
-    vld1.32 {q2, q3}, [r4, :128]!
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
     vswp    d1, d4
     vswp    d3, d6
     vst1.32 {q0}, [r1, :128]!
-    vst1.32 {q2}, [r6, :128]!
-    vst1.32 {q1}, [r7, :128]!
-    vst1.32 {q3}, [r8, :128]!
+    vst1.32 {q2}, [r5, :128]!
+    vst1.32 {q1}, [r6, :128]!
+    vst1.32 {q3}, [r7, :128]!
 
     @ matrix 1&3, row 2-3
-    vld1.32 {q8, q9}, [r0, :128], ip
-    vld1.32 {q10, q11}, [r4, :128], ip
+    vld1.32 {q8, q9}, [r9, :128]!
+    vld1.32 {q10, q11}, [r10, :128]!
     vswp    d17, d20
     vswp    d19, d22
 
+    @ prefetch next rows 2-3
+    pld     [r9]
+    pld     [r10]
+    add     r9, r9, ip
+    add     r10, r10, ip
+
     @ matrix 0&2, row 4-5
-    vld1.32 {q0, q1}, [r0, :128]!
-    vld1.32 {q2, q3}, [r4, :128]!
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
     vswp    d1, d4
     vswp    d3, d6
     vst1.32 {q0}, [r1, :128]!
-    vst1.32 {q2}, [r6, :128]!
-    vst1.32 {q1}, [r7, :128]!
-    vst1.32 {q3}, [r8, :128]!
+    vst1.32 {q2}, [r5, :128]!
+    vst1.32 {q1}, [r6, :128]!
+    vst1.32 {q3}, [r7, :128]!
 
     @ matrix 1&3, row 4-5
-    vld1.32 {q12, q13}, [r0, :128], ip
-    vld1.32 {q14, q15}, [r4, :128], ip
+    vld1.32 {q12, q13}, [r9, :128]!
+    vld1.32 {q14, q15}, [r10, :128]!
     vswp    d25, d28
     vswp    d27, d30
 
+    @ prefetch next rows 4-5
+    pld     [r9]
+    pld     [r10]
+    add     r9, r9, ip
+    add     r10, r10, ip
+
     @ matrix 0&2, row 6-7
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
+    vswp    d1, d4
+    vswp    d3, d6
+    vst1.32 {q0}, [r1, :128], r8
+    vst1.32 {q2}, [r5, :128], r8
+    vst1.32 {q1}, [r6, :128], r8
+    vst1.32 {q3}, [r7, :128], r8
+
+    @ matrix 1&3, row 6-7
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
+    vswp    d1, d4
+    vswp    d3, d6
+
+    @ prefetch next rows 6-7
+    pld     [r9]
+    pld     [r10]
+
+    subs    r11, r11, #64
+
+    @ these could be replaced with VSTM, but that requires swaps
+    vst1.32 {q4}, [r1, :128]!
+    vst1.32 {q8}, [r1, :128]!
+    vst1.32 {q12}, [r1, :128]!
+    vst1.32 {q0}, [r1, :128], r8
+
+    vst1.32 {q6}, [r5, :128]!
+    vst1.32 {q10}, [r5, :128]!
+    vst1.32 {q14}, [r5, :128]!
+    vst1.32 {q2}, [r5, :128], r8
+
+    vst1.32 {q5}, [r6, :128]!
+    vst1.32 {q9}, [r6, :128]!
+    vst1.32 {q13}, [r6, :128]!
+    vst1.32 {q1}, [r6, :128], r8
+
+    vst1.32 {q7}, [r7, :128]!
+    vst1.32 {q11}, [r7, :128]!
+    vst1.32 {q15}, [r7, :128]!
+    vst1.32 {q3}, [r7, :128], r8
+
+    @ process all but the last on row
+    bne     2b
+3:
+    @ process the last one
+    subs    r3, r3, #256
+
+    @ matrix 0&2 row 0-1
     vld1.32 {q0, q1}, [r0, :128]!
     vld1.32 {q2, q3}, [r4, :128]!
     vswp    d1, d4
     vswp    d3, d6
-    vst1.32 {q0}, [r1, :128], r3
-    vst1.32 {q2}, [r6, :128], r3
-    vst1.32 {q1}, [r7, :128], r3
-    vst1.32 {q3}, [r8, :128], r3
+    vst1.32 {q0}, [r1, :128]!
+    vst1.32 {q2}, [r5, :128]!
+    vst1.32 {q1}, [r6, :128]!
+    vst1.32 {q3}, [r7, :128]!
+
+    @ matrix 1&3 row 0-1
+    vld1.32 {q4, q5}, [r0, :128]!
+    vld1.32 {q6, q7}, [r4, :128]!
+    vswp    d9, d12
+    vswp    d11, d14
+    add     r9, r0, ip
+    add     r10, r4, ip
+
+    @ matrix 0&2, row 2-3
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
+    vswp    d1, d4
+    vswp    d3, d6
+    vst1.32 {q0}, [r1, :128]!
+    vst1.32 {q2}, [r5, :128]!
+    vst1.32 {q1}, [r6, :128]!
+    vst1.32 {q3}, [r7, :128]!
+
+    @ matrix 1&3, row 2-3
+    vld1.32 {q8, q9}, [r9, :128]!
+    vld1.32 {q10, q11}, [r10, :128]!
+    vswp    d17, d20
+    vswp    d19, d22
+    add     r9, r9, ip
+    add     r10, r10, ip
+
+    @ matrix 0&2, row 4-5
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
+    vswp    d1, d4
+    vswp    d3, d6
+    vst1.32 {q0}, [r1, :128]!
+    vst1.32 {q2}, [r5, :128]!
+    vst1.32 {q1}, [r6, :128]!
+    vst1.32 {q3}, [r7, :128]!
+
+    @ matrix 1&3, row 4-5
+    vld1.32 {q12, q13}, [r9, :128]!
+    vld1.32 {q14, q15}, [r10, :128]!
+    vswp    d25, d28
+    vswp    d27, d30
+    add     r9, r9, ip
+    add     r10, r10, ip
+
+    @ matrix 0&2, row 6-7
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
+    vswp    d1, d4
+    vswp    d3, d6
+    vst1.32 {q0}, [r1, :128], r8
+    vst1.32 {q2}, [r5, :128], r8
+    vst1.32 {q1}, [r6, :128], r8
+    vst1.32 {q3}, [r7, :128], r8
 
     @ matrix 1&3, row 6-7
-    vld1.32 {q0, q1}, [r0, :128]
-    vld1.32 {q2, q3}, [r4, :128]
+    vld1.32 {q0, q1}, [r9, :128]!
+    vld1.32 {q2, q3}, [r10, :128]!
     vswp    d1, d4
     vswp    d3, d6
 
+    @ next row starts right after
+    mov     r0, r10
+    add     r4, r10, r2
+
     @ these could be replaced with VSTM, but that requires swaps
     vst1.32 {q4}, [r1, :128]!
     vst1.32 {q8}, [r1, :128]!
     vst1.32 {q12}, [r1, :128]!
-    vst1.32 {q0}, [r1, :128]
+    vst1.32 {q0}, [r1, :128], lr
+
+    vst1.32 {q6}, [r5, :128]!
+    vst1.32 {q10}, [r5, :128]!
+    vst1.32 {q14}, [r5, :128]!
+    vst1.32 {q2}, [r5, :128], lr
 
-    vst1.32 {q6}, [r6, :128]!
-    vst1.32 {q10}, [r6, :128]!
-    vst1.32 {q14}, [r6, :128]!
-    vst1.32 {q2}, [r6, :128]
+    vst1.32 {q5}, [r6, :128]!
+    vst1.32 {q9}, [r6, :128]!
+    vst1.32 {q13}, [r6, :128]!
+    vst1.32 {q1}, [r6, :128], lr
 
-    vst1.32 {q5}, [r7, :128]!
-    vst1.32 {q9}, [r7, :128]!
-    vst1.32 {q13}, [r7, :128]!
-    vst1.32 {q1}, [r7, :128]
+    vst1.32 {q7}, [r7, :128]!
+    vst1.32 {q11}, [r7, :128]!
+    vst1.32 {q15}, [r7, :128]!
+    vst1.32 {q3}, [r7, :128], lr
 
-    vst1.32 {q7}, [r8, :128]!
-    vst1.32 {q11}, [r8, :128]!
-    vst1.32 {q15}, [r8, :128]!
-    vst1.32 {q3}, [r8, :128]
+    @ process all columns
+    bne     1b
 
     vpop    {q4-q7}
-    pop     {r4-r8, pc}
+    pop     {r4-r12, pc}
-- 
cgit v1.1
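Editorial note on the intended semantics of the new call: with the C-side tile loop removed from ffts_transpose, neon_transpose8(in, out, w, h) is now expected to transpose the entire w x h matrix of 64-bit elements (interleaved complex floats) on its own, walking it in 8x8 blocks. The scalar sketch below is an illustration only, not the committed NEON code; it assumes w and h are multiples of 8 (the assembly has no edge handling), and the name transpose8_reference is invented for this example.

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for the result neon_transpose8(in, out, w, h) is
 * expected to produce after this patch (assumption for illustration):
 * a full transpose of the w x h matrix of 64-bit elements, i.e. the
 * same output the removed C wrapper built one 8x8 tile at a time.
 * Assumes w and h are multiples of 8 and in/out do not overlap. */
static void transpose8_reference(const uint64_t *in, uint64_t *out, int w, int h)
{
    size_t i, j;

    for (j = 0; j < (size_t) h; j++) {
        for (i = 0; i < (size_t) w; i++) {
            /* element at row j, column i moves to row i, column j */
            out[i * (size_t) h + j] = in[j * (size_t) w + i];
        }
    }
}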