author    Jukka Ojanen <jukka.ojanen@linkotec.net>  2016-03-29 17:01:01 +0300
committer Jukka Ojanen <jukka.ojanen@linkotec.net>  2016-03-29 17:01:01 +0300
commit    e464bcb622d5ab1426b14a2314d852fc6e1539e1 (patch)
tree      bbd0158431c0ac2f2d51345d1f777bd30b025341
parent    ab556f87890e483d1a0814096cad491de270b6ad (diff)
Fix neon_transpose8 for non-square matrices, move loops to assembly side, about 5% faster
-rw-r--r--  src/ffts_nd.c |   8
-rw-r--r--  src/neon.s    | 248
2 files changed, 194 insertions, 62 deletions
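
The change at a glance: neon_transpose8 writes the transpose of an h x w
matrix of 64-bit elements (one complex float each) into a w x h output.
Before this commit the C side walked the matrix in 8x8 tiles and called the
kernel once per tile; now the tiling lives in the assembly and a single call
covers the whole matrix. For orientation, a minimal scalar sketch of the
operation, assuming the row-major uint64_t layout the C caller uses:

#include <stdint.h>
#include <stddef.h>

/* Scalar reference for what neon_transpose8 computes (sketch only):
 * out, a w x h matrix, receives the transpose of in, an h x w matrix
 * of 64-bit elements.  The NEON kernel does the same work in 8x8 tiles. */
static void transpose_ref(const uint64_t *in, uint64_t *out, int w, int h)
{
    int i, j;
    for (j = 0; j < h; j++)
        for (i = 0; i < w; i++)
            out[(size_t) i * h + j] = in[(size_t) j * w + i];
}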
diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index ebce101..5745cd5 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -94,13 +94,7 @@ ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
#if 0
neon_transpose4(in, out, w, h);
#else
- size_t i, j;
-
- for (j = 0; j < h; j += 8) {
- for (i = 0; i < w; i += 8) {
- neon_transpose8(in + j*w + i, out + i*h + j, w, h);
- }
- }
+ neon_transpose8(in, out, w, h);
#endif
#else
#ifdef HAVE_SSE
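
The loop removed above is exactly the traversal that moves into neon.s: the
matrix is walked in 8x8 tiles, each tile landing at the mirrored position in
the output. A hedged sketch of that blocking, with the NEON tile kernel stood
in by scalar code (transpose_blocked is a hypothetical helper, not the
library API); note the tiling assumes w and h are multiples of 8:

#include <stdint.h>
#include <stddef.h>

static void transpose_blocked(const uint64_t *in, uint64_t *out, int w, int h)
{
    int i, j, ii, jj;
    for (j = 0; j < h; j += 8)              /* tile row of the input    */
        for (i = 0; i < w; i += 8)          /* tile column of the input */
            /* stand-in for the 8x8 NEON tile kernel */
            for (jj = 0; jj < 8; jj++)
                for (ii = 0; ii < 8; ii++)
                    out[(size_t) (i + ii) * h + (j + jj)] =
                        in[(size_t) (j + jj) * w + (i + ii)];
}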
diff --git a/src/neon.s b/src/neon.s
index 9b6ccab..7486b63 100644
--- a/src/neon.s
+++ b/src/neon.s
@@ -682,111 +682,249 @@ _neon_transpose8:
.globl neon_transpose8
neon_transpose8:
#endif
- push {r4-r8, lr}
+ push {r4-r12, lr}
vpush {q4-q7}
- @ initialize and preload (TODO: optimize)
+ @ initialize
+ lsl r2, r2, #3
+ mul lr, r2, r3
+ lsl r3, r3, #5
+ add r4, r0, r2
+ lsl ip, r2, #1
+ add r5, r1, r3, lsr #2
+ add r6, r1, r3, lsr #1
+ add r7, r5, r3, lsr #1
+ sub lr, r3, lr
+ sub ip, ip, #64
+ sub r8, r3, #48
+ add lr, lr, #16
+1:
+ @ process all but the last one
+ subs r11, r2, #64
+
+ @ prefetch next rows 0-5
pld [r0]
- add r4, r0, r2, lsl #3
- lsl ip, r2, #4
pld [r4]
- pld [r0, ip]
- add r6, r1, r2, lsl #3
- pld [r4, ip]
- add r7, r6, r2, lsl #3
- pld [r0, ip, lsl #1]
- pld [r0, ip, lsl #2]
- add r8, r7, r2, lsl #3
- pld [r4, ip, lsl #1]
- pld [r4, ip, lsl #2]
- sub ip, ip, #32
- lsl r3, ip, #1
- add r3, r3, #16
+ pld [r0, r2, lsl #1]
+ pld [r4, r2, lsl #1]
+ pld [r0, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ @ if there is only the last one
+ beq 3f
+2:
@ matrix 0&2 row 0-1
vld1.32 {q0, q1}, [r0, :128]!
vld1.32 {q2, q3}, [r4, :128]!
vswp d1, d4
vswp d3, d6
vst1.32 {q0}, [r1, :128]!
- vst1.32 {q2}, [r6, :128]!
- vst1.32 {q1}, [r7, :128]!
- vst1.32 {q3}, [r8, :128]!
+ vst1.32 {q2}, [r5, :128]!
+ vst1.32 {q1}, [r6, :128]!
+ vst1.32 {q3}, [r7, :128]!
@ matrix 1&3 row 0-1
- vld1.32 {q4, q5}, [r0, :128], ip
- vld1.32 {q6, q7}, [r4, :128], ip
+ vld1.32 {q4, q5}, [r0, :128]!
+ vld1.32 {q6, q7}, [r4, :128]!
vswp d9, d12
vswp d11, d14
+ @ prefetch next rows 0-1
+ pld [r0]
+ pld [r4]
+ add r9, r0, ip
+ add r10, r4, ip
+
@ matrix 0&2, row 2-3
- vld1.32 {q0, q1}, [r0, :128]!
- vld1.32 {q2, q3}, [r4, :128]!
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
vswp d1, d4
vswp d3, d6
vst1.32 {q0}, [r1, :128]!
- vst1.32 {q2}, [r6, :128]!
- vst1.32 {q1}, [r7, :128]!
- vst1.32 {q3}, [r8, :128]!
+ vst1.32 {q2}, [r5, :128]!
+ vst1.32 {q1}, [r6, :128]!
+ vst1.32 {q3}, [r7, :128]!
@ matrix 1&3, row 2-3
- vld1.32 {q8, q9}, [r0, :128], ip
- vld1.32 {q10, q11}, [r4, :128], ip
+ vld1.32 {q8, q9}, [r9, :128]!
+ vld1.32 {q10, q11}, [r10, :128]!
vswp d17, d20
vswp d19, d22
+ @ prefetch next rows 2-3
+ pld [r9]
+ pld [r10]
+ add r9, r9, ip
+ add r10, r10, ip
+
@ matrix 0&2, row 4-5
- vld1.32 {q0, q1}, [r0, :128]!
- vld1.32 {q2, q3}, [r4, :128]!
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
vswp d1, d4
vswp d3, d6
vst1.32 {q0}, [r1, :128]!
- vst1.32 {q2}, [r6, :128]!
- vst1.32 {q1}, [r7, :128]!
- vst1.32 {q3}, [r8, :128]!
+ vst1.32 {q2}, [r5, :128]!
+ vst1.32 {q1}, [r6, :128]!
+ vst1.32 {q3}, [r7, :128]!
@ matrix 1&3, row 4-5
- vld1.32 {q12, q13}, [r0, :128], ip
- vld1.32 {q14, q15}, [r4, :128], ip
+ vld1.32 {q12, q13}, [r9, :128]!
+ vld1.32 {q14, q15}, [r10, :128]!
vswp d25, d28
vswp d27, d30
+ @ prefetch next rows 4-5
+ pld [r9]
+ pld [r10]
+ add r9, r9, ip
+ add r10, r10, ip
+
@ matrix 0&2, row 6-7
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
+ vswp d1, d4
+ vswp d3, d6
+ vst1.32 {q0}, [r1, :128], r8
+ vst1.32 {q2}, [r5, :128], r8
+ vst1.32 {q1}, [r6, :128], r8
+ vst1.32 {q3}, [r7, :128], r8
+
+ @ matrix 1&3, row 6-7
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
+ vswp d1, d4
+ vswp d3, d6
+
+ @ prefetch next rows 6-7
+ pld [r9]
+ pld [r10]
+
+ subs r11, r11, #64
+
+ @ these could be replaced with VSTM, but that requires swaps
+ vst1.32 {q4}, [r1, :128]!
+ vst1.32 {q8}, [r1, :128]!
+ vst1.32 {q12}, [r1, :128]!
+ vst1.32 {q0}, [r1, :128], r8
+
+ vst1.32 {q6}, [r5, :128]!
+ vst1.32 {q10}, [r5, :128]!
+ vst1.32 {q14}, [r5, :128]!
+ vst1.32 {q2}, [r5, :128], r8
+
+ vst1.32 {q5}, [r6, :128]!
+ vst1.32 {q9}, [r6, :128]!
+ vst1.32 {q13}, [r6, :128]!
+ vst1.32 {q1}, [r6, :128], r8
+
+ vst1.32 {q7}, [r7, :128]!
+ vst1.32 {q11}, [r7, :128]!
+ vst1.32 {q15}, [r7, :128]!
+ vst1.32 {q3}, [r7, :128], r8
+
+ @ process all but the last one on the row
+ bne 2b
+3:
+ @ process the last one
+ subs r3, r3, #256
+
+ @ matrix 0&2 row 0-1
vld1.32 {q0, q1}, [r0, :128]!
vld1.32 {q2, q3}, [r4, :128]!
vswp d1, d4
vswp d3, d6
- vst1.32 {q0}, [r1, :128], r3
- vst1.32 {q2}, [r6, :128], r3
- vst1.32 {q1}, [r7, :128], r3
- vst1.32 {q3}, [r8, :128], r3
+ vst1.32 {q0}, [r1, :128]!
+ vst1.32 {q2}, [r5, :128]!
+ vst1.32 {q1}, [r6, :128]!
+ vst1.32 {q3}, [r7, :128]!
+
+ @ matrix 1&3 row 0-1
+ vld1.32 {q4, q5}, [r0, :128]!
+ vld1.32 {q6, q7}, [r4, :128]!
+ vswp d9, d12
+ vswp d11, d14
+ add r9, r0, ip
+ add r10, r4, ip
+
+ @ matrix 0&2, row 2-3
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
+ vswp d1, d4
+ vswp d3, d6
+ vst1.32 {q0}, [r1, :128]!
+ vst1.32 {q2}, [r5, :128]!
+ vst1.32 {q1}, [r6, :128]!
+ vst1.32 {q3}, [r7, :128]!
+
+ @ matrix 1&3, row 2-3
+ vld1.32 {q8, q9}, [r9, :128]!
+ vld1.32 {q10, q11}, [r10, :128]!
+ vswp d17, d20
+ vswp d19, d22
+ add r9, r9, ip
+ add r10, r10, ip
+
+ @ matrix 0&2, row 4-5
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
+ vswp d1, d4
+ vswp d3, d6
+ vst1.32 {q0}, [r1, :128]!
+ vst1.32 {q2}, [r5, :128]!
+ vst1.32 {q1}, [r6, :128]!
+ vst1.32 {q3}, [r7, :128]!
+
+ @ matrix 1&3, row 4-5
+ vld1.32 {q12, q13}, [r9, :128]!
+ vld1.32 {q14, q15}, [r10, :128]!
+ vswp d25, d28
+ vswp d27, d30
+ add r9, r9, ip
+ add r10, r10, ip
+
+ @ matrix 0&2, row 6-7
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
+ vswp d1, d4
+ vswp d3, d6
+ vst1.32 {q0}, [r1, :128], r8
+ vst1.32 {q2}, [r5, :128], r8
+ vst1.32 {q1}, [r6, :128], r8
+ vst1.32 {q3}, [r7, :128], r8
@ matrix 1&3, row 6-7
- vld1.32 {q0, q1}, [r0, :128]
- vld1.32 {q2, q3}, [r4, :128]
+ vld1.32 {q0, q1}, [r9, :128]!
+ vld1.32 {q2, q3}, [r10, :128]!
vswp d1, d4
vswp d3, d6
+ @ next row starts right after
+ mov r0, r10
+ add r4, r10, r2
+
@ these could be replaced with VSTM, but that requires swaps
vst1.32 {q4}, [r1, :128]!
vst1.32 {q8}, [r1, :128]!
vst1.32 {q12}, [r1, :128]!
- vst1.32 {q0}, [r1, :128]
+ vst1.32 {q0}, [r1, :128], lr
+
+ vst1.32 {q6}, [r5, :128]!
+ vst1.32 {q10}, [r5, :128]!
+ vst1.32 {q14}, [r5, :128]!
+ vst1.32 {q2}, [r5, :128], lr
- vst1.32 {q6}, [r6, :128]!
- vst1.32 {q10}, [r6, :128]!
- vst1.32 {q14}, [r6, :128]!
- vst1.32 {q2}, [r6, :128]
+ vst1.32 {q5}, [r6, :128]!
+ vst1.32 {q9}, [r6, :128]!
+ vst1.32 {q13}, [r6, :128]!
+ vst1.32 {q1}, [r6, :128], lr
- vst1.32 {q5}, [r7, :128]!
- vst1.32 {q9}, [r7, :128]!
- vst1.32 {q13}, [r7, :128]!
- vst1.32 {q1}, [r7, :128]
+ vst1.32 {q7}, [r7, :128]!
+ vst1.32 {q11}, [r7, :128]!
+ vst1.32 {q15}, [r7, :128]!
+ vst1.32 {q3}, [r7, :128], lr
- vst1.32 {q7}, [r8, :128]!
- vst1.32 {q11}, [r8, :128]!
- vst1.32 {q15}, [r8, :128]!
- vst1.32 {q3}, [r8, :128]
+ @ process all columns
+ bne 1b
vpop {q4-q7}
- pop {r4-r8, pc}
+ pop {r4-r12, pc}
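
Inside each 8x8 tile the kernel works on row pairs: every q register holds
two 64-bit elements, and a vswp of the crossing d halves (e.g. vswp d1, d4)
transposes a 2x2 tile in place, after which the four q registers scatter to
the four output rows tracked in r1, r5, r6 and r7, spaced one output row
(8*h bytes) apart. A small C model of that swap, with hypothetical names for
illustration only:

#include <stdint.h>

/* q0 = {in[r][c], in[r][c+1]} (halves d0, d1); q2 = {in[r+1][c],
 * in[r+1][c+1]} (halves d4, d5).  Swapping d1 with d4 leaves
 * q0 = {in[r][c], in[r+1][c]} and q2 = {in[r][c+1], in[r+1][c+1]}:
 * two transposed column pairs, ready to store to consecutive output rows. */
static void vswp_model(uint64_t q0[2], uint64_t q2[2])
{
    uint64_t t = q0[1];   /* d1           */
    q0[1] = q2[0];        /* d1 <- d4     */
    q2[0] = t;            /* d4 <- old d1 */
}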