author:    wma <wma@FreeBSD.org>  2016-03-23 13:29:52 +0000
committer: wma <wma@FreeBSD.org>  2016-03-23 13:29:52 +0000
commit:    15e11f0a03013ee1bae4d9be9d13810adb32e629 (patch)
tree:      d58de890e82b689c530dffec91bc2605f8cf2ffd /sys/arm64
parent:    785250e758a6894d88aed2f96c5286dae61b3ca1 (diff)
ARM64 copyinout improvements
The first of a set of patches. Use wider loads/stores when an aligned
buffer is being copied. In a simple test:

    dd if=/dev/zero of=/dev/null bs=1M count=1024

the throughput jumped from 410MB/s up to 3.6GB/s.

TODO:
- better handling of unaligned buffers (WiP)
- implement a similar mechanism for bzero

Submitted by:           Dominik Ermel <der@semihalf.com>
Obtained from:          Semihalf
Sponsored by:           Cavium
Reviewed by:            kib, andrew, emaste
Differential Revision:  https://reviews.freebsd.org/D5664
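As an illustration of the approach, here is a minimal C sketch of the strategy the patch implements in assembly: fall back to a bytewise loop when either pointer is not 8-byte aligned, otherwise move 64-byte blocks, then 16-byte pairs, then at most one transfer per remaining power of two. The name copy_widened is illustrative only; the committed code is the copycommon assembly routine in the diff below, which additionally arms a fault handler around the copy (and strict-aliasing rules are glossed over here for brevity).

    #include <stddef.h>
    #include <stdint.h>

    void
    copy_widened(const char *src, char *dst, size_t len)
    {
            const uint64_t *s8;
            uint64_t *d8;

            /* orr/ands: if either pointer is not 8-byte aligned, go bytewise. */
            if (((uintptr_t)src | (uintptr_t)dst) & 0x7) {
                    while (len--)
                            *dst++ = *src++;
                    return;
            }
            /* by_blocks: 64 bytes per iteration, like the four ldp/stp pairs. */
            while (len >= 64) {
                    s8 = (const uint64_t *)src;
                    d8 = (uint64_t *)dst;
                    d8[0] = s8[0]; d8[1] = s8[1]; d8[2] = s8[2]; d8[3] = s8[3];
                    d8[4] = s8[4]; d8[5] = s8[5]; d8[6] = s8[6]; d8[7] = s8[7];
                    src += 64; dst += 64; len -= 64;
            }
            /* by_dwords: 16 bytes per iteration, like the single ldp/stp pair. */
            while (len >= 16) {
                    s8 = (const uint64_t *)src;
                    d8 = (uint64_t *)dst;
                    d8[0] = s8[0]; d8[1] = s8[1];
                    src += 16; dst += 16; len -= 16;
            }
            /* lead_out: one test per bit of the residue, like the tbz chain. */
            if (len & 8) { *(uint64_t *)dst = *(const uint64_t *)src; src += 8; dst += 8; }
            if (len & 4) { *(uint32_t *)dst = *(const uint32_t *)src; src += 4; dst += 4; }
            if (len & 2) { *(uint16_t *)dst = *(const uint16_t *)src; src += 2; dst += 2; }
            if (len & 1)
                    *dst = *src;
    }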
Diffstat (limited to 'sys/arm64')
-rw-r--r--  sys/arm64/arm64/copyinout.S | 128
1 file changed, 106 insertions(+), 22 deletions(-)
diff --git a/sys/arm64/arm64/copyinout.S b/sys/arm64/arm64/copyinout.S
index 1ba8106..a146ac9 100644
--- a/sys/arm64/arm64/copyinout.S
+++ b/sys/arm64/arm64/copyinout.S
@@ -51,24 +51,17 @@ END(copyio_fault)
* int copyout(const void *kaddr, void *udaddr, size_t len)
*/
ENTRY(copyout)
- cbz x2, 2f /* If len == 0 then skip loop */
+ cbz x2, 1f
add x3, x1, x2
ldr x4, =VM_MAXUSER_ADDRESS
cmp x3, x4
b.hi copyio_fault_nopcb
- adr x6, copyio_fault /* Get the handler address */
- SET_FAULT_HANDLER(x6, x7) /* Set the handler */
-
-1: ldrb w4, [x0], #1 /* Load from kaddr */
- strb w4, [x1], #1 /* Store in uaddr */
- sub x2, x2, #1 /* len-- */
- cbnz x2, 1b
-
- SET_FAULT_HANDLER(xzr, x7) /* Clear the handler */
+ b copycommon
-2: mov x0, xzr /* return 0 */
+1: mov x0, xzr /* return 0 */
ret
+
END(copyout)
/*
@@ -77,24 +70,17 @@ END(copyout)
* int copyin(const void *uaddr, void *kdaddr, size_t len)
*/
ENTRY(copyin)
- cbz x2, 2f /* If len == 0 then skip loop */
+ cbz x2, 1f
add x3, x0, x2
ldr x4, =VM_MAXUSER_ADDRESS
cmp x3, x4
b.hi copyio_fault_nopcb
- adr x6, copyio_fault /* Get the handler address */
- SET_FAULT_HANDLER(x6, x7) /* Set the handler */
-
-1: ldrb w4, [x0], #1 /* Load from uaddr */
- strb w4, [x1], #1 /* Store in kaddr */
- sub x2, x2, #1 /* len-- */
- cbnz x2, 1b
-
- SET_FAULT_HANDLER(xzr, x7) /* Clear the handler */
+ b copycommon
-2: mov x0, xzr /* return 0 */
+1: mov x0, xzr /* return 0 */
ret
+
END(copyin)
/*
@@ -130,3 +116,101 @@ ENTRY(copyinstr)
csel w0, wzr, w1, eq /* If so return success, else failure */
ret
END(copyinstr)
+
+/*
+ * Local helper
+ *
+ * x0 - src pointer
+ * x1 - dst pointer
+ * x2 - size
+ * lr - the return address, so jump here instead of calling
+ *
+ * This function is optimized to minimize concurrent memory accesses. In
+ * present form it is suited for cores with a single memory prefetching
+ * unit.
+ * ARM64TODO:
+ * Consider using separate functions for each ARM64 core. Adding memory
+ * access interleaving might increase a total throughput on A57 or A72.
+ */
+ .text
+ .align 4
+ .local copycommon
+ .type copycommon,@function
+
+copycommon:
+ adr x6, copyio_fault /* Get the handler address */
+ SET_FAULT_HANDLER(x6, x7) /* Set the handler */
+
+
+ /* Check alignment */
+ orr x3, x0, x1
+ ands x3, x3, 0x07
+ b.eq aligned
+
+ /* Unaligned is byte by byte copy */
+byte_by_byte:
+ ldrb w3, [x0], #0x01
+ strb w3, [x1], #0x01
+ subs x2, x2, #0x01
+ b.ne byte_by_byte
+ b ending
+
+aligned:
+ cmp x2, #0x10
+ b.lt lead_out
+ cmp x2, #0x40
+ b.lt by_dwords_start
+
+ /* Block copy */
+ lsr x15, x2, #0x06
+by_blocks:
+ ldp x3, x4, [x0], #0x10
+ ldp x5, x6, [x0], #0x10
+ ldp x7, x8, [x0], #0x10
+ ldp x9, x10, [x0], #0x10
+ stp x3, x4, [x1], #0x10
+ stp x5, x6, [x1], #0x10
+ stp x7, x8, [x1], #0x10
+ stp x9, x10, [x1], #0x10
+
+ subs x15, x15, #0x01
+ b.ne by_blocks
+
+ and x2, x2, #0x3f
+
+by_dwords_start:
+ lsr x15, x2, #0x04
+ cbz x15, lead_out
+by_dwords:
+ ldp x3, x4, [x0], #0x10
+ stp x3, x4, [x1], #0x10
+ subs x15, x15, #0x01
+ b.ne by_dwords
+
+ /* Less than 16 bytes to copy */
+lead_out:
+ tbz x2, #0x03, last_word
+ ldr x3, [x0], #0x08
+ str x3, [x1], #0x08
+
+last_word:
+ tbz x2, #0x02, last_hword
+ ldr w3, [x0], #0x04
+ str w3, [x1], #0x04
+
+last_hword:
+ tbz x2, #0x01, last_byte
+ ldrh w3, [x0], #0x02
+ strh w3, [x1], #0x02
+
+last_byte:
+ tbz x2, #0x00, ending
+ ldrb w3, [x0]
+ strb w3, [x1]
+
+ending:
+ SET_FAULT_HANDLER(xzr, x7) /* Clear the handler */
+
+ mov x0, xzr /* return 0 */
+ ret
+ .size copycommon, . - copycommon
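For completeness, a small userland harness (hypothetical, not part of the commit) that cross-checks the copy_widened sketch above against memcpy for every source/destination alignment and a range of lengths; getting mixed alignments and odd tail lengths right is exactly what the byte_by_byte fallback and the tbz lead-out are responsible for.

    #include <stdio.h>
    #include <string.h>

    void copy_widened(const char *src, char *dst, size_t len); /* sketch above */

    int
    main(void)
    {
            char src[160], dst[160];
            size_t i, soff, doff, len;

            for (i = 0; i < sizeof(src); i++)
                    src[i] = (char)(i * 7 + 1);

            /* Cover aligned, unaligned and mixed pointer pairs, lengths 1..128. */
            for (soff = 0; soff < 8; soff++)
                    for (doff = 0; doff < 8; doff++)
                            for (len = 1; len <= 128; len++) {
                                    memset(dst, 0, sizeof(dst));
                                    copy_widened(src + soff, dst + doff, len);
                                    if (memcmp(src + soff, dst + doff, len) != 0) {
                                            printf("mismatch soff=%zu doff=%zu len=%zu\n",
                                                soff, doff, len);
                                            return (1);
                                    }
                            }
            printf("all copies match\n");
            return (0);
    }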