summaryrefslogtreecommitdiffstats
path: root/arch/sh
diff options
context:
space:
mode:
authorStuart Menefy <stuart.menefy@st.com>2007-09-28 12:36:35 +0900
committerPaul Mundt <lethal@linux-sh.org>2007-09-28 12:36:35 +0900
commit023ef184fff6ac2e7cba345708f35536a2a419cb (patch)
treec80cc81eeb473a214fee0d2a0e952448071cf154 /arch/sh
parent24eb17e0813490497f4d5b2fad218bdba402cece (diff)
downloadop-kernel-dev-023ef184fff6ac2e7cba345708f35536a2a419cb.zip
op-kernel-dev-023ef184fff6ac2e7cba345708f35536a2a419cb.tar.gz
sh: __copy_user() optimizations for small copies.
This implements a fast-path for small (less than 12 bytes) copies, with the existing path treated as the slow-path and left as the default behaviour for all other copy sizes. Signed-off-by: Stuart Menefy <stuart.menefy@st.com> Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch/sh')
-rw-r--r--arch/sh/mm/copy_page.S169
1 files changed, 108 insertions, 61 deletions
diff --git a/arch/sh/mm/copy_page.S b/arch/sh/mm/copy_page.S
index ae039f2..a81dbdb 100644
--- a/arch/sh/mm/copy_page.S
+++ b/arch/sh/mm/copy_page.S
@@ -141,47 +141,38 @@ ENTRY(__copy_user_page)
.long 9999b, 6000f ; \
.previous
ENTRY(__copy_user)
- tst r6,r6 ! Check explicitly for zero
- bf 1f
- rts
- mov #0,r0 ! normal return
-1:
- mov.l r10,@-r15
- mov.l r9,@-r15
- mov.l r8,@-r15
+ ! Check if small number of bytes
+ mov #11,r0
mov r4,r3
- add r6,r3 ! last destination address
- mov #12,r0 ! Check if small number of bytes
- cmp/gt r0,r6
- bt 2f
- bra .L_cleanup_loop
- nop
-2:
- neg r5,r0 ! Calculate bytes needed to align source
+ cmp/gt r0,r6 ! r6 (len) > r0 (11)
+ bf/s .L_cleanup_loop_no_pop
+ add r6,r3 ! last destination address
+
+ ! Calculate bytes needed to align to src
+ mov.l r11,@-r15
+ neg r5,r0
+ mov.l r10,@-r15
add #4,r0
+ mov.l r9,@-r15
and #3,r0
+ mov.l r8,@-r15
tst r0,r0
- bt .L_jump
- mov r0,r1
+ bt 2f
-.L_loop1:
- ! Copy bytes to align source
-EX( mov.b @r5+,r0 )
- dt r1
-EX( mov.b r0,@r4 )
+1:
+ ! Copy bytes to long word align src
+EX( mov.b @r5+,r1 )
+ dt r0
add #-1,r6
- bf/s .L_loop1
+EX( mov.b r1,@r4 )
+ bf/s 1b
add #1,r4
-.L_jump:
- mov r6,r2 ! Calculate number of longwords to copy
+ ! Jump to appropriate routine depending on dest
+2: mov #3,r1
+ mov r6, r2
+ and r4,r1
shlr2 r2
- tst r2,r2
- bt .L_cleanup
-
- mov r4,r0 ! Jump to appropriate routine
- and #3,r0
- mov r0,r1
shll2 r1
mova .L_jump_tbl,r0
mov.l @(r0,r1),r1
@@ -195,43 +186,97 @@ EX( mov.b r0,@r4 )
.long .L_dest10
.long .L_dest11
+/*
+ * Come here if there are less than 12 bytes to copy
+ *
+ * Keep the branch target close, so the bf/s callee doesn't overflow
+ * and result in a more expensive branch being inserted. This is the
+ * fast-path for small copies, the jump via the jump table will hit the
+ * default slow-path cleanup. -PFM.
+ */
+.L_cleanup_loop_no_pop:
+ tst r6,r6 ! Check explicitly for zero
+ bt 1f
+
+2:
+EX( mov.b @r5+,r0 )
+ dt r6
+EX( mov.b r0,@r4 )
+ bf/s 2b
+ add #1,r4
+
+1: mov #0,r0 ! normal return
+5000:
+
+# Exception handler:
+.section .fixup, "ax"
+6000:
+ mov.l 8000f,r1
+ mov r3,r0
+ jmp @r1
+ sub r4,r0
+ .align 2
+8000: .long 5000b
+
+.previous
+ rts
+ nop
+
! Destination = 00
.L_dest00:
- mov r2,r7
- shlr2 r7
- shlr r7
- tst r7,r7
- mov #7,r0
- bt/s 1f
- and r0,r2
- .align 2
+ ! Skip the large copy for small transfers
+ mov #(32+32-4), r0
+ cmp/gt r6, r0 ! r0 (60) > r6 (len)
+ bt 1f
+
+ ! Align dest to a 32 byte boundary
+ neg r4,r0
+ add #0x20, r0
+ and #0x1f, r0
+ tst r0, r0
+ bt 2f
+
+ sub r0, r6
+ shlr2 r0
+3:
+EX( mov.l @r5+,r1 )
+ dt r0
+EX( mov.l r1,@r4 )
+ bf/s 3b
+ add #4,r4
+
2:
EX( mov.l @r5+,r0 )
+EX( mov.l @r5+,r1 )
+EX( mov.l @r5+,r2 )
+EX( mov.l @r5+,r7 )
EX( mov.l @r5+,r8 )
EX( mov.l @r5+,r9 )
EX( mov.l @r5+,r10 )
-EX( mov.l r0,@r4 )
-EX( mov.l r8,@(4,r4) )
-EX( mov.l r9,@(8,r4) )
-EX( mov.l r10,@(12,r4) )
-EX( mov.l @r5+,r0 )
-EX( mov.l @r5+,r8 )
-EX( mov.l @r5+,r9 )
-EX( mov.l @r5+,r10 )
- dt r7
-EX( mov.l r0,@(16,r4) )
-EX( mov.l r8,@(20,r4) )
-EX( mov.l r9,@(24,r4) )
-EX( mov.l r10,@(28,r4) )
+EX( mov.l @r5+,r11 )
+EX( movca.l r0,@r4 )
+ add #-32, r6
+EX( mov.l r1,@(4,r4) )
+ mov #32, r0
+EX( mov.l r2,@(8,r4) )
+ cmp/gt r6, r0 ! r0 (32) > r6 (len)
+EX( mov.l r7,@(12,r4) )
+EX( mov.l r8,@(16,r4) )
+EX( mov.l r9,@(20,r4) )
+EX( mov.l r10,@(24,r4) )
+EX( mov.l r11,@(28,r4) )
bf/s 2b
add #32,r4
- tst r2,r2
+
+1: mov r6, r0
+ shlr2 r0
+ tst r0, r0
bt .L_cleanup
1:
-EX( mov.l @r5+,r0 )
- dt r2
-EX( mov.l r0,@r4 )
+EX( mov.l @r5+,r1 )
+ dt r0
+EX( mov.l r1,@r4 )
bf/s 1b
add #4,r4
@@ -250,7 +295,7 @@ EX( mov.l r0,@r4 )
and r0,r2
2:
dt r7
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
EX( mov.l @r5+,r0 )
EX( mov.l @r5+,r1 )
EX( mov.l @r5+,r8 )
@@ -320,7 +365,7 @@ EX( mov.w r0,@(2,r4) )
1: ! Read longword, write two words per iteration
EX( mov.l @r5+,r0 )
dt r2
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
EX( mov.w r0,@r4 )
shlr16 r0
EX( mov.w r0,@(2,r4) )
@@ -342,7 +387,7 @@ EX( mov.w r0,@r4 )
! Read longword, write byte, word, byte per iteration
EX( mov.l @r5+,r0 )
dt r2
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
EX( mov.b r0,@r4 )
shlr8 r0
add #1,r4
@@ -379,6 +424,7 @@ EX( mov.b r0,@r4 )
.L_exit:
mov #0,r0 ! normal return
+
5000:
# Exception handler:
@@ -394,5 +440,6 @@ EX( mov.b r0,@r4 )
.previous
mov.l @r15+,r8
mov.l @r15+,r9
+ mov.l @r15+,r10
rts
- mov.l @r15+,r10
+ mov.l @r15+,r11
OpenPOWER on IntegriCloud