summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnton Blanchard <anton@samba.org>2015-01-21 12:27:38 +1100
committerMichael Ellerman <mpe@ellerman.id.au>2015-01-23 14:02:55 +1100
commit15c2d45d17418cc4a712608c78ff3b5f0583d83b (patch)
tree53e4ee00f5e0b604ee7451ee6e229751043ae0f6
parenta113de373bcb7651196e29a49483c8e24e1e6aa9 (diff)
downloadop-kernel-dev-15c2d45d17418cc4a712608c78ff3b5f0583d83b.zip
op-kernel-dev-15c2d45d17418cc4a712608c78ff3b5f0583d83b.tar.gz
powerpc: Add 64bit optimised memcmp
I noticed ksm spending quite a lot of time in memcmp on a large KVM box. The current memcmp loop is very unoptimised - byte at a time compares with no loop unrolling. We can do much much better. Optimise the loop in a few ways: - Unroll the byte at a time loop - For large (at least 32 byte) comparisons that are also 8 byte aligned, use an unrolled modulo scheduled loop using 8 byte loads. This is similar to our glibc memcmp. A simple microbenchmark testing 10000000 iterations of an 8192 byte memcmp was used to measure the performance: baseline: 29.93 s modified: 1.70 s Just over 17x faster. v2: Incorporated some suggestions from Segher: - Use andi. instead of rdlicl. - Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare and was a relic from a previous version. - Don't use cr5, we have plans to use that CR field for fast local atomics. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r--arch/powerpc/lib/Makefile3
-rw-r--r--arch/powerpc/lib/memcmp_64.S233
-rw-r--r--arch/powerpc/lib/string.S2
3 files changed, 237 insertions, 1 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 1b01159..5526156 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o
obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
usercopy_64.o mem_64.o hweight_64.o \
- copyuser_power7.o string_64.o copypage_power7.o
+ copyuser_power7.o string_64.o copypage_power7.o \
+ memcmp_64.o
ifeq ($(CONFIG_GENERIC_CSUM),)
obj-y += checksum_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC64) += checksum_wrappers_64.o
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
new file mode 100644
index 0000000..8953d23
--- /dev/null
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -0,0 +1,233 @@
+/*
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+
+#define off8 r6
+#define off16 r7
+#define off24 r8
+
+#define rA r9
+#define rB r10
+#define rC r11
+#define rD r27
+#define rE r28
+#define rF r29
+#define rG r30
+#define rH r31
+
+#ifdef __LITTLE_ENDIAN__
+#define LD ldbrx
+#else
+#define LD ldx
+#endif
+
+_GLOBAL(memcmp)
+ cmpdi cr1,r5,0
+
+ /* Use the short loop if both strings are not 8B aligned */
+ or r6,r3,r4
+ andi. r6,r6,7
+
+ /* Use the short loop if length is less than 32B */
+ cmpdi cr6,r5,31
+
+ beq cr1,.Lzero
+ bne .Lshort
+ bgt cr6,.Llong
+
+.Lshort:
+ mtctr r5
+
+1: lbz rA,0(r3)
+ lbz rB,0(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,1(r3)
+ lbz rB,1(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,2(r3)
+ lbz rB,2(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,3(r3)
+ lbz rB,3(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+
+ addi r3,r3,4
+ addi r4,r4,4
+
+ bdnz 1b
+
+.Lzero:
+ li r3,0
+ blr
+
+.Lnon_zero:
+ mr r3,rC
+ blr
+
+.Llong:
+ li off8,8
+ li off16,16
+ li off24,24
+
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+
+ srdi r0,r5,5
+ mtctr r0
+ andi. r5,r5,31
+
+ LD rA,0,r3
+ LD rB,0,r4
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lfirst32
+
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr6,rE,rF
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lsecond32
+
+ .balign 16
+
+1: LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+ bne cr6,.LcmpEF
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr6,rE,rF
+ bne cr7,.LcmpGH
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdnz 1b
+
+.Lsecond32:
+ cmpld cr1,rC,rD
+ bne cr6,.LcmpEF
+
+ cmpld cr6,rE,rF
+ bne cr7,.LcmpGH
+
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ bne cr1,.LcmpCD
+ bne cr6,.LcmpEF
+ bne cr7,.LcmpGH
+
+.Ltail:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+
+ cmpdi r5,0
+ beq .Lzero
+ b .Lshort
+
+.Lfirst32:
+ cmpld cr1,rC,rD
+ cmpld cr6,rE,rF
+ cmpld cr7,rG,rH
+
+ bne cr0,.LcmpAB
+ bne cr1,.LcmpCD
+ bne cr6,.LcmpEF
+ bne cr7,.LcmpGH
+
+ b .Ltail
+
+.LcmpAB:
+ li r3,1
+ bgt cr0,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpCD:
+ li r3,1
+ bgt cr1,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpEF:
+ li r3,1
+ bgt cr6,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpGH:
+ li r3,1
+ bgt cr7,.Lout
+ li r3,-1
+
+.Lout:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ blr
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 1b5a0a0..c80fb49 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -93,6 +93,7 @@ _GLOBAL(strlen)
subf r3,r3,r4
blr
+#ifdef CONFIG_PPC32
_GLOBAL(memcmp)
PPC_LCMPI 0,r5,0
beq- 2f
@@ -106,6 +107,7 @@ _GLOBAL(memcmp)
blr
2: li r3,0
blr
+#endif
_GLOBAL(memchr)
PPC_LCMPI 0,r5,0
OpenPOWER on IntegriCloud