powerpc: Add 64bit optimised memcmp
I noticed ksm spending quite a lot of time in memcmp on a large KVM box. The current memcmp loop is very unoptimised - byte at a time compares with no loop unrolling. We can do much much better. Optimise the loop in a few ways: - Unroll the byte at a time loop - For large (at least 32 byte) comparisons that are also 8 byte aligned, use an unrolled modulo scheduled loop using 8 byte loads. This is similar to our glibc memcmp. A simple microbenchmark testing 10000000 iterations of an 8192 byte memcmp was used to measure the performance: baseline: 29.93 s modified: 1.70 s Just over 17x faster. v2: Incorporated some suggestions from Segher: - Use andi. instead of rldicl. - Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare and was a relic from a previous version. - Don't use cr5, we have plans to use that CR field for fast local atomics. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
parent
a113de373b
commit
15c2d45d17
|
@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o
|
|||
|
||||
obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
|
||||
usercopy_64.o mem_64.o hweight_64.o \
|
||||
copyuser_power7.o string_64.o copypage_power7.o
|
||||
copyuser_power7.o string_64.o copypage_power7.o \
|
||||
memcmp_64.o
|
||||
ifeq ($(CONFIG_GENERIC_CSUM),)
|
||||
obj-y += checksum_$(CONFIG_WORD_SIZE).o
|
||||
obj-$(CONFIG_PPC64) += checksum_wrappers_64.o
|
||||
|
|
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
* Author: Anton Blanchard <anton@au.ibm.com>
|
||||
* Copyright 2015 IBM Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
#include <asm/ppc_asm.h>
|
||||
|
||||
/*
 * Register aliases.
 *
 * idx8/idx16/idx24 hold the constant byte offsets used by the indexed
 * doubleword loads.  They are only meaningful on the doubleword path;
 * r6 is also used as a plain scratch register on entry.
 */
#define idx8	r6
#define idx16	r7
#define idx24	r8

/* Byte-at-a-time path: the two bytes being compared and their difference. */
#define byte1	r9
#define byte2	r10
#define delta	r11

/*
 * Doubleword path: lhsN is loaded via r3 (first buffer), rhsN via r4
 * (second buffer).  N is the doubleword index inside a 32-byte group.
 * rhs1..rhs3 live in r27-r31, which this function saves and restores
 * itself around the doubleword path.
 */
#define lhs0	r9
#define rhs0	r10
#define lhs1	r11
#define rhs1	r27
#define lhs2	r28
#define rhs2	r29
#define lhs3	r30
#define rhs3	r31

/*
 * 8-byte indexed load.  Little-endian builds use the byte-reversed form
 * so that an unsigned doubleword compare orders the same way as a
 * byte-by-byte compare in memory order.
 */
#ifndef __LITTLE_ENDIAN__
#define LD8	ldx
#else
#define LD8	ldbrx
#endif

/*
 * Spill/reload r27-r31 at negative offsets from r1.  No stack frame is
 * allocated.
 * NOTE(review): this relies on the area just below r1 not being
 * clobbered while memcmp runs - confirm against the ABI/kernel rules.
 */
.macro MEMCMP_SAVE_REGS
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
.endm

.macro MEMCMP_RESTORE_REGS
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
.endm

/*
 * memcmp
 *
 * In:   r3 = first buffer, r4 = second buffer, r5 = length in bytes
 * Out:  r3 = 0 when the buffers match over the whole length.
 *       On a mismatch found by the byte loop, r3 is the difference
 *       (byte from r3 buffer) - (byte from r4 buffer).
 *       On a mismatch found by the doubleword loop, r3 is 1 or -1
 *       according to an unsigned compare of the differing doublewords.
 * Uses: r0, r5-r11, ctr, cr0, cr1, cr6, cr7; r27-r31 are saved and
 *       restored by the doubleword path.
 */
_GLOBAL(memcmp)
	cmpdi	cr6,r5,31		/* cr6: is length above 31? */
	or	r6,r3,r4
	cmpdi	cr1,r5,0		/* cr1: is length zero? */
	andi.	r6,r6,7			/* cr0.eq only if BOTH pointers are 8B aligned */

	beq	cr1,.Lequal		/* nothing to compare */
	bne	.Lbytewise		/* either pointer unaligned: byte loop */
	bgt	cr6,.Ldwords		/* aligned and at least 32 bytes */
	/* aligned but shorter than 32 bytes: fall into the byte loop */

/*
 * Byte loop, unrolled four times.  ctr counts the bytes still to be
 * compared; it is decremented once per byte (three bdz + one bdnz).
 */
.Lbytewise:
	mtctr	r5

.Lbyte_loop:
	lbz	byte1,0(r3)
	lbz	byte2,0(r4)
	subf.	delta,byte2,byte1
	bne	.Ldiffer
	bdz	.Lequal

	lbz	byte1,1(r3)
	lbz	byte2,1(r4)
	subf.	delta,byte2,byte1
	bne	.Ldiffer
	bdz	.Lequal

	lbz	byte1,2(r3)
	lbz	byte2,2(r4)
	subf.	delta,byte2,byte1
	bne	.Ldiffer
	bdz	.Lequal

	lbz	byte1,3(r3)
	lbz	byte2,3(r4)
	subf.	delta,byte2,byte1
	bne	.Ldiffer

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	.Lbyte_loop
	/* count exhausted exactly on a 4-byte boundary: buffers are equal */

.Lequal:
	li	r3,0
	blr

.Ldiffer:
	mr	r3,delta		/* return the byte difference */
	blr

/*
 * Doubleword path: both pointers 8B aligned, length > 31.
 * ctr = number of whole 32-byte groups, r5 = leftover bytes (0..31)
 * which are handed to the byte loop afterwards.
 */
.Ldwords:
	MEMCMP_SAVE_REGS

	srdi	r0,r5,5			/* r0 = length / 32 */
	andi.	r5,r5,31		/* r5 = length % 32 (cr0 is rewritten below) */
	mtctr	r0

	li	idx8,8
	li	idx16,16
	li	idx24,24

	/* Prime the pipeline: load group 0 and start its first compare. */
	LD8	lhs0,0,r3
	LD8	rhs0,0,r4

	LD8	lhs1,idx8,r3
	LD8	rhs1,idx8,r4

	LD8	lhs2,idx16,r3
	LD8	rhs2,idx16,r4

	LD8	lhs3,idx24,r3
	LD8	rhs3,idx24,r4
	cmpld	cr0,lhs0,rhs0

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lone_group		/* only a single 32-byte group */

	/*
	 * Load group 1 while finishing the compares of group 0.  Each
	 * compare is issued just before its registers are overwritten by
	 * the next group's load.
	 */
	LD8	lhs0,0,r3
	LD8	rhs0,0,r4
	cmpld	cr1,lhs1,rhs1

	LD8	lhs1,idx8,r3
	LD8	rhs1,idx8,r4
	cmpld	cr6,lhs2,rhs2

	LD8	lhs2,idx16,r3
	LD8	rhs2,idx16,r4
	cmpld	cr7,lhs3,rhs3
	bne	cr0,.Ldiff_cr0

	LD8	lhs3,idx24,r3
	LD8	rhs3,idx24,r4
	cmpld	cr0,lhs0,rhs0
	bne	cr1,.Ldiff_cr1

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Ldrain			/* exactly two groups */

	.balign	16

	/*
	 * Steady state: every load pair for the new group is followed by
	 * a compare of the previous group's next pair and a branch on a
	 * compare result produced one step earlier.
	 */
.Ldword_loop:
	LD8	lhs0,0,r3
	LD8	rhs0,0,r4
	cmpld	cr1,lhs1,rhs1
	bne	cr6,.Ldiff_cr6

	LD8	lhs1,idx8,r3
	LD8	rhs1,idx8,r4
	cmpld	cr6,lhs2,rhs2
	bne	cr7,.Ldiff_cr7

	LD8	lhs2,idx16,r3
	LD8	rhs2,idx16,r4
	cmpld	cr7,lhs3,rhs3
	bne	cr0,.Ldiff_cr0

	LD8	lhs3,idx24,r3
	LD8	rhs3,idx24,r4
	cmpld	cr0,lhs0,rhs0
	bne	cr1,.Ldiff_cr1

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	.Ldword_loop

	/* Drain: no more loads, finish the compares still in flight. */
.Ldrain:
	cmpld	cr1,lhs1,rhs1
	bne	cr6,.Ldiff_cr6

	cmpld	cr6,lhs2,rhs2
	bne	cr7,.Ldiff_cr7

	cmpld	cr7,lhs3,rhs3
	bne	cr0,.Ldiff_cr0

	bne	cr1,.Ldiff_cr1
	bne	cr6,.Ldiff_cr6
	bne	cr7,.Ldiff_cr7

	/* All whole groups matched: compare the 0..31 leftover bytes. */
.Lleftover:
	MEMCMP_RESTORE_REGS

	cmpdi	r5,0
	beq	.Lequal
	b	.Lbytewise

	/* Single group: cr0 is already set, compare the other three pairs. */
.Lone_group:
	cmpld	cr1,lhs1,rhs1
	cmpld	cr6,lhs2,rhs2
	cmpld	cr7,lhs3,rhs3

	bne	cr0,.Ldiff_cr0
	bne	cr1,.Ldiff_cr1
	bne	cr6,.Ldiff_cr6
	bne	cr7,.Ldiff_cr7

	b	.Lleftover

/*
 * Mismatch exits, one per CR field.  Only the recorded compare result
 * is consulted; the data registers may already hold the next group.
 * Result is 1 when the r3-side doubleword compared greater, else -1.
 */
.Ldiff_cr0:
	li	r3,1
	bgt	cr0,.Lrestore_ret
	li	r3,-1
	b	.Lrestore_ret

.Ldiff_cr1:
	li	r3,1
	bgt	cr1,.Lrestore_ret
	li	r3,-1
	b	.Lrestore_ret

.Ldiff_cr6:
	li	r3,1
	bgt	cr6,.Lrestore_ret
	li	r3,-1
	b	.Lrestore_ret

.Ldiff_cr7:
	li	r3,1
	bgt	cr7,.Lrestore_ret
	li	r3,-1
	/* fall through */

.Lrestore_ret:
	MEMCMP_RESTORE_REGS
	blr
|
|
@ -93,6 +93,7 @@ _GLOBAL(strlen)
|
|||
subf r3,r3,r4
|
||||
blr
|
||||
|
||||
#ifdef CONFIG_PPC32
|
||||
_GLOBAL(memcmp)
|
||||
PPC_LCMPI 0,r5,0
|
||||
beq- 2f
|
||||
|
@ -106,6 +107,7 @@ _GLOBAL(memcmp)
|
|||
blr
|
||||
2: li r3,0
|
||||
blr
|
||||
#endif
|
||||
|
||||
_GLOBAL(memchr)
|
||||
PPC_LCMPI 0,r5,0
|
||||
|
|
Loading…
Reference in New Issue