PowerPC LE strchr

http://sourceware.org/ml/libc-alpha/2013-08/msg00101.html

Adds little-endian support to the optimised strchr assembly.  I've also
tweaked the big-endian code a little.  In power7/strchr.S the tail of
the function checks that a null byte was not matched before the first
match of c; this was done by comparing leading zero counts.  It's just
as valid, and quicker, to compare the raw output from cmpb.

Another little tweak is to use rldimi/insrdi in place of rlwimi for
the power7 strchr functions.  Since rlwimi is cracked, it is a few
cycles slower.  rldimi can be used on the 32-bit power7 functions
too.

	* sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
	support.  Correct typos, formatting.  Optimize tail.  Use insrdi
	rather than rlwimi.
	* sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
	little-endian support.  Correct typos.
	* sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise.  Use insrdi
	rather than rlwimi.
	* sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define.  Use
	in loop and entry code to keep "and." results.
	(strchr): Add little-endian support.  Comment.  Move cntlzd
	earlier in tail.
	* sysdeps/powerpc/powerpc32/strchr.S: Likewise.
This commit is contained in:
Alan Modra 2013-08-17 18:46:05 +09:30
parent 43b8401371
commit 664318c3eb
7 changed files with 228 additions and 74 deletions

View File

@ -1,3 +1,19 @@
2013-10-04 Alan Modra <amodra@gmail.com>
* sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
support. Correct typos, formatting. Optimize tail. Use insrdi
rather than rlwimi.
* sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
* sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
little-endian support. Correct typos.
* sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise. Use insrdi
rather than rlwimi.
* sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define. Use
in loop and entry code to keep "and." results.
(strchr): Add little-endian support. Comment. Move cntlzd
earlier in tail.
* sysdeps/powerpc/powerpc32/strchr.S: Likewise.
2013-10-04 Alan Modra <amodra@gmail.com>
* sysdeps/powerpc/powerpc64/strcpy.S: Add little-endian support:

View File

@ -35,8 +35,8 @@ ENTRY (strchr)
beq cr7,L(null_match)
/* Replicate byte to word. */
rlwimi r4,r4,8,16,23
rlwimi r4,r4,16,0,15
insrdi r4,r4,8,48
insrdi r4,r4,16,32
/* Now r4 has a word of c bytes and r0 has
a word of null bytes. */
@ -46,11 +46,17 @@ ENTRY (strchr)
/* Move the words left and right to discard the bits that are
not part of the string and to bring them back as zeros. */
#ifdef __LITTLE_ENDIAN__
srw r10,r10,r6
srw r11,r11,r6
slw r10,r10,r6
slw r11,r11,r6
#else
slw r10,r10,r6
slw r11,r11,r6
srw r10,r10,r6
srw r11,r11,r6
#endif
or r5,r10,r11 /* OR the results to speed things up. */
cmpwi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@ -65,7 +71,7 @@ ENTRY (strchr)
/* Handle WORD2 of pair. */
lwzu r12,4(r8)
cmpb r10,r12,r4
cmpb r10,r12,r4
cmpb r11,r12,r0
or r5,r10,r11
cmpwi cr7,r5,0
@ -100,22 +106,31 @@ L(loop):
bne cr6,L(done)
/* The c/null byte must be in the second word. Adjust the address
again and move the result of cmpb to r10 so we can calculate the
pointer. */
again and move the result of cmpb to r10/r11 so we can calculate
the pointer. */
mr r10,r6
mr r11,r7
addi r8,r8,4
/* r5 has the output of the cmpb instruction, that is, it contains
/* r10/r11 have the output of the cmpb instructions, that is,
0xff in the same position as the c/null byte in the original
word from the string. Use that to calculate the pointer. */
L(done):
cntlzw r4,r10 /* Count leading zeroes before c matches. */
cntlzw r0,r11 /* Count leading zeroes before null matches. */
cmplw cr7,r4,r0
#ifdef __LITTLE_ENDIAN__
addi r3,r10,-1
andc r3,r3,r10
popcntw r0,r3
addi r4,r11,-1
andc r4,r4,r11
cmplw cr7,r3,r4
bgt cr7,L(no_match)
srwi r0,r4,3 /* Convert leading zeroes to bytes. */
#else
cntlzw r0,r10 /* Count leading zeros before c matches. */
cmplw cr7,r11,r10
bgt cr7,L(no_match)
#endif
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching c byte
or null in case c was not found. */
blr
@ -133,10 +148,14 @@ L(null_match):
cmpb r5,r12,r0 /* Compare each byte against null bytes. */
/* Move the words left and right to discard the bits that are
not part of the string and to bring them back as zeros. */
not part of the string and bring them back as zeros. */
#ifdef __LITTLE_ENDIAN__
srw r5,r5,r6
slw r5,r5,r6
#else
slw r5,r5,r6
srw r5,r5,r6
#endif
cmpwi cr7,r5,0 /* If r5 == 0, no null bytes
have been found. */
bne cr7,L(done_null)
@ -191,7 +210,13 @@ L(loop_null):
0xff in the same position as the null byte in the original
word from the string. Use that to calculate the pointer. */
L(done_null):
#ifdef __LITTLE_ENDIAN__
addi r0,r5,-1
andc r0,r0,r5
popcntw r0,r0
#else
cntlzw r0,r5 /* Count leading zeros before the match. */
#endif
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching null byte. */
blr

View File

@ -27,8 +27,8 @@ ENTRY (__strchrnul)
clrrwi r8,r3,2 /* Align the address to word boundary. */
/* Replicate byte to word. */
rlwimi r4,r4,8,16,23
rlwimi r4,r4,16,0,15
insrdi r4,r4,8,48
insrdi r4,r4,16,32
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
@ -43,10 +43,17 @@ ENTRY (__strchrnul)
/* Move the words left and right to discard the bits that are
not part of the string and bring them back as zeros. */
#ifdef __LITTLE_ENDIAN__
srw r10,r10,r6
srw r9,r9,r6
slw r10,r10,r6
slw r9,r9,r6
#else
slw r10,r10,r6
slw r9,r9,r6
srw r10,r10,r6
srw r9,r9,r6
#endif
or r5,r9,r10 /* OR the results to speed things up. */
cmpwi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@ -54,7 +61,7 @@ ENTRY (__strchrnul)
mtcrf 0x01,r8
/* Are we now aligned to a quadword boundary? If so, skip to
/* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
bt 29,L(loop)
@ -76,7 +83,7 @@ L(loop):
single register for speed. This is an attempt
to speed up the null-checking process for bigger strings. */
lwz r12,4(r8)
lwzu r11,8(r8)
lwzu r11,8(r8)
cmpb r10,r12,r0
cmpb r9,r12,r4
cmpb r6,r11,r0
@ -95,9 +102,9 @@ L(loop):
addi r8,r8,-4
bne cr6,L(done)
/* The c/null byte must be in the second word. Adjust the
address again and move the result of cmpb to r10 so we can calculate
the pointer. */
/* The c/null byte must be in the second word. Adjust the address
again and move the result of cmpb to r5 so we can calculate the
pointer. */
mr r5,r10
addi r8,r8,4
@ -105,7 +112,13 @@ L(loop):
0xff in the same position as the c/null byte in the original
word from the string. Use that to calculate the pointer. */
L(done):
#ifdef __LITTLE_ENDIAN__
addi r0,r5,-1
andc r0,r0,r5
popcntw r0,r0
#else
cntlzw r0,r5 /* Count leading zeros before the match. */
#endif
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of matching c/null byte. */
blr

View File

@ -36,6 +36,8 @@ ENTRY (strchr)
#define rIGN r10 /* number of bits we should ignore in the first word */
#define rMASK r11 /* mask with the bits to ignore set to 0 */
#define rTMP3 r12
#define rTMP4 rIGN
#define rTMP5 rMASK
rlwimi rCHR, rCHR, 8, 16, 23
@ -49,64 +51,93 @@ ENTRY (strchr)
addi r7F7F, r7F7F, 0x7f7f
/* Test the first (partial?) word. */
lwz rWORD, 0(rSTR)
#ifdef __LITTLE_ENDIAN__
slw rMASK, rMASK, rIGN
#else
srw rMASK, rMASK, rIGN
#endif
orc rWORD, rWORD, rMASK
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
and. rTMP1, rTMP1, rTMP2
and. rTMP4, rTMP1, rTMP2
xor rTMP3, rCHR, rWORD
orc rTMP3, rTMP3, rMASK
b L(loopentry)
/* The loop. */
L(loop):lwzu rWORD, 4(rSTR)
and. rTMP1, rTMP1, rTMP2
L(loop):
lwzu rWORD, 4(rSTR)
and. rTMP5, rTMP1, rTMP2
/* Test for 0. */
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
add rTMP1, rFEFE, rWORD /* x - 0x01010101. */
nor rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080. */
bne L(foundit)
and. rTMP1, rTMP1, rTMP2
and. rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080. */
/* Start test for the bytes we're looking for. */
xor rTMP3, rCHR, rWORD
L(loopentry):
add rTMP1, rFEFE, rTMP3
nor rTMP2, r7F7F, rTMP3
beq L(loop)
/* There is a zero byte in the word, but may also be a matching byte (either
before or after the zero byte). In fact, we may be looking for a
zero byte, in which case we return a match. We guess that this hasn't
happened, though. */
L(missed):
and. rTMP1, rTMP1, rTMP2
zero byte, in which case we return a match. */
and. rTMP5, rTMP1, rTMP2
li rRTN, 0
beqlr
/* It did happen. Decide which one was first...
I'm not sure if this is actually faster than a sequence of
rotates, compares, and branches (we use it anyway because it's shorter). */
/* At this point:
rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
But there may be false matches in the next most significant byte from
a true match due to carries. This means we need to recalculate the
matches using a longer method for big-endian. */
#ifdef __LITTLE_ENDIAN__
addi rTMP1, rTMP5, -1
andc rTMP1, rTMP1, rTMP5
cntlzw rCLZB, rTMP1
addi rTMP2, rTMP4, -1
andc rTMP2, rTMP2, rTMP4
cmplw rTMP1, rTMP2
bgtlr
subfic rCLZB, rCLZB, 32-7
#else
/* I think we could reduce this by two instructions by keeping the "nor"
results from the loop for reuse here. See strlen.S tail. Similarly
one instruction could be pruned from L(foundit). */
and rFEFE, r7F7F, rWORD
or rMASK, r7F7F, rWORD
or rTMP5, r7F7F, rWORD
and rTMP1, r7F7F, rTMP3
or rIGN, r7F7F, rTMP3
or rTMP4, r7F7F, rTMP3
add rFEFE, rFEFE, r7F7F
add rTMP1, rTMP1, r7F7F
nor rWORD, rMASK, rFEFE
nor rTMP2, rIGN, rTMP1
nor rWORD, rTMP5, rFEFE
nor rTMP2, rTMP4, rTMP1
cntlzw rCLZB, rTMP2
cmplw rWORD, rTMP2
bgtlr
cntlzw rCLZB, rTMP2
#endif
srwi rCLZB, rCLZB, 3
add rRTN, rSTR, rCLZB
blr
L(foundit):
#ifdef __LITTLE_ENDIAN__
addi rTMP1, rTMP5, -1
andc rTMP1, rTMP1, rTMP5
cntlzw rCLZB, rTMP1
subfic rCLZB, rCLZB, 32-7-32
srawi rCLZB, rCLZB, 3
#else
and rTMP1, r7F7F, rTMP3
or rIGN, r7F7F, rTMP3
or rTMP4, r7F7F, rTMP3
add rTMP1, rTMP1, r7F7F
nor rTMP2, rIGN, rTMP1
nor rTMP2, rTMP4, rTMP1
cntlzw rCLZB, rTMP2
subi rSTR, rSTR, 4
srwi rCLZB, rCLZB, 3
#endif
add rRTN, rSTR, rCLZB
blr
END (strchr)

View File

@ -35,8 +35,8 @@ ENTRY (strchr)
beq cr7,L(null_match)
/* Replicate byte to doubleword. */
rlwimi r4,r4,8,16,23
rlwimi r4,r4,16,0,15
insrdi r4,r4,8,48
insrdi r4,r4,16,32
insrdi r4,r4,32,0
/* Now r4 has a doubleword of c bytes and r0 has
@ -47,11 +47,17 @@ ENTRY (strchr)
/* Move the doublewords left and right to discard the bits that are
not part of the string and bring them back as zeros. */
#ifdef __LITTLE_ENDIAN__
srd r10,r10,r6
srd r11,r11,r6
sld r10,r10,r6
sld r11,r11,r6
#else
sld r10,r10,r6
sld r11,r11,r6
srd r10,r10,r6
srd r11,r11,r6
#endif
or r5,r10,r11 /* OR the results to speed things up. */
cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@ -108,15 +114,24 @@ L(loop):
mr r11,r7
addi r8,r8,8
/* r5 has the output of the cmpb instruction, that is, it contains
/* r10/r11 have the output of the cmpb instructions, that is,
0xff in the same position as the c/null byte in the original
doubleword from the string. Use that to calculate the pointer. */
L(done):
cntlzd r4,r10 /* Count leading zeroes before c matches. */
cntlzd r0,r11 /* Count leading zeroes before null matches. */
cmpld cr7,r4,r0
#ifdef __LITTLE_ENDIAN__
addi r3,r10,-1
andc r3,r3,r10
popcntd r0,r3
addi r4,r11,-1
andc r4,r4,r11
cmpld cr7,r3,r4
bgt cr7,L(no_match)
srdi r0,r4,3 /* Convert leading zeroes to bytes. */
#else
cntlzd r0,r10 /* Count leading zeros before c matches. */
cmpld cr7,r11,r10
bgt cr7,L(no_match)
#endif
srdi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching c byte
or null in case c was not found. */
blr
@ -135,9 +150,13 @@ L(null_match):
/* Move the doublewords left and right to discard the bits that are
not part of the string and bring them back as zeros. */
#ifdef __LITTLE_ENDIAN__
srd r5,r5,r6
sld r5,r5,r6
#else
sld r5,r5,r6
srd r5,r5,r6
#endif
cmpdi cr7,r5,0 /* If r5 == 0, no null bytes
have been found. */
bne cr7,L(done_null)
@ -192,7 +211,13 @@ L(loop_null):
0xff in the same position as the null byte in the original
doubleword from the string. Use that to calculate the pointer. */
L(done_null):
#ifdef __LITTLE_ENDIAN__
addi r0,r5,-1
andc r0,r0,r5
popcntd r0,r0
#else
cntlzd r0,r5 /* Count leading zeros before the match. */
#endif
srdi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching null byte. */
blr

View File

@ -27,8 +27,8 @@ ENTRY (__strchrnul)
clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
/* Replicate byte to doubleword. */
rlwimi r4,r4,8,16,23
rlwimi r4,r4,16,0,15
insrdi r4,r4,8,48
insrdi r4,r4,16,32
insrdi r4,r4,32,0
rlwinm r6,r3,3,26,28 /* Calculate padding. */
@ -44,10 +44,17 @@ ENTRY (__strchrnul)
/* Move the doublewords left and right to discard the bits that are
not part of the string and to bring them back as zeros. */
#ifdef __LITTLE_ENDIAN__
srd r10,r10,r6
srd r9,r9,r6
sld r10,r10,r6
sld r9,r9,r6
#else
sld r10,r10,r6
sld r9,r9,r6
srd r10,r10,r6
srd r9,r9,r6
#endif
or r5,r9,r10 /* OR the results to speed things up. */
cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@ -97,7 +104,7 @@ L(loop):
bne cr6,L(done)
/* The c/null byte must be in the second doubleword. Adjust the
address again and move the result of cmpb to r10 so we can calculate
address again and move the result of cmpb to r5 so we can calculate
the pointer. */
mr r5,r10
addi r8,r8,8
@ -106,7 +113,13 @@ L(loop):
0xff in the same position as the c/null byte in the original
doubleword from the string. Use that to calculate the pointer. */
L(done):
#ifdef __LITTLE_ENDIAN__
addi r0,r5,-1
andc r0,r0,r5
popcntd r0,r0
#else
cntlzd r0,r5 /* Count leading zeros before the match. */
#endif
srdi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of matching c/null byte. */
blr

View File

@ -37,11 +37,13 @@ ENTRY (strchr)
#define rIGN r10 /* number of bits we should ignore in the first word */
#define rMASK r11 /* mask with the bits to ignore set to 0 */
#define rTMP3 r12
#define rTMP4 rIGN
#define rTMP5 rMASK
dcbt 0,rRTN
rlwimi rCHR, rCHR, 8, 16, 23
insrdi rCHR, rCHR, 8, 48
li rMASK, -1
rlwimi rCHR, rCHR, 16, 0, 15
insrdi rCHR, rCHR, 16, 32
rlwinm rIGN, rRTN, 3, 26, 28
insrdi rCHR, rCHR, 32, 0
lis rFEFE, -0x101
@ -54,64 +56,93 @@ ENTRY (strchr)
add rFEFE, rFEFE, rTMP1
/* Test the first (partial?) word. */
ld rWORD, 0(rSTR)
#ifdef __LITTLE_ENDIAN__
sld rMASK, rMASK, rIGN
#else
srd rMASK, rMASK, rIGN
#endif
orc rWORD, rWORD, rMASK
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
and. rTMP1, rTMP1, rTMP2
and. rTMP4, rTMP1, rTMP2
xor rTMP3, rCHR, rWORD
orc rTMP3, rTMP3, rMASK
b L(loopentry)
/* The loop. */
L(loop):ldu rWORD, 8(rSTR)
and. rTMP1, rTMP1, rTMP2
L(loop):
ldu rWORD, 8(rSTR)
and. rTMP5, rTMP1, rTMP2
/* Test for 0. */
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
add rTMP1, rFEFE, rWORD /* x - 0x01010101. */
nor rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080. */
bne L(foundit)
and. rTMP1, rTMP1, rTMP2
and. rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080. */
/* Start test for the bytes we're looking for. */
xor rTMP3, rCHR, rWORD
L(loopentry):
add rTMP1, rFEFE, rTMP3
nor rTMP2, r7F7F, rTMP3
beq L(loop)
/* There is a zero byte in the word, but may also be a matching byte (either
before or after the zero byte). In fact, we may be looking for a
zero byte, in which case we return a match. We guess that this hasn't
happened, though. */
L(missed):
and. rTMP1, rTMP1, rTMP2
zero byte, in which case we return a match. */
and. rTMP5, rTMP1, rTMP2
li rRTN, 0
beqlr
/* It did happen. Decide which one was first...
I'm not sure if this is actually faster than a sequence of
rotates, compares, and branches (we use it anyway because it's shorter). */
/* At this point:
rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
But there may be false matches in the next most significant byte from
a true match due to carries. This means we need to recalculate the
matches using a longer method for big-endian. */
#ifdef __LITTLE_ENDIAN__
addi rTMP1, rTMP5, -1
andc rTMP1, rTMP1, rTMP5
cntlzd rCLZB, rTMP1
addi rTMP2, rTMP4, -1
andc rTMP2, rTMP2, rTMP4
cmpld rTMP1, rTMP2
bgtlr
subfic rCLZB, rCLZB, 64-7
#else
/* I think we could reduce this by two instructions by keeping the "nor"
results from the loop for reuse here. See strlen.S tail. Similarly
one instruction could be pruned from L(foundit). */
and rFEFE, r7F7F, rWORD
or rMASK, r7F7F, rWORD
or rTMP5, r7F7F, rWORD
and rTMP1, r7F7F, rTMP3
or rIGN, r7F7F, rTMP3
or rTMP4, r7F7F, rTMP3
add rFEFE, rFEFE, r7F7F
add rTMP1, rTMP1, r7F7F
nor rWORD, rMASK, rFEFE
nor rTMP2, rIGN, rTMP1
nor rWORD, rTMP5, rFEFE
nor rTMP2, rTMP4, rTMP1
cntlzd rCLZB, rTMP2
cmpld rWORD, rTMP2
bgtlr
cntlzd rCLZB, rTMP2
#endif
srdi rCLZB, rCLZB, 3
add rRTN, rSTR, rCLZB
blr
L(foundit):
#ifdef __LITTLE_ENDIAN__
addi rTMP1, rTMP5, -1
andc rTMP1, rTMP1, rTMP5
cntlzd rCLZB, rTMP1
subfic rCLZB, rCLZB, 64-7-64
sradi rCLZB, rCLZB, 3
#else
and rTMP1, r7F7F, rTMP3
or rIGN, r7F7F, rTMP3
or rTMP4, r7F7F, rTMP3
add rTMP1, rTMP1, r7F7F
nor rTMP2, rIGN, rTMP1
nor rTMP2, rTMP4, rTMP1
cntlzd rCLZB, rTMP2
subi rSTR, rSTR, 8
srdi rCLZB, rCLZB, 3
#endif
add rRTN, rSTR, rCLZB
blr
END (strchr)