Correct cacheline size to 32-bytes for ppc405 memset.S (bug 14595).

This patch also creates a version of memset.S for the ppc476 processor
which uses a 128-byte cacheline size for dcbz insns.
This commit is contained in:
Ryan S. Arnold 2012-10-30 17:07:18 -05:00
parent 9f45bfe790
commit 09dec6c37e
4 changed files with 171 additions and 8 deletions

4
NEWS
View File

@ -16,8 +16,8 @@ Version 2.17
14303, 14307, 14328, 14331, 14336, 14337, 14347, 14349, 14376, 14417,
14459, 14476, 14477, 14505, 14510, 14516, 14518, 14519, 14530, 14532,
14538, 14543, 14544, 14545, 14557, 14562, 14568, 14576, 14579, 14583,
14587, 14602, 14621, 14638, 14645, 14648, 14652, 14660, 14661, 14683,
14694, 14716, 14743, 14767, 14783.
14587, 14595, 14602, 14621, 14638, 14645, 14648, 14652, 14660, 14661,
14683, 14694, 14716, 14743, 14767, 14783.
* Support for STT_GNU_IFUNC symbols added for s390 and s390x.
Optimized versions of memcpy, memset, and memcmp added for System z10 and

View File

@ -1,3 +1,12 @@
2012-09-25 Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Ryan S. Arnold <rsa@linux.vnet.ibm.com>
[BZ #14595]
* sysdeps/powerpc/powerpc32/476/memset.S: New file copied from
405/memset.S to preserve 128-byte cacheline size.
* sysdeps/powerpc/powerpc32/405/memset.S (memset): Fix cacheline size
to 32-bytes for 405, 440, and 464 processors.
2012-10-19 Roland McGrath <roland@hack.frob.com>
* sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/nptl/libc.abilist

View File

@ -1,5 +1,5 @@
/* Optimized memset implementation for PowerPC476.
Copyright (C) 2010 Free Software Foundation, Inc.
/* Optimized memset for PowerPC405,440,464 (32-byte cacheline).
Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@ -104,7 +104,7 @@ L(use_dcbz):
add r3,r3,r7
L(skip_string_loop):
clrlwi r8,r6,25
clrlwi r8,r6,27
srwi. r8,r8,4
beq L(dcbz_pre_loop)
mtctr r8
@ -119,14 +119,14 @@ L(word_loop):
bdnz L(word_loop)
L(dcbz_pre_loop):
srwi r6,r5,7
srwi r6,r5,5
mtctr r6
addi r7,0,0
L(dcbz_loop):
dcbz r3,r7
addi r3,r3,0x80
subi r5,r5,0x80
addi r3,r3,0x20
subi r5,r5,0x20
bdnz L(dcbz_loop)
srwi. r6,r5,4
beq L(postword2_count_loop)

View File

@ -0,0 +1,154 @@
/* Optimized memset for PowerPC476 (128-byte cacheline).
Copyright (C) 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>
/* memset
r3: destination address on entry; return value (original destination) on exit
r4: source integer to copy (only the low byte is used as the fill value)
r5: byte count
r11: fill byte replicated into all 32 bits of the register
r12: saved copy of the destination address, restored into r3 on return
Save the destination address in r12.
If the destination is unaligned and the count is greater than 255 bytes,
set 0-3 bytes to make the destination word aligned.
If the count is greater than 255 bytes and we are setting memory to zero,
use dcbz to clear memory where we can;
otherwise do the following:
If 16 or more bytes remain, use the 16-byte (4-word) store loop.
Finally, set the remaining 0-15 bytes with a string store. */
/* Entry: r3 = destination, r4 = fill value, r5 = byte count.
   Exit:  r3 = original destination.
   Scratch registers written here: r6-r12, CTR, XER, CR0.
   Counts above 255 with r4 == 0 take the dcbz path, which advances
   0x80 (128) bytes per dcbz, i.e. it assumes a 128-byte cache block.  */
EALIGN (BP_SYM (memset), 5, 0)
/* Build the fill word: r11 = low byte of r4 replicated into all 4 bytes.  */
rlwinm r11,r4,0,24,31
rlwimi r11,r4,8,16,23
rlwimi r11,r11,16,0,15
/* Keep the original destination so it can be returned at L(end_memset).  */
addi r12,r3,0
/* Counts of 255 or less skip both the alignment prologue and dcbz.
   NOTE(review): cmpwi is a signed compare, so a count >= 0x80000000 also
   takes this branch and is handled by the plain store loop.  */
cmpwi r5,0x00FF
ble L(preword8_count_loop)
/* The dcbz path is taken only when the whole of r4 is zero (the test is
   on the full register, not just the low byte).  */
cmpwi r4,0x00
beq L(use_dcbz)
/* r6 = (-r3) & 3 = number of bytes needed to reach word alignment.  */
neg r6,r3
clrlwi. r6,r6,30
beq L(preword8_count_loop)
/* r8 = 1 is the per-byte decrement applied to the count below.  */
addi r8,0,1
mtctr r6
/* Bias r3 by -1 because stbu stores at r3+1 and then updates r3.  */
subi r3,r3,1
L(unaligned_bytecopy_loop):
stbu r11,0x1(r3)
/* r5 = r5 - 1; leave early if the count is exhausted.  */
subf. r5,r8,r5
beq L(end_memset)
bdnz L(unaligned_bytecopy_loop)
/* Undo the -1 bias so r3 points at the next byte to write.  */
addi r3,r3,1
L(preword8_count_loop):
/* r6 = r5 / 16 = number of 16-byte chunks; none means tail only.  */
srwi. r6,r5,4
beq L(preword2_count_loop)
mtctr r6
/* Bias r3 by -4 because stwu stores at r3+4 and then updates r3.  */
addi r3,r3,-4
/* r8-r11 all hold the fill word.  */
mr r8,r11
mr r9,r11
mr r10,r11
/* Each iteration stores four words (16 bytes) and drops r5 by 16.  */
L(word8_count_loop_no_dcbt):
stwu r8,4(r3)
stwu r9,4(r3)
subi r5,r5,0x10
stwu r10,4(r3)
stwu r11,4(r3)
bdnz L(word8_count_loop_no_dcbt)
/* Undo the -4 bias so r3 points at the next byte to write.  */
addi r3,r3,4
L(preword2_count_loop):
/* r7 = r5 & 15 = trailing byte count; nothing left means we are done.  */
clrlwi. r7,r5,28
beq L(end_memset)
/* stswx takes its byte count from XER and its data from r8 onwards,
   so r8-r11 are loaded with the fill word first.  */
mr r8,r11
mr r9,r11
mr r10,r11
mtxer r7
stswx r8,0,r3
L(end_memset):
/* Return the original destination pointer.  */
addi r3,r12,0
blr
/* Zero-fill path for counts above 255 (r11 is zero here since r4 is).  */
L(use_dcbz):
/* r6 = -r3; r7 = r6 & 15 = bytes needed to reach a 16-byte boundary.  */
neg r6,r3
clrlwi. r7,r6,28
beq L(skip_string_loop)
/* Store those r7 bytes with one string store, then advance r3 and
   reduce the count by the same amount.  */
mr r8,r11
mr r9,r11
mr r10,r11
subf r5,r7,r5
mtxer r7
stswx r8,0,r3
add r3,r3,r7
L(skip_string_loop):
/* r8 = ((-original r3) & 127) >> 4 = number of 16-byte chunks between
   the 16-byte boundary just reached and the next 128-byte boundary.  */
clrlwi r8,r6,25
srwi. r8,r8,4
beq L(dcbz_pre_loop)
mtctr r8
/* Store 16 bytes per iteration until r3 is 128-byte aligned.  */
L(word_loop):
stw r11,0(r3)
subi r5,r5,0x10
stw r11,4(r3)
stw r11,8(r3)
stw r11,12(r3)
addi r3,r3,0x10
bdnz L(word_loop)
L(dcbz_pre_loop):
/* r6 = r5 / 128 = number of whole 128-byte blocks to clear.  CTR is not
   checked for zero: this path starts with r5 >= 256 and the alignment
   steps above consume at most 15 + 112 = 127 bytes, so r6 >= 1.  */
srwi r6,r5,7
mtctr r6
/* r7 = 0 is the index operand for dcbz, so the address is just r3.  */
addi r7,0,0
L(dcbz_loop):
dcbz r3,r7
addi r3,r3,0x80
subi r5,r5,0x80
bdnz L(dcbz_loop)
/* r6 = r5 / 16 = 16-byte chunks left after the last cleared block.  */
srwi. r6,r5,4
beq L(postword2_count_loop)
mtctr r6
/* Store 16 bytes per iteration.  */
L(postword8_count_loop):
stw r11,0(r3)
subi r5,r5,0x10
stw r11,4(r3)
stw r11,8(r3)
stw r11,12(r3)
addi r3,r3,0x10
bdnz L(postword8_count_loop)
L(postword2_count_loop):
/* r7 = r5 & 15 = trailing byte count, written with one string store.  */
clrlwi. r7,r5,28
beq L(end_memset)
mr r8,r11
mr r9,r11
mr r10,r11
mtxer r7
stswx r8,0,r3
b L(end_memset)
END (BP_SYM (memset))
libc_hidden_builtin_def (memset)