183 lines
4.9 KiB
ArmAsm
183 lines
4.9 KiB
ArmAsm
/* memset.S: optimised assembly memset
|
|
*
|
|
* Copyright (C) 2003 Red Hat, Inc. All Rights Reserved.
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
|
|
|
|
.text
|
|
# Align what follows (memset's entry point) to a 2^4 = 16-byte boundary.
.p2align 4
|
|
|
|
###############################################################################
|
|
#
|
|
# void *memset(void *p, char ch, size_t count)
|
|
#
|
|
# - NOTE: must not use any stack. exception detection performs function return
|
|
# to caller's fixup routine, aborting the remainder of the set
|
|
# GR4, GR7, GR8, and GR11 must be managed
|
|
#
|
|
# Stores the low 8 bits of ch into count bytes starting at p.  GR8 is never
# written below, so it still holds p at every return.
#
# Registers, as used by the code below:
#   GR8  - p on entry (left untouched)
#   GR9  - ch on entry, masked to 8 bits
#   GR10 - count on entry (left untouched)
#   GR4  - running store address
#   GR5  - number of bytes still to be written
#   GR6  - size of the current step, subtracted from GR5
#   GR7  - index operand of the update-form stores (8, then 4, 2 and 0)
#   GR12, GR13 - fill byte replicated across both registers
#   ICC0, ICC1, ICC3, CC5, CC7 - condition state for the predicated insns
#
# Reading notes:
#   - a ".p" suffix packs an insn with the one that follows it
#   - insns ending in ",cc7,#1" only take effect while CC7 is true
#   - ICC3 is written only by the entry orcc and by the csubcc insns, so every
#     "beqlr icc3" returns once the remaining count has reached zero
#   - "cknc" after a "subicc GR5,#n" yields true when there was no carry
#     (borrow), which the code uses to mean GR5 >= n
#
###############################################################################
|
|
.globl memset,__memset_end
|
|
.type memset,@function
|
|
memset:
|
|
# Set up the working registers and return straight away when count is 0.
orcc.p gr10,gr0,gr5,icc3 ; GR5 = count
|
|
andi gr9,#0xff,gr9
|
|
or.p gr8,gr0,gr4 ; GR4 = address
|
|
beqlr icc3,#0
|
|
|
|
# conditionally write a byte to 2b-align the address
|
|
# CC7 = address is odd.  When it is, store one byte and step GR4 / GR5 by
# GR6 = 1; return when that was the last byte.
setlos.p #1,gr6
|
|
andicc gr4,#1,gr0,icc0
|
|
ckne icc0,cc7
|
|
cstb.p gr9,@(gr4,gr0) ,cc7,#1
|
|
csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
|
|
cadd.p gr4,gr6,gr4 ,cc7,#1
|
|
beqlr icc3,#0
|
|
|
|
# conditionally write a word to 4b-align the address
|
|
# (The store used is csth, i.e. 2 bytes.)  CC7 = bit 1 of the address is set
# AND CC5, where CC5 = GR5 >= 2.  Meanwhile GR12 = ch | (ch << 8).
andicc.p gr4,#2,gr0,icc0
|
|
subicc gr5,#2,gr0,icc1
|
|
setlos.p #2,gr6
|
|
ckne icc0,cc7
|
|
slli.p gr9,#8,gr12 ; need to double up the pattern
|
|
cknc icc1,cc5
|
|
or.p gr9,gr12,gr12
|
|
andcr cc7,cc5,cc7
|
|
|
|
csth.p gr12,@(gr4,gr0) ,cc7,#1
|
|
csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
|
|
cadd.p gr4,gr6,gr4 ,cc7,#1
|
|
beqlr icc3,#0
|
|
|
|
# conditionally write a dword to 8b-align the address
|
|
# (The store used is cst, i.e. 4 bytes.)  CC7 = bit 2 of the address is set
# AND CC5, where CC5 = GR5 >= 4.  Meanwhile GR12 = GR12 | (GR12 << 16), which
# puts the fill byte in all four byte lanes.
andicc.p gr4,#4,gr0,icc0
|
|
subicc gr5,#4,gr0,icc1
|
|
setlos.p #4,gr6
|
|
ckne icc0,cc7
|
|
slli.p gr12,#16,gr13 ; need to quadruple-up the pattern
|
|
cknc icc1,cc5
|
|
or.p gr13,gr12,gr12
|
|
andcr cc7,cc5,cc7
|
|
|
|
cst.p gr12,@(gr4,gr0) ,cc7,#1
|
|
csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
|
|
cadd.p gr4,gr6,gr4 ,cc7,#1
|
|
beqlr icc3,#0
|
|
|
|
# GR13 = GR12, so the GR12/GR13 pair written by each cstdu carries eight
# copies of the fill byte.
or.p gr12,gr12,gr13 ; need to octuple-up the pattern
|
|
|
|
# the address is now 8b-aligned - loop around writing 64b chunks
|
|
# ("64b" means 64 bytes: eight 8-byte stores per pass.)
# GR7 = 8 is the index of every cstdu.  GR4 is biased down by 8 because each
# update-form store writes at GR4 + GR7 and then leaves that address in GR4.
# NOTE(review): this is the first write to GR7 in memset; the three alignment
# stores above index with GR0 and leave GR7 as the caller passed it.  Any
# fixup routine that relies on GR7 has to allow for that.
setlos #8,gr7
|
|
subi.p gr4,#8,gr4 ; store with update index does weird stuff
|
|
setlos #64,gr6
|
|
|
|
# ICC0 / CC7 = GR5 >= 64.  While that holds, one pass writes 64 bytes, drops
# GR5 by 64 and recomputes ICC0 for the next pass.
subicc gr5,#64,gr0,icc0
|
|
0: cknc icc0,cc7
|
|
cstdu gr12,@(gr4,gr7) ,cc7,#1
|
|
cstdu gr12,@(gr4,gr7) ,cc7,#1
|
|
cstdu gr12,@(gr4,gr7) ,cc7,#1
|
|
cstdu gr12,@(gr4,gr7) ,cc7,#1
|
|
cstdu gr12,@(gr4,gr7) ,cc7,#1
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
subicc gr5,#64,gr0,icc0
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
beqlr icc3,#0
|
|
# Loop back while the recomputed ICC0 says at least 64 bytes remain.
bnc icc0,#2,0b
|
|
|
|
# now do 32-byte remnant
|
|
# CC7 = GR5 >= 32: four 8-byte stores.  On the way GR6 is reloaded with 16
# and ICC0 is recomputed from GR5 - 16, ready for the next section.
subicc.p gr5,#32,gr0,icc0
|
|
setlos #32,gr6
|
|
cknc icc0,cc7
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
setlos #16,gr6
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
subicc gr5,#16,gr0,icc0
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
beqlr icc3,#0
|
|
|
|
# now do 16-byte remnant
|
|
# ICC0 here comes from the GR5 - 16 comparison made in the previous section;
# CC7 = GR5 >= 16: two 8-byte stores, GR5 drops by GR6 = 16.
cknc icc0,cc7
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
csubcc gr5,gr6,gr5 ,cc7,#1 ; also set ICC3
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
beqlr icc3,#0
|
|
|
|
# now do 8-byte remnant
|
|
# CC7 = GR5 >= 8: one 8-byte store.  GR7 (still 8) doubles as the amount
# subtracted from GR5, and is then reloaded with 4 for the word store below.
subicc gr5,#8,gr0,icc1
|
|
cknc icc1,cc7
|
|
cstdu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
csubcc gr5,gr7,gr5 ,cc7,#1 ; also set ICC3
|
|
setlos.p #4,gr7
|
|
beqlr icc3,#0
|
|
|
|
# now do 4-byte remnant
|
|
# GR4 += 4 so that GR4 + GR7 (now 4) is again the next unwritten address.
# CC7 = GR5 >= 4: one 4-byte update store.  ICC1 is then set from GR5 - 2
# for the halfword step.
subicc gr5,#4,gr0,icc0
|
|
addi.p gr4,#4,gr4
|
|
cknc icc0,cc7
|
|
cstu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
csubcc gr5,gr7,gr5 ,cc7,#1 ; also set ICC3
|
|
subicc.p gr5,#2,gr0,icc1
|
|
beqlr icc3,#0
|
|
|
|
# now do 2-byte remnant
|
|
# GR7 = 2 and GR4 += 2 keep GR4 + GR7 equal to the next unwritten address
# whether or not the word store above took effect.  CC7 = GR5 >= 2: one
# 2-byte update store.  ICC0 is then set from GR5 - 1 for the byte step.
setlos #2,gr7
|
|
addi.p gr4,#2,gr4
|
|
cknc icc1,cc7
|
|
csthu.p gr12,@(gr4,gr7) ,cc7,#1
|
|
csubcc gr5,gr7,gr5 ,cc7,#1 ; also set ICC3
|
|
subicc.p gr5,#1,gr0,icc0
|
|
beqlr icc3,#0
|
|
|
|
# now do 1-byte remnant
|
|
# GR7 = 0 and GR4 += 2 make GR4 itself the next unwritten address.  The last
# byte is stored when GR5 >= 1, packed with the unconditional return.
setlos #0,gr7
|
|
addi.p gr4,#2,gr4
|
|
cknc icc0,cc7
|
|
cstb.p gr12,@(gr4,gr0) ,cc7,#1
|
|
bralr
|
|
# Exported end marker - presumably so that other code can test whether an
# address lies inside memset (see the NOTE in the banner); confirm at the user.
__memset_end:
|
|
|
|
.size memset, __memset_end-memset
|
|
|
|
###############################################################################
#
# clear memory in userspace
# - return the number of bytes that could not be cleared (0 on complete success)
#
# long __memset_user(void *p, size_t count)
#
# In:    GR8 = p, GR9 = count
# Out:   GR8 = 0 on success, otherwise the number of bytes left uncleared
# Uses:  GR7, GR9, GR10 and GR11 here, plus whatever memset uses
# Stack: none
#
# LR is copied to GR11 because "call" overwrites it; both exits below return
# through GR11.
#
###############################################################################
	.globl		__memset_user, __memset_user_error_lr, __memset_user_error_handler
	.type		__memset_user,@function
__memset_user:
	movsg		lr,gr11				; GR11 = our own return address

	# abuse memset to do the dirty work: memset(p, 0, count)
	or.p		gr9,gr9,gr10			; GR10 = count
	setlos		#0,gr9				; GR9  = fill byte 0

	# memset's leading alignment stores index with GR0 and run before memset
	# first loads GR7, yet the fixup below always adds GR7 to GR4.  Zero GR7
	# here so a fault in one of those stores still yields the right residue
	# instead of one skewed by whatever the caller left in GR7.
	setlos		#0,gr7

	call		memset
__memset_user_error_lr:
	# normal completion: everything was cleared, so return 0
	jmpl.p		@(gr11,gr0)
	setlos		#0,gr8

	# deal with any exception generated by memset
	# - presumably entered from the fault-handling code (not in this file)
	#   when a store inside memset faults while LR == __memset_user_error_lr
	#	GR4  - memset's address tracking pointer
	#	GR7  - memset's step value (index register for store insns)
	#	GR8  - memset's original start address
	#	GR10 - memset's original count
	# - the failing address is GR4 + GR7, so the amount left uncleared is
	#   (GR8 + GR10) - (GR4 + GR7)
__memset_user_error_handler:
	add.p		gr4,gr7,gr4			; GR4 = address that faulted
	add		gr8,gr10,gr8			; GR8 = one past the end of the area
	jmpl.p		@(gr11,gr0)
	sub		gr8,gr4,gr8			; we return the amount left uncleared

	.size		__memset_user, .-__memset_user
|