This is an optimized memcpy/memmove for AArch64.  Copies are split into 3 main
cases: small copies of up to 16 bytes, medium copies of 17..96 bytes which are
fully unrolled.  Large copies of more than 96 bytes align the destination and
use an unrolled loop processing 64 bytes per iteration.  In order to share code
with memmove, small and medium copies read all data before writing, allowing
any kind of overlap.  All memmoves except for the large backwards case fall
into memcpy for optimal performance.  On a random copy test memcpy/memmove are
40% faster on Cortex-A57 and 28% on Cortex-A53.

	* sysdeps/aarch64/memcpy.S (memcpy):
	Rewrite of optimized memcpy and memmove.
	* sysdeps/aarch64/memmove.S (memmove): Remove
	memmove code (merged into memcpy.S).
This commit is contained in:
Wilco Dijkstra 2016-06-20 17:38:13 +01:00
parent aca1daef29
commit b998e16e71
3 changed files with 215 additions and 451 deletions

View File

@ -1,3 +1,10 @@
2016-06-20 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Rewrite of optimized memcpy and memmove.
* sysdeps/aarch64/memmove.S (memmove): Remove
memmove code (merged into memcpy.S).
2016-06-20 Florian Weimer <fweimer@redhat.com>
Consolidate machine-agnostic DTV definitions in <dl-dtv.h>.

View File

@ -16,161 +16,229 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Assumptions:
*
* ARMv8-a, AArch64
* Unaligned accesses
* ARMv8-a, AArch64, unaligned accesses.
*
*/
#define dstin x0
#define src x1
#define count x2
#define tmp1 x3
#define tmp1w w3
#define tmp2 x4
#define tmp2w w4
#define tmp3 x5
#define tmp3w w5
#define dst x6
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define A_hw w7
#define B_l x8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l src
#define E_h count
#define F_l srcend
#define F_h dst
#define G_l count
#define G_h dst
#define tmp1 x14
#define A_l x7
#define A_h x8
#define B_l x9
#define B_h x10
#define C_l x11
#define C_h x12
#define D_l x13
#define D_h x14
/* Copies are split into 3 main cases: small copies of up to 15 bytes,
medium copies of 16..96 bytes which are fully unrolled. Large copies
of more than 96 bytes align the destination and use an unrolled loop
processing 64 bytes per iteration.
In order to share code with memmove, small and medium copies read all
data before writing, allowing any kind of overlap. So small, medium
and large backwards memmoves are handled by falling through into memcpy.
Overlapping large forward memmoves use a loop that copies backwards.
*/
#include <sysdep.h>
ENTRY_ALIGN (memmove, 6)
ENTRY_ALIGN (memcpy, 6)
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
b.lo L(move_long)
mov dst, dstin
cmp count, #64
b.ge L(cpy_not_short)
cmp count, #15
b.le L(tail15tiny)
/* Common case falls through into memcpy. */
END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
/* Deal with small copies quickly by dropping straight into the
* exit block. */
L(tail63):
/* Copy up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
ands tmp1, count, #0x30
b.eq L(tail15)
add dst, dst, tmp1
add src, src, tmp1
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
ldp A_l, A_h, [src, #-48]
stp A_l, A_h, [dst, #-48]
add srcend, src, count
add dstend, dstin, count
cmp count, 96
b.hi L(copy_long)
cmp count, 16
b.hs L(copy_medium)
/* Small copies: 0..15 bytes. */
L(copy16):
tbz count, 3, 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
1:
ldp A_l, A_h, [src, #-32]
stp A_l, A_h, [dst, #-32]
2:
ldp A_l, A_h, [src, #-16]
stp A_l, A_h, [dst, #-16]
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
.p2align 4
1:
cbz count, 2f
ldrb A_lw, [src]
tbz count, 1, 1f
ldrh A_hw, [srcend, -2]
strh A_hw, [dstend, -2]
1: strb A_lw, [dstin]
2: ret
L(tail15):
ands count, count, #15
beq 1f
add src, src, count
ldp A_l, A_h, [src, #-16]
add dst, dst, count
stp A_l, A_h, [dst, #-16]
1:
RET
L(tail15tiny):
/* Copy up to 15 bytes of data. Does not assume additional data
being copied. */
tbz count, #3, 1f
ldr tmp1, [src], #8
str tmp1, [dst], #8
1:
tbz count, #2, 1f
ldr tmp1w, [src], #4
str tmp1w, [dst], #4
1:
tbz count, #1, 1f
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
1:
tbz count, #0, 1f
ldrb tmp1w, [src]
strb tmp1w, [dst]
1:
RET
L(cpy_not_short):
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
neg tmp2, src
ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
b.eq 2f
sub count, count, tmp2
/* Copy more data than needed; it's faster than jumping
* around copying sub-Quadword quantities. We know that
* it can't overrun. */
.p2align 4
/* Medium copies: 16..96 bytes. */
L(copy_medium):
ldp A_l, A_h, [src]
add src, src, tmp2
stp A_l, A_h, [dst]
add dst, dst, tmp2
/* There may be less than 63 bytes to go now. */
cmp count, #63
b.le L(tail63)
2:
subs count, count, #128
b.ge L(cpy_body_large)
/* Less than 128 bytes to copy, so handle 64 here and then jump
* to the tail. */
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst]
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
tst count, #0x3f
add src, src, #64
add dst, dst, #64
b.ne L(tail63)
RET
/* Critical loop. Start at a new cache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.p2align 6
L(cpy_body_large):
/* There are at least 128 bytes to copy. */
ldp A_l, A_h, [src, #0]
sub dst, dst, #16 /* Pre-bias. */
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
tbnz count, 6, L(copy96)
ldp D_l, D_h, [srcend, -16]
tbz count, 5, 1f
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
1:
stp A_l, A_h, [dst, #16]
ldp A_l, A_h, [src, #16]
stp B_l, B_h, [dst, #32]
ldp B_l, B_h, [src, #32]
stp C_l, C_h, [dst, #48]
ldp C_l, C_h, [src, #48]
stp D_l, D_h, [dst, #64]!
ldp D_l, D_h, [src, #64]!
subs count, count, #64
b.ge 1b
stp A_l, A_h, [dst, #16]
stp B_l, B_h, [dst, #32]
stp C_l, C_h, [dst, #48]
stp D_l, D_h, [dst, #64]
add src, src, #16
add dst, dst, #64 + 16
tst count, #0x3f
b.ne L(tail63)
RET
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
/* Entered from L(copy_medium) when bit 6 of count is set; A_l/A_h were
   loaded with the first 16 source bytes there.  Every load below is
   issued before the first store.  The two stored ranges overlap whenever
   count is below 96.  */
L(copy96):
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [src, 32]
ldp D_l, D_h, [src, 48]
/* E_l/E_h are aliases of src/count and F_l/F_h are aliases of srcend/dst
   (see the #defines), so these two loads overwrite src, count, srcend and
   dst.  None of those four is read again before the ret.  */
ldp E_l, E_h, [srcend, -32]
ldp F_l, F_h, [srcend, -16]
/* First 64 bytes, addressed from the start of the destination.  */
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin, 32]
stp D_l, D_h, [dstin, 48]
/* Last 32 bytes, addressed from the end of the destination.  */
stp E_l, E_h, [dstend, -32]
stp F_l, F_h, [dstend, -16]
ret
/* Align DST to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 96 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */
.p2align 4
L(copy_long):
/* tmp1 = low four bits of dstin; dst = dstin with those bits cleared.  */
and tmp1, dstin, 15
bic dst, dstin, 15
/* Load the first 16 source bytes before src is adjusted.  */
ldp D_l, D_h, [src]
/* Move src back by the same amount dst was rounded down, so that equal
   offsets from src and dst keep referring to corresponding bytes.  */
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
/* Store the first 16 bytes at the original, possibly unaligned, dstin.  */
stp D_l, D_h, [dstin]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
/* Pre-index with writeback: src advances by 64.  */
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
/* Skip the loop when the subtraction borrowed or left zero.  */
b.ls 2f
/* Each iteration stores the 64 bytes currently held in A..D and reloads
   A..D with the next 64 source bytes.  dst and src each advance by 64
   through the writeback on the D pair.  */
1:
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [src, 32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [src, 48]
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
/* Continue while the subtraction did not borrow and count is non-zero.  */
b.hi 1b
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the end even if
there is just 1 byte left. */
2:
/* E_l/E_h are aliases of src/count (see the #defines), so this load
   overwrites both; neither is read again below.  The loads fetch the
   final 64 source bytes relative to srcend and are interleaved with the
   stores of the 64 bytes still pending in A..D.  */
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
/* Final 64 bytes, addressed from the end of the destination.  */
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret
.p2align 4
/* Backwards copy, branched to from the memmove entry.  It mirrors
   L(copy_long) but walks from the end of the buffers towards the start.  */
L(move_long):
/* tmp1 still holds dstin - src from the memmove entry; zero means source
   and destination are the same address, so there is nothing to move.  */
cbz tmp1, 3f
add srcend, src, count
add dstend, dstin, count
/* Align dstend to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 96 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */
/* tmp1 = low four bits of dstend.  */
and tmp1, dstend, 15
/* Load the last 16 source bytes before srcend is adjusted.  */
ldp D_l, D_h, [srcend, -16]
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
/* Store the last 16 bytes at the original, possibly unaligned, dstend.  */
stp D_l, D_h, [dstend, -16]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
/* Pre-index with writeback: srcend moves down by 64.  */
ldp D_l, D_h, [srcend, -64]!
/* dstend now has its low four bits clear.  */
sub dstend, dstend, tmp1
subs count, count, 128
/* Skip the loop when the subtraction borrowed or left zero.  */
b.ls 2f
/* NOTE(review): presumably padding so that the loop label below lands on
   a preferred boundary -- confirm.  */
nop
/* Each iteration stores the 64 bytes currently held in A..D and reloads
   A..D with the 64 source bytes below them.  dstend and srcend each move
   down by 64 through the writeback on the D pair.  */
1:
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [srcend, -48]
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi 1b
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the start even if
there is just 1 byte left. */
2:
/* G_l/G_h are aliases of count/dst (see the #defines), so this load
   overwrites both; neither is read again below.  The loads fetch the
   first 64 source bytes relative to src and are interleaved with the
   stores of the 64 bytes still pending in A..D.  */
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [src, 16]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
/* First 64 bytes, addressed from the start of the destination.  */
stp G_l, G_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
3: ret
END (memcpy)
libc_hidden_builtin_def (memcpy)

View File

@ -1,312 +1 @@
/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Assumptions:
*
* ARMv8-a, AArch64
* Unaligned accesses
*/
/* Parameters and result. */
#define dstin x0
#define src x1
#define count x2
#define tmp1 x3
#define tmp1w w3
#define tmp2 x4
#define tmp2w w4
#define tmp3 x5
#define tmp3w w5
#define dst x6
#define A_l x7
#define A_h x8
#define B_l x9
#define B_h x10
#define C_l x11
#define C_h x12
#define D_l x13
#define D_h x14
ENTRY_ALIGN (memmove, 6)
/* Choose the copy direction from the relative position of dstin and src.
   dstin is left untouched on every path; the working destination pointer
   is dst.  */
cmp dstin, src
/* dstin below src (unsigned): handled by the downwards path.  */
b.lo L(downwards)
/* tmp1 = src + count, the address one past the end of the source.  */
add tmp1, src, count
cmp dstin, tmp1
b.hs memcpy /* No overlap. */
/* Upwards move with potential overlap.
* Need to move from the tail backwards. SRC and DST point one
* byte beyond the remaining data to move. */
add dst, dstin, count
add src, src, count
cmp count, #64
/* Signed comparison: taken when count >= 64.  Otherwise execution falls
   through into L(tail63up).  */
b.ge L(mov_not_short_up)
/* Deal with small moves quickly by dropping straight into the
* exit block. */
L(tail63up):
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
/* tmp1 = count & 0x30, i.e. 0, 16, 32 or 48 bytes of whole 16-byte units.  */
ands tmp1, count, #0x30
b.eq L(tail15up)
/* Step both pointers down past those units, then copy them with
   non-negative offsets, highest unit first.  */
sub dst, dst, tmp1
sub src, src, tmp1
cmp tmp1w, #0x20
/* 32 bytes: two pairs.  */
b.eq 1f
/* 16 bytes: one pair.  */
b.lt 2f
/* 48 bytes: fall through and copy all three pairs.  */
ldp A_l, A_h, [src, #32]
stp A_l, A_h, [dst, #32]
1:
ldp A_l, A_h, [src, #16]
stp A_l, A_h, [dst, #16]
2:
ldp A_l, A_h, [src]
stp A_l, A_h, [dst]
L(tail15up):
/* Move up to 15 bytes of data. Does not assume additional data
* being moved. */
/* Bits 3, 2, 1 and 0 of count select an 8-, 4-, 2- and 1-byte move in
   turn.  The 8-, 4- and 2-byte accesses pre-decrement src and dst with
   writeback; the final byte access uses a plain -1 offset.  */
tbz count, #3, 1f
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
1:
tbz count, #2, 1f
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
1:
tbz count, #1, 1f
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
1:
tbz count, #0, 1f
ldrb tmp1w, [src, #-1]
strb tmp1w, [dst, #-1]
1:
RET
L(mov_not_short_up):
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
ands tmp2, src, #15 /* Bytes to reach alignment. */
/* src already 16-byte aligned: skip the alignment moves.  */
b.eq 2f
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
/* The four bit tests below move exactly tmp2 bytes (8, 4, 2, 1), each
   access pre-decrementing src and dst with writeback.  */
tbz tmp2, #3, 1f
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
1:
tbz tmp2, #2, 1f
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
1:
tbz tmp2, #1, 1f
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
1:
tbz tmp2, #0, 1f
ldrb tmp1w, [src, #-1]!
strb tmp1w, [dst, #-1]!
1:
/* There may be less than 63 bytes to go now. */
cmp count, #63
/* Signed comparison: taken when count <= 63.  */
b.le L(tail63up)
2:
/* count is biased by -128 from here on; the signed b.ge is taken when at
   least 128 bytes remained.  */
subs count, count, #128
b.ge L(mov_body_large_up)
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
/* All four pairs are loaded before any is stored.  The A pair moves src
   and dst down by 64 with writeback; B..D use offsets from the updated
   pointers.  */
ldp A_l, A_h, [src, #-64]!
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst, #-64]!
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
/* Subtracting 128 left the low six bits of count unchanged, so they still
   give the number of bytes left for the tail.  */
tst count, #0x3f
b.ne L(tail63up)
RET
/* Critical loop. Start at a new Icache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.p2align 6
L(mov_body_large_up):
/* There are at least 128 bytes to move. */
/* Preload the top 64 bytes into A..D; the D load moves src down by 64
   with writeback.  */
ldp A_l, A_h, [src, #-16]
ldp B_l, B_h, [src, #-32]
ldp C_l, C_h, [src, #-48]
ldp D_l, D_h, [src, #-64]!
/* Each iteration stores the 64 bytes held in A..D and reloads A..D with
   the 64 bytes below them.  dst and src each move down by 64 through the
   writeback on the D pair.  */
1:
stp A_l, A_h, [dst, #-16]
ldp A_l, A_h, [src, #-16]
stp B_l, B_h, [dst, #-32]
ldp B_l, B_h, [src, #-32]
stp C_l, C_h, [dst, #-48]
ldp C_l, C_h, [src, #-48]
stp D_l, D_h, [dst, #-64]!
ldp D_l, D_h, [src, #-64]!
subs count, count, #64
/* Signed comparison: loop while the biased count is still >= 0.  */
b.ge 1b
/* Store the 64 bytes still pending in A..D; the D store moves dst down by
   64 with writeback so it matches src for the tail.  */
stp A_l, A_h, [dst, #-16]
stp B_l, B_h, [dst, #-32]
stp C_l, C_h, [dst, #-48]
stp D_l, D_h, [dst, #-64]!
/* Only multiples of 64 were subtracted from count in this loop, so its low
   six bits are unchanged and select the remaining tail bytes.  */
tst count, #0x3f
b.ne L(tail63up)
RET
L(downwards):
/* For a downwards move we can safely use memcpy provided that
* DST is more than 16 bytes away from SRC. */
/* tmp1 = src - 16; the unsigned b.ls is taken when dstin <= src - 16.  */
sub tmp1, src, #16
cmp dstin, tmp1
b.ls memcpy /* May overlap, but not critically. */
mov dst, dstin /* Preserve DSTIN for return value. */
cmp count, #64
/* Signed comparison: taken when count >= 64.  Otherwise execution falls
   through into L(tail63down).  */
b.ge L(mov_not_short_down)
/* Deal with small moves quickly by dropping straight into the
* exit block. */
L(tail63down):
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
/* tmp1 = count & 0x30, i.e. 0, 16, 32 or 48 bytes of whole 16-byte units.  */
ands tmp1, count, #0x30
b.eq L(tail15down)
/* Step both pointers up past those units, then copy them with negative
   offsets, lowest unit first.  */
add dst, dst, tmp1
add src, src, tmp1
cmp tmp1w, #0x20
/* 32 bytes: two pairs.  */
b.eq 1f
/* 16 bytes: one pair.  */
b.lt 2f
/* 48 bytes: fall through and copy all three pairs.  */
ldp A_l, A_h, [src, #-48]
stp A_l, A_h, [dst, #-48]
1:
ldp A_l, A_h, [src, #-32]
stp A_l, A_h, [dst, #-32]
2:
ldp A_l, A_h, [src, #-16]
stp A_l, A_h, [dst, #-16]
L(tail15down):
/* Move up to 15 bytes of data. Does not assume additional data
being moved. */
/* Bits 3, 2, 1 and 0 of count select an 8-, 4-, 2- and 1-byte move in
   turn.  The 8-, 4- and 2-byte accesses post-increment src and dst; the
   final byte access leaves the pointers unchanged.  */
tbz count, #3, 1f
ldr tmp1, [src], #8
str tmp1, [dst], #8
1:
tbz count, #2, 1f
ldr tmp1w, [src], #4
str tmp1w, [dst], #4
1:
tbz count, #1, 1f
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
1:
tbz count, #0, 1f
ldrb tmp1w, [src]
strb tmp1w, [dst]
1:
RET
L(mov_not_short_down):
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
/* tmp2 = (-src) & 15: the distance from src up to the next 16-byte
   boundary.  */
neg tmp2, src
ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
/* src already 16-byte aligned: skip the alignment moves.  */
b.eq 2f
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
/* The four bit tests below move exactly tmp2 bytes (8, 4, 2, 1), each
   access post-incrementing src and dst.  */
tbz tmp2, #3, 1f
ldr tmp1, [src], #8
str tmp1, [dst], #8
1:
tbz tmp2, #2, 1f
ldr tmp1w, [src], #4
str tmp1w, [dst], #4
1:
tbz tmp2, #1, 1f
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
1:
tbz tmp2, #0, 1f
ldrb tmp1w, [src], #1
strb tmp1w, [dst], #1
1:
/* There may be less than 63 bytes to go now. */
cmp count, #63
/* Signed comparison: taken when count <= 63.  */
b.le L(tail63down)
2:
/* count is biased by -128 from here on; the signed b.ge is taken when at
   least 128 bytes remained.  */
subs count, count, #128
b.ge L(mov_body_large_down)
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
/* All four pairs are loaded before any is stored.  */
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst]
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
/* Subtracting 128 left the low six bits of count unchanged, so they still
   give the number of bytes left for the tail.  The two adds do not touch
   the flags that tst set for the b.ne.  */
tst count, #0x3f
add src, src, #64
add dst, dst, #64
b.ne L(tail63down)
RET
/* Critical loop. Start at a new cache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.p2align 6
L(mov_body_large_down):
/* There are at least 128 bytes to move. */
/* Preload the first 64 bytes into A..D.  The D load advances src by 48
   with writeback and dst is moved back by 16, so inside the loop offset
   #16 from either pointer addresses the next 16-byte unit.  */
ldp A_l, A_h, [src, #0]
sub dst, dst, #16 /* Pre-bias. */
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
/* Each iteration stores the 64 bytes held in A..D and reloads A..D with
   the next 64 bytes.  dst and src each advance by 64 through the
   writeback on the D pair.  */
1:
stp A_l, A_h, [dst, #16]
ldp A_l, A_h, [src, #16]
stp B_l, B_h, [dst, #32]
ldp B_l, B_h, [src, #32]
stp C_l, C_h, [dst, #48]
ldp C_l, C_h, [src, #48]
stp D_l, D_h, [dst, #64]!
ldp D_l, D_h, [src, #64]!
subs count, count, #64
/* Signed comparison: loop while the biased count is still >= 0.  */
b.ge 1b
/* Store the 64 bytes still pending in A..D.  */
stp A_l, A_h, [dst, #16]
stp B_l, B_h, [dst, #32]
stp C_l, C_h, [dst, #48]
stp D_l, D_h, [dst, #64]
/* Remove the 16-byte pre-bias from src, and advance dst past the 64 bytes
   just stored plus its own 16-byte bias, ready for the tail.  */
add src, src, #16
add dst, dst, #64 + 16
/* Only multiples of 64 were subtracted from count in this loop, so its low
   six bits are unchanged and select the remaining tail bytes.  */
tst count, #0x3f
b.ne L(tail63down)
RET
END (memmove)
libc_hidden_builtin_def (memmove)
/* memmove is part of memcpy.S. */