[AArch64] Improve integer memcpy

Further optimize integer memcpy.  Small cases now include copies up
to 32 bytes.  64-128 byte copies are split into two cases to improve
performance of 64-96 byte copies.  Comments have been rewritten.

(cherry picked from commit 700065132744e0dfa6d4d9142d63f6e3a1934726)
This commit is contained in:
Wilco Dijkstra 2020-03-11 17:15:25 +00:00
parent c969e84e0c
commit be3eaffd5a
1 changed file with 112 additions and 107 deletions

View File

@ -33,11 +33,11 @@
#define A_l x6
#define A_lw w6
#define A_h x7
#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
@ -51,16 +51,6 @@
#define H_h srcend
#define tmp1 x14
/* Copies are split into 3 main cases: small copies of up to 32 bytes,
medium copies of 33..128 bytes which are fully unrolled. Large copies
of more than 128 bytes align the destination and use an unrolled loop
processing 64 bytes per iteration.
In order to share code with memmove, small and medium copies read all
data before writing, allowing any kind of overlap. So small, medium
and large backwards memmoves are handled by falling through into memcpy.
Overlapping large forward memmoves use a loop that copies backwards.
*/
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
@ -68,35 +58,76 @@
# define MEMCPY memcpy
#endif
ENTRY_ALIGN (MEMMOVE, 6)
/* This implementation supports both memcpy and memmove and shares most code.
It uses unaligned accesses and branchless sequences to keep the code small
and simple, and to improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check in memmove is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per
iteration. The destination pointer is 16-byte aligned to minimize
unaligned accesses. The loop tail is handled by always copying 64 bytes
from the end.
*/
ENTRY_ALIGN (MEMCPY, 6)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
sub tmp1, dstin, src
cmp count, 128
ccmp tmp1, count, 2, hi
b.lo L(move_long)
/* Common case falls through into memcpy. */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
cmp count, 32
b.ls L(copy32)
cmp count, 128
b.hi L(copy_long)
cmp count, 32
b.hi L(copy32_128)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
/* Copy 8-15 bytes. Reached only when count < 16. If bit 3 of count is
clear, count is below 8 and L(copy8) handles the copy. Otherwise copy
the first 8 bytes and the last 8 bytes; the two 8-byte accesses overlap
each other for every count below 16. Both loads are issued before either
store, so all bytes written come from the original source contents. */
L(copy16):
tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. Reached only when count < 8. If bit 2 of count is
clear, count is below 4 and L(copy4) handles the copy. Otherwise copy
the first 4 bytes and the last 4 bytes using 32-bit accesses, loading
both words before storing either of them. */
L(copy8):
tbz count, 2, L(copy4)
ldr A_lw, [src]
ldr B_lw, [srcend, -4]
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. A zero count returns
immediately through L(copy0). Otherwise load the first byte, the last
byte and the byte at offset count / 2, then store all three:
count == 1: the same byte is copied three times;
count == 2: the first byte once and the second byte twice;
count == 3: each byte is copied exactly once.
All three loads are issued before the first store. */
L(copy4):
cbz count, L(copy0)
/* tmp1 = count / 2: offset 0 for count == 1, offset 1 for count 2 or 3. */
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1]
/* Bare return; L(move_long) also branches here when dstin equals src. */
L(copy0):
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
@ -110,76 +141,32 @@ ENTRY (MEMCPY)
ret
.p2align 4
/* Small copies: 0..32 bytes. */
L(copy32):
/* 16-32 bytes. */
cmp count, 16
b.lo 1f
ldp A_l, A_h, [src]
ldp B_l, B_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstend, -16]
ret
.p2align 4
1:
/* 8-15 bytes. */
tbz count, 3, 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 4
1:
/* 4-7 bytes. */
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb A_hw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
2: ret
.p2align 4
/* Copy 65..128 bytes. Copy 64 bytes from the start and
64 bytes from the end. */
/* Copy 65..128 bytes. */
L(copy128):
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
cmp count, 96
b.ls L(copy96)
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
stp G_l, G_h, [dstend, -64]
stp H_l, H_h, [dstend, -48]
L(copy96):
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
stp G_l, G_h, [dstend, -64]
stp H_l, H_h, [dstend, -48]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
/* Align DST to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 128 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */
.p2align 4
/* Copy more than 128 bytes. */
L(copy_long):
/* Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [src]
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
@ -188,7 +175,8 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(last64)
b.ls L(copy64_from_end)
L(loop64):
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
@ -201,10 +189,8 @@ L(loop64):
subs count, count, 64
b.hi L(loop64)
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the end even if
there is just 1 byte left. */
L(last64):
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
@ -219,20 +205,42 @@ L(last64):
stp C_l, C_h, [dstend, -16]
ret
.p2align 4
L(move_long):
cbz tmp1, 3f
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
ENTRY_ALIGN (MEMMOVE, 4)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
b.hi L(move_long)
cmp count, 32
b.hi L(copy32_128)
/* Align dstend to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 128 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */
and tmp1, dstend, 15
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
L(move_long):
/* Only use backward copy if there is an overlap. */
sub tmp1, dstin, src
cbz tmp1, L(copy0)
cmp tmp1, count
b.hs L(copy_long)
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [srcend, -16]
and tmp1, dstend, 15
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
@ -242,10 +250,9 @@ L(move_long):
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
b.ls 2f
b.ls L(copy64_from_start)
nop
1:
L(loop64_backwards):
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
@ -255,12 +262,10 @@ L(move_long):
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi 1b
b.hi L(loop64_backwards)
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the start even if
there is just 1 byte left. */
2:
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
@ -273,7 +278,7 @@ L(move_long):
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
3: ret
ret
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)