aarch64: Optimized memchr specific to AmpereComputing emag
This version uses general-purpose-register-based memory instructions to load data, because the vector-register-based ones are slightly slower on emag. Character matching is performed in parallel on one 16-byte (both size and alignment) memory block per iteration. * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR. [!MEMCHR](MEMCHR): Set to __memchr. * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memchr_generic and memchr_nosimd. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add memchr ifuncs. * sysdeps/aarch64/multiarch/memchr.c: New file. * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise. * sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise.
This commit is contained in:
parent
c7d3890ff5
commit
83d1cc42d8
12
ChangeLog
12
ChangeLog
|
@ -1,3 +1,15 @@
|
|||
2019-02-01 Feng Xue <fxue@os.amperecomputing.com>
|
||||
|
||||
* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
|
||||
[!MEMCHR](MEMCHR): Set to __memchr.
|
||||
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
|
||||
Add memchr_generic and memchr_nosimd.
|
||||
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||
(__libc_ifunc_impl_list): Add memchr ifuncs.
|
||||
* sysdeps/aarch64/multiarch/memchr.c: New file.
|
||||
* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
|
||||
* sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise.
|
||||
|
||||
2019-02-01 Feng Xue <fxue@os.amperecomputing.com>
|
||||
|
||||
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
|
||||
|
|
|
@ -26,6 +26,10 @@
|
|||
* Neon Available.
|
||||
*/
|
||||
|
||||
#ifndef MEMCHR
|
||||
# define MEMCHR __memchr
|
||||
#endif
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
|
@ -59,7 +63,7 @@
|
|||
* identify exactly which byte has matched.
|
||||
*/
|
||||
|
||||
ENTRY (__memchr)
|
||||
ENTRY (MEMCHR)
|
||||
/* Do not dereference srcin if no bytes to compare. */
|
||||
cbz cntin, L(zero_length)
|
||||
/*
|
||||
|
@ -152,6 +156,6 @@ L(tail):
|
|||
L(zero_length):
|
||||
mov result, #0
|
||||
ret
|
||||
END (__memchr)
|
||||
weak_alias (__memchr, memchr)
|
||||
END (MEMCHR)
|
||||
weak_alias (MEMCHR, memchr)
|
||||
libc_hidden_builtin_def (memchr)
|
||||
|
|
|
@ -2,5 +2,6 @@ ifeq ($(subdir),string)
|
|||
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
|
||||
memcpy_falkor memmove_falkor \
|
||||
memset_generic memset_falkor memset_emag \
|
||||
memchr_generic memchr_nosimd \
|
||||
strlen_generic strlen_asimd
|
||||
endif
|
||||
|
|
|
@ -53,6 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
|
||||
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
|
||||
IFUNC_IMPL (i, name, memchr,
|
||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
|
||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
|
||||
|
||||
IFUNC_IMPL (i, name, strlen,
|
||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
/* Multiple versions of memchr. AARCH64 version.
|
||||
Copyright (C) 2018 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Define multiple versions only for the definition in libc. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
/* Redefine memchr so that the compiler won't complain about the type
|
||||
mismatch with the IFUNC selector in strong_alias, below. */
|
||||
# undef memchr
|
||||
# define memchr __redirect_memchr
|
||||
# include <string.h>
|
||||
# include <init-arch.h>
|
||||
|
||||
/* With the macro above in effect, the memchr prototype from <string.h>
   is declared under the name __redirect_memchr; it supplies the type
   for every symbol declared below.  */
extern __typeof (__redirect_memchr) __memchr;
|
||||
|
||||
/* The two candidate implementations; both are declared hidden.  */
extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden;
|
||||
extern __typeof (__redirect_memchr) __memchr_nosimd attribute_hidden;
|
||||
|
||||
/* Selector for __memchr: __memchr_nosimd when IS_EMAG (midr) holds,
   __memchr_generic otherwise.
   NOTE(review): midr is not declared in this file; presumably it is
   provided by <init-arch.h> / the libc_ifunc machinery -- confirm.  */
libc_ifunc (__memchr,
|
||||
((IS_EMAG (midr)
|
||||
? __memchr_nosimd
|
||||
: __memchr_generic)));
|
||||
|
||||
/* Drop the redirect macro so that the plain name memchr can be used as
   the alias target name.  */
# undef memchr
|
||||
strong_alias (__memchr, memchr);
|
||||
#endif
|
|
@ -0,0 +1,33 @@
|
|||
/* Memchr for aarch64, default version for internal use.
|
||||
Copyright (C) 2018 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
/* Name under which the included memchr.S is assembled: that file only
   supplies its own default when MEMCHR is not already defined.  */
# define MEMCHR __memchr_generic
|
||||
|
||||
/* Do not hide the generic version of memchr, we use it internally. */
|
||||
# undef libc_hidden_builtin_def
|
||||
/* Empty expansion: any libc_hidden_builtin_def (...) use in the file
   included below therefore produces nothing.  */
# define libc_hidden_builtin_def(name)
|
||||
|
||||
/* Add a hidden definition for use within libc.so. */
|
||||
# ifdef SHARED
|
||||
/* Make __GI_memchr a global symbol with the same value as
   __memchr_generic.  */
.globl __GI_memchr; __GI_memchr = __memchr_generic
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* The implementation itself comes from the non-multiarch memchr.S one
   directory up.  */
# include "../memchr.S"
|
|
@ -0,0 +1,223 @@
|
|||
/* memchr - find a character in a memory zone using base integer registers
|
||||
|
||||
Copyright (C) 2018 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Use base integer registers.
|
||||
*/
|
||||
|
||||
/* Default symbol name; a wrapper may define MEMCHR before including
   this file to assemble the routine under another name.  */
#ifndef MEMCHR
|
||||
# define MEMCHR __memchr_nosimd
|
||||
#endif
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin x1
|
||||
#define cntin x2
|
||||
|
||||
/* result shares x0 with srcin.  */
#define result x0
|
||||
|
||||
/* repchr shares x1 with chrin; it holds the search byte replicated into
   all eight bytes of the register.  */
#define repchr x1
|
||||
|
||||
/* Scratch registers.  tmp1 shares x2 with cntin, so cntin must have been
   consumed before tmp1 is first written.  */
#define tmp1 x2
|
||||
#define tmp2 x3
|
||||
#define tmp3 x4
|
||||
#define tmp4 x5
|
||||
|
||||
/* src: address of the next 16-byte block to load.
   srcend: srcin + cntin.
   srcend16: srcend rounded up to a 16-byte boundary.  */
#define src x6
|
||||
#define srcend x7
|
||||
#define srcend16 x8
|
||||
|
||||
/* anymore: zero when the block being examined is the last one,
   non-zero otherwise.  */
#define anymore x9
|
||||
|
||||
/* zeroones: register copy of REP8_01.  */
#define zeroones x10
|
||||
|
||||
/* The two qwords of the current block, after XOR with repchr.  */
#define data1 x11
|
||||
#define data2 x12
|
||||
|
||||
/* Zero-byte detection result for data1 / data2; non-zero when the
   corresponding qword contained the searched char.  */
#define has_chr1 x13
|
||||
#define has_chr2 x14
|
||||
|
||||
/* Byte-replicated constants used by the zero-byte test.  */
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
|
||||
/* Entry and first-block handling.
   In: srcin (x0) = start address, chrin (x1) = char to find,
   cntin (x2) = byte count.  Out: result (x0).
   Falls into L(loop) directly for a 16-byte-aligned start; otherwise
   loads and masks the first block and jumps to L(find_chr).  */
ENTRY_ALIGN (MEMCHR, 6)
|
||||
|
||||
/* NOTE(review): DELOUSE is defined outside this file (presumably via
   <sysdep.h>); confirm what it does to x0 and x2.  */
DELOUSE (0)
|
||||
DELOUSE (2)
|
||||
|
||||
/* Do not dereference srcin if no bytes to compare. */
|
||||
cbz cntin, L(none_chr)
|
||||
|
||||
/* Start address is 16-byte aligned or not? */
|
||||
/* The Z flag set by this tst is consumed by the b.eq L(loop) further
   down; none of the instructions in between update the flags.  */
tst srcin, 15
|
||||
/* src = srcin rounded down to a 16-byte boundary.  */
bic src, srcin, 15
|
||||
|
||||
mov zeroones, REP8_01
|
||||
/* Keep only the low 8 bits of the char argument.  */
and repchr, chrin, 255
|
||||
/* Generate a qword integer as |c|c|c|c|c|c|c|c|. */
|
||||
mul repchr, repchr, zeroones
|
||||
|
||||
/* srcend = first address past the range.  This is the last use of
   cntin; its register (x2) is reused as tmp1 below.  */
add srcend, srcin, cntin
|
||||
/*
|
||||
* srcend16 is address of the block following the last block.
|
||||
*
|
||||
* [A block is 16-byte aligned and sized.]
|
||||
*/
|
||||
add srcend16, srcend, 15
|
||||
bic srcend16, srcend16, 15
|
||||
|
||||
/* Aligned start (Z from the tst above): there are no leading bytes to
   mask, so enter the main loop directly.  */
b.eq L(loop)
|
||||
|
||||
/* Load the first block containing start address. */
|
||||
/* Post-indexed load: src advances to the following block.  */
ldp data1, data2, [src], 16
|
||||
|
||||
/* tmp1 = start offset in bits.  A shift by register uses only the low
   6 bits of the amount, so tmp3 becomes a mask that is 0x00 in the byte
   positions preceding the start byte within its qword and 0xff from the
   start byte on (lsl for little-endian, lsr for big-endian, matching
   the byte order of the loaded qwords).  */
lsl tmp1, srcin, 3
|
||||
mov tmp2, ~0
|
||||
#ifdef __AARCH64EB__
|
||||
lsr tmp3, tmp2, tmp1
|
||||
#else
|
||||
lsl tmp3, tmp2, tmp1
|
||||
#endif
|
||||
/* Start address is in the first or the second qword? */
|
||||
/* Z set means the first qword.  The flags are consumed by the
   csinv/csel pair below; eor and orn leave them untouched.  */
tst srcin, 8
|
||||
|
||||
/*
|
||||
* Transform any byte in the block to zero using XOR operation,
|
||||
* if that byte equals the char to search. In this way, searching
|
||||
* the char becomes detecting zero in the resulting two qwords.
|
||||
*/
|
||||
eor data1, data1, repchr
|
||||
eor data2, data2, repchr
|
||||
|
||||
/*
|
||||
* Set the unused bytes (those before the start address) to 0xff, so
|
||||
* that they will not hit any zero detection.
|
||||
*/
|
||||
orn tmp1, data1, tmp3
|
||||
orn tmp2, data2, tmp3
|
||||
|
||||
/* Start in the first qword (eq): data1 takes its masked copy and data2
   is left as is.  Start in the second qword: data1 becomes all ones
   (~xzr) so that it cannot match, and data2 takes its masked copy.  */
csinv data1, tmp1, xzr, eq
|
||||
csel data2, data2, tmp2, eq
|
||||
|
||||
/*
|
||||
* When the first and last block are the same, there are two cases:
|
||||
* o. Memory range to search is just in one block.
|
||||
* ( start address - end address) < 0
|
||||
*
|
||||
* o. Memory range is so large that end address wrap-around.
|
||||
* ( start address - end address) > 0
|
||||
*/
|
||||
/* If srcin - srcend is negative (mi), compare src with srcend16;
   otherwise ccmp forces NZCV = 0, which reads as "ne".  anymore is 0
   only when the first block is also the last one and there is no
   wrap-around, else -1.  csetm and b do not alter the flags, so the
   ccmp in L(find_chr) sees the same condition on this path.  */
cmp srcin, srcend
|
||||
ccmp src, srcend16, 0, mi
|
||||
csetm anymore, ne
|
||||
b L(find_chr)
|
||||
|
||||
/* Main loop head: load the next aligned 16-byte block and fall through
   into L(find_chr) to test it.  */
.p2align 4
|
||||
L(loop):
|
||||
/* Post-indexed load: src advances to the block after this one.  */
ldp data1, data2, [src], 16
|
||||
|
||||
/* anymore = 0 (Z set) when the block just loaded is the last one.
   These flags stay live across the eor/sub/orr/bic instructions that
   follow and are consumed by the ccmp in L(find_chr).  */
subs anymore, src, srcend16
|
||||
|
||||
/*
|
||||
* Transform any byte in the block to zero using XOR operation,
|
||||
* if that byte equals the char to search.
|
||||
*/
|
||||
eor data1, data1, repchr
|
||||
eor data2, data2, repchr
|
||||
|
||||
/* Test the current block (data1/data2 already XORed with repchr) for a
   zero byte.  On entry the condition flags must still say whether more
   blocks remain: "ne" = more blocks, "eq" = this is the last block.  */
L(find_chr):
|
||||
/*
|
||||
* Use the following integer test to find out if any byte in a
|
||||
* qword is zero. If it does not contain a zero-valued byte, the result
|
||||
* is zero.
|
||||
*
|
||||
* (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
|
||||
* =
|
||||
* (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f)
|
||||
*
|
||||
*/
|
||||
sub tmp1, data1, zeroones
|
||||
sub tmp2, data2, zeroones
|
||||
|
||||
orr tmp3, data1, REP8_7f
|
||||
orr tmp4, data2, REP8_7f
|
||||
|
||||
bic has_chr1, tmp1, tmp3
|
||||
bic has_chr2, tmp2, tmp4
|
||||
|
||||
/* If more blocks remain (ne) compare the combined result with 0;
   otherwise ccmp forces NZCV = 0.  Hence the b.eq below loops only
   when more blocks remain and nothing matched in this block.  */
orr tmp1, has_chr1, has_chr2
|
||||
ccmp tmp1, 0, 0, ne
|
||||
|
||||
b.eq L(loop)
|
||||
|
||||
/* src already points past this block: src - 16 is the address of its
   first qword, src - 8 that of its second.  */
cbz has_chr1, 1f
|
||||
sub result, src, 16
|
||||
/* Little-endian: byte-reverse the detection result so that the clz in
   L(done) counts from the lowest-addressed byte.  Big-endian: reverse
   the XORed data instead; the detection is redone in L(done).  */
#ifdef __AARCH64EB__
|
||||
rev data1, data1
|
||||
#else
|
||||
rev has_chr1, has_chr1
|
||||
#endif
|
||||
b L(done)
|
||||
|
||||
/* No match in the first qword.  If the second has none either, the loop
   was left because this was the last block: return NULL.  */
1: cbz has_chr2, L(none_chr)
|
||||
sub result, src, 8
|
||||
/* Same reversal as above, for the second qword; L(done) works on
   data1 / has_chr1 only.  */
#ifdef __AARCH64EB__
|
||||
rev data1, data2
|
||||
#else
|
||||
rev has_chr1, has_chr2
|
||||
#endif
|
||||
|
||||
/* A qword with a match was found.  On entry result holds the address of
   that qword and has_chr1 (little-endian) or data1 (big-endian) holds
   the byte-reversed value prepared in L(find_chr).  Computes the final
   pointer, or NULL if the match lies at or beyond srcend.  */
L(done):
|
||||
#ifdef __AARCH64EB__
|
||||
/*
|
||||
* For big-endian, we cannot directly use has_chr1/has_chr2 because
|
||||
* the two qwords were byte-reversed when loaded from memory.
|
||||
* Thus, we have to perform char detection again on the qword, which
|
||||
* has been byte-swapped this time.
|
||||
*/
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp3, data1, REP8_7f
|
||||
bic has_chr1, tmp1, tmp3
|
||||
rev has_chr1, has_chr1
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If the specified char is found in a qword, the corresponding
|
||||
* byte in has_chr has its top bit (0x80) set; this is only reliable for
|
||||
* the first occurrence, not other occurrences.
|
||||
*/
|
||||
/* clz / 8 is the byte index of the first match within the qword.
   If this was the last block (anymore == 0) the match is kept only when
   result - srcend is negative (mi); otherwise ccmp forces N = 1 and the
   match is always kept.  clz and add do not alter the flags.  */
cmp anymore, 0
|
||||
clz tmp1, has_chr1
|
||||
add result, result, tmp1, lsr 3
|
||||
ccmp result, srcend, 8, eq /* else NZCV = 0b1000 (only N set) */
|
||||
csel result, result, xzr, mi
|
||||
ret
|
||||
|
||||
/* Return NULL.  Reached when cntin is zero, or when the last block has
   been examined and neither qword contained the char.  */
L(none_chr):
|
||||
mov result, 0
|
||||
ret
|
||||
|
||||
END (MEMCHR)
|
||||
libc_hidden_builtin_def (MEMCHR)
|
Loading…
Reference in New Issue