aarch64: Optimized memchr specific to AmpereComputing emag

This version uses general-register-based memory instructions to load
data, because vector-register-based loads are slightly slower on emag.

Character matching is performed in parallel on a 16-byte memory block
(16 bytes in both size and alignment) in each iteration.
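
As background for the diff below, here is a minimal, hypothetical C sketch of
the idea (not the committed code): examine one 16-byte block, i.e. two 64-bit
words, per iteration using only integer arithmetic, turning the search into
zero-byte detection. The function name memchr_swar_sketch is made up, the
sketch assumes little-endian, and it skips the aligned-head handling and
big-endian path that the real memchr_nosimd.S provides.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical sketch only.  A 64-bit word W contains a zero byte iff
     (W - 0x0101010101010101) & ~W & 0x8080808080808080
   is non-zero, and the lowest set 0x80 bit marks the first zero byte
   (little-endian).  XOR-ing the data with the repeated search character
   first turns "find c" into "find a zero byte".  */
static const void *
memchr_swar_sketch (const void *s, int c, size_t n)
{
  const uint64_t ones  = 0x0101010101010101ULL;
  const uint64_t highs = 0x8080808080808080ULL;
  uint64_t repchr = (uint64_t) (unsigned char) c * ones;  /* |c|c|c|c|c|c|c|c| */
  const unsigned char *p = s;
  size_t i = 0;

  for (; i + 16 <= n; i += 16)
    {
      uint64_t w1, w2;
      memcpy (&w1, p + i, 8);       /* the real code uses an aligned ldp */
      memcpy (&w2, p + i + 8, 8);
      w1 ^= repchr;                 /* matching bytes become zero */
      w2 ^= repchr;

      uint64_t hit1 = (w1 - ones) & ~w1 & highs;
      uint64_t hit2 = (w2 - ones) & ~w2 & highs;
      if (hit1 | hit2)
        {
          uint64_t hit = hit1 ? hit1 : hit2;
          size_t base  = hit1 ? i : i + 8;
          /* Lowest set bit sits at bit 8*k+7 of the first matching byte k.  */
          return p + base + (__builtin_ctzll (hit) >> 3);
        }
    }

  for (; i < n; i++)                /* leftover tail bytes */
    if (p[i] == (unsigned char) c)
      return p + i;
  return NULL;
}

The committed assembly extracts the byte position differently: it byte-reverses
the match mask with rev and counts leading zeros with clz, rather than using a
trailing-zero count as the sketch does.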

    * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
    [!MEMCHR](MEMCHR): Set to __memchr.
    * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
    Add memchr_generic and memchr_nosimd.
    * sysdeps/aarch64/multiarch/ifunc-impl-list.c
    (__libc_ifunc_impl_list): Add memchr ifuncs.
    * sysdeps/aarch64/multiarch/memchr.c: New file.
    * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
    * sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise.
Author: Feng Xue
Date: 2018-08-09 04:38:03 -04:00
Commit: 83d1cc42d8 (parent: c7d3890ff5)
7 changed files with 320 additions and 3 deletions

ChangeLog

@@ -1,3 +1,15 @@
2019-02-01 Feng Xue <fxue@os.amperecomputing.com>
* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
[!MEMCHR](MEMCHR): Set to __memchr.
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
Add memchr_generic and memchr_nosimd.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add memchr ifuncs.
* sysdeps/aarch64/multiarch/memchr.c: New file.
* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
* sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise.
2019-02-01 Feng Xue <fxue@os.amperecomputing.com>
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):

sysdeps/aarch64/memchr.S

@@ -26,6 +26,10 @@
* Neon Available.
*/
#ifndef MEMCHR
# define MEMCHR __memchr
#endif
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -59,7 +63,7 @@
* identify exactly which byte has matched.
*/
ENTRY (__memchr)
ENTRY (MEMCHR)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
@@ -152,6 +156,6 @@ L(tail):
L(zero_length):
mov result, #0
ret
END (__memchr)
weak_alias (__memchr, memchr)
END (MEMCHR)
weak_alias (MEMCHR, memchr)
libc_hidden_builtin_def (memchr)

sysdeps/aarch64/multiarch/Makefile

@@ -2,5 +2,6 @@ ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor \
memset_generic memset_falkor memset_emag \
memchr_generic memchr_nosimd \
strlen_generic strlen_asimd
endif

sysdeps/aarch64/multiarch/ifunc-impl-list.c

@@ -53,6 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
IFUNC_IMPL (i, name, memchr,
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)

sysdeps/aarch64/multiarch/memchr.c (new file)

@@ -0,0 +1,41 @@
/* Multiple versions of memchr. AARCH64 version.
Copyright (C) 2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Define multiple versions only for the definition in libc. */
#if IS_IN (libc)
/* Redefine memchr so that the compiler won't complain about the type
mismatch with the IFUNC selector in strong_alias, below. */
# undef memchr
# define memchr __redirect_memchr
# include <string.h>
# include <init-arch.h>
extern __typeof (__redirect_memchr) __memchr;
extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden;
extern __typeof (__redirect_memchr) __memchr_nosimd attribute_hidden;
libc_ifunc (__memchr,
((IS_EMAG (midr)
? __memchr_nosimd
: __memchr_generic)));
# undef memchr
strong_alias (__memchr, memchr);
#endif

sysdeps/aarch64/multiarch/memchr_generic.S (new file)

@@ -0,0 +1,33 @@
/* Memchr for aarch64, default version for internal use.
Copyright (C) 2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# define MEMCHR __memchr_generic
/* Do not hide the generic version of memchr, we use it internally. */
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name)
/* Add a hidden definition for use within libc.so. */
# ifdef SHARED
.globl __GI_memchr; __GI_memchr = __memchr_generic
# endif
#endif
# include "../memchr.S"

sysdeps/aarch64/multiarch/memchr_nosimd.S (new file)

@@ -0,0 +1,223 @@
/* memchr - find a character in a memory zone using base integer registers
Copyright (C) 2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Assumptions:
*
* ARMv8-a, AArch64
* Use base integer registers.
*/
#ifndef MEMCHR
# define MEMCHR __memchr_nosimd
#endif
/* Arguments and results. */
#define srcin x0
#define chrin x1
#define cntin x2
#define result x0
#define repchr x1
#define tmp1 x2
#define tmp2 x3
#define tmp3 x4
#define tmp4 x5
#define src x6
#define srcend x7
#define srcend16 x8
#define anymore x9
#define zeroones x10
#define data1 x11
#define data2 x12
#define has_chr1 x13
#define has_chr2 x14
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
ENTRY_ALIGN (MEMCHR, 6)
DELOUSE (0)
DELOUSE (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(none_chr)
/* Start address is 16-byte aligned or not? */
tst srcin, 15
bic src, srcin, 15
mov zeroones, REP8_01
and repchr, chrin, 255
/* Generate a qword integer as |c|c|c|c|c|c|c|c|. */
mul repchr, repchr, zeroones
add srcend, srcin, cntin
/*
* srcend16 is address of the block following the last block.
*
* [A block is 16-byte aligned and sized.]
*/
add srcend16, srcend, 15
bic srcend16, srcend16, 15
b.eq L(loop)
/* Load the first block containing start address. */
ldp data1, data2, [src], 16
lsl tmp1, srcin, 3
mov tmp2, ~0
#ifdef __AARCH64EB__
lsr tmp3, tmp2, tmp1
#else
lsl tmp3, tmp2, tmp1
#endif
/* Start address is in the first or the second qword? */
tst srcin, 8
/*
* Transform any byte in the block to zero using XOR operation,
* if that byte equals the char to search. In this way, searching
* the char becomes detecting zero in the resulting two qwords.
*/
eor data1, data1, repchr
eor data2, data2, repchr
/*
* Set those unused bytes (before the start address) to 0xff, so
* that they will not hit any zero detection.
*/
orn tmp1, data1, tmp3
orn tmp2, data2, tmp3
csinv data1, tmp1, xzr, eq
csel data2, data2, tmp2, eq
/*
* When the first and last block are the same, there are two cases:
* o. Memory range to search is just in one block.
* ( start address - end address) < 0
*
* o. Memory range is so large that the end address wraps around.
* ( start address - end address) > 0
*/
cmp srcin, srcend
ccmp src, srcend16, 0, mi
csetm anymore, ne
b L(find_chr)
.p2align 4
L(loop):
ldp data1, data2, [src], 16
subs anymore, src, srcend16
/*
* Transform any byte in the block to zero using XOR operation,
* if that byte equals the char to search.
*/
eor data1, data1, repchr
eor data2, data2, repchr
L(find_chr):
/*
* Use the following integer test to find out if any byte in a
* qword is zero. If the qword does not contain a zero-valued byte,
* the test result is zero.
*
* (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
* =
* (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f)
*
*/
sub tmp1, data1, zeroones
sub tmp2, data2, zeroones
orr tmp3, data1, REP8_7f
orr tmp4, data2, REP8_7f
bic has_chr1, tmp1, tmp3
bic has_chr2, tmp2, tmp4
orr tmp1, has_chr1, has_chr2
ccmp tmp1, 0, 0, ne
b.eq L(loop)
cbz has_chr1, 1f
sub result, src, 16
#ifdef __AARCH64EB__
rev data1, data1
#else
rev has_chr1, has_chr1
#endif
b L(done)
1: cbz has_chr2, L(none_chr)
sub result, src, 8
#ifdef __AARCH64EB__
rev data1, data2
#else
rev has_chr1, has_chr2
#endif
L(done):
#ifdef __AARCH64EB__
/*
* For big-endian, we cannot directly use has_chr1/has_chr2, because
* the two qwords have been byte-reversed after loading from memory.
* Thus, we have to perform char detection on the two qwords again,
* byte-swapped this time.
*/
sub tmp1, data1, zeroones
orr tmp3, data1, REP8_7f
bic has_chr1, tmp1, tmp3
rev has_chr1, has_chr1
#endif
/*
* If the specified char is found in a qword, the corresponding
* byte in has_chr has its most significant bit set, but this is
* only guaranteed for the first occurrence, not for later ones.
*/
cmp anymore, 0
clz tmp1, has_chr1
add result, result, tmp1, lsr 3
ccmp result, srcend, 8, eq /* NZCV = 8000 */
csel result, result, xzr, mi
ret
L(none_chr):
mov result, 0
ret
END (MEMCHR)
libc_hidden_builtin_def (MEMCHR)