5988765741
The find_first_set and find_last_set methods are not optimal for NEON; they can be improved by synthesizing the reduction with a horizontal add (vaddv), which reduces the generated assembly code. In the following case, vaddvq_s16 generates 2 instructions, whereas the repeated vpadd_s16 sequence generates 4 instructions: # vaddvq_s16 vaddvq_s16(__asint); // addv h0, v1.8h // smov w1, v0.h[0] # vpadd_s16 vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero), __zero)[0] // addp v1.8h,v1.8h,v2.8h // addp v1.8h,v1.8h,v2.8h // addp v1.8h,v1.8h,v2.8h // smov w1, v1.h[0] # libstdc++-v3/ChangeLog: * include/experimental/bits/simd_neon.h: Replace repeated vpadd calls with a single vaddv for aarch64. |
||
---|---|---|
.. | ||
bits | ||
algorithm | ||
any | ||
array | ||
buffer | ||
chrono | ||
deque | ||
executor | ||
filesystem | ||
forward_list | ||
functional | ||
internet | ||
io_context | ||
iterator | ||
list | ||
map | ||
memory | ||
memory_resource | ||
net | ||
netfwd | ||
numeric | ||
optional | ||
propagate_const | ||
random | ||
ratio | ||
regex | ||
set | ||
simd | ||
socket | ||
source_location | ||
string | ||
string_view | ||
system_error | ||
timer | ||
tuple | ||
type_traits | ||
unordered_map | ||
unordered_set | ||
utility | ||
vector |