libstdc++: Improve "find_first/last_set" for NEON

The find_first_set and find_last_set methods are not optimal for NEON.
They can be improved by synthesizing them with horizontal adds (vaddv),
which reduces the generated assembly code. In the following case,
vaddvq_s16 generates 2 instructions, whereas the chained vpaddq_s16
calls generate 4 instructions:

 # vaddvq_s16
    vaddvq_s16(__asint);
    //  addv    h0, v1.8h
    //  smov    w1, v0.h[0]
 # vpadd_s16
    vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero), __zero)[0]
    // addp v1.8h,v1.8h,v2.8h
    // addp v1.8h,v1.8h,v2.8h
    // addp v1.8h,v1.8h,v2.8h
    // smov    w1, v1.h[0]
 #

libstdc++-v3/ChangeLog:

	* include/experimental/bits/simd_neon.h: Replace repeated vpadd
	calls with a single vaddv for aarch64.
This commit is contained in:
yaozhongxiao 2021-02-03 15:49:30 +00:00 committed by Jonathan Wakely
parent 3de9bd16c9
commit 5988765741

View File

@ -311,8 +311,7 @@ struct _MaskImplNeonMixin
});
__asint &= __bitsel;
#ifdef __aarch64__
return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero),
__zero)[0];
return vaddvq_s16(__asint);
#else
return vpadd_s16(
vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
@ -328,7 +327,7 @@ struct _MaskImplNeonMixin
});
__asint &= __bitsel;
#ifdef __aarch64__
return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
return vaddvq_s32(__asint);
#else
return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
__zero)[0];
@ -351,8 +350,12 @@ struct _MaskImplNeonMixin
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddv_s8(__asint);
#else
return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
__zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 2)
{
@ -362,12 +365,20 @@ struct _MaskImplNeonMixin
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddv_s16(__asint);
#else
return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 4)
{
__asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
return vaddv_s32(__asint);
#else
return vpadd_s32(__asint, __zero)[0];
#endif
}
else
__assert_unreachable<_Tp>();