aarch64: Use memcpy to copy structures in vst2[q]_lane intrinsics
Use __builtin_memcpy to copy vector structures instead of using a
union - or constructing a new opaque structure one vector at a time -
in each of the vst2[q]_lane Neon intrinsics in arm_neon.h.

Add new code generation tests to verify that superfluous move
instructions are not generated for the vst2q_lane intrinsics.

gcc/ChangeLog:

2021-07-30  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/arm_neon.h (__ST2_LANE_FUNC): Delete.
	(__ST2Q_LANE_FUNC): Delete.
	(vst2_lane_f16): Use __builtin_memcpy to copy vector
	structure instead of constructing __builtin_aarch64_simd_oi
	one vector at a time.
	(vst2_lane_f32): Likewise.
	(vst2_lane_f64): Likewise.
	(vst2_lane_p8): Likewise.
	(vst2_lane_p16): Likewise.
	(vst2_lane_p64): Likewise.
	(vst2_lane_s8): Likewise.
	(vst2_lane_s16): Likewise.
	(vst2_lane_s32): Likewise.
	(vst2_lane_s64): Likewise.
	(vst2_lane_u8): Likewise.
	(vst2_lane_u16): Likewise.
	(vst2_lane_u32): Likewise.
	(vst2_lane_u64): Likewise.
	(vst2_lane_bf16): Likewise.
	(vst2q_lane_f16): Use __builtin_memcpy to copy vector
	structure instead of using a union.
	(vst2q_lane_f32): Likewise.
	(vst2q_lane_f64): Likewise.
	(vst2q_lane_p8): Likewise.
	(vst2q_lane_p16): Likewise.
	(vst2q_lane_p64): Likewise.
	(vst2q_lane_s8): Likewise.
	(vst2q_lane_s16): Likewise.
	(vst2q_lane_s32): Likewise.
	(vst2q_lane_s64): Likewise.
	(vst2q_lane_u8): Likewise.
	(vst2q_lane_u16): Likewise.
	(vst2q_lane_u32): Likewise.
	(vst2q_lane_u64): Likewise.
	(vst2q_lane_bf16): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
	tests.
parent 344f879c66
commit 1deb0818f4
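As an illustration of what the new code generation tests check, consider a
hypothetical caller such as the one below (not part of the patch; the function
name is made up). With the __builtin_memcpy-based vst2q_lane implementation it
is expected to assemble to a single lane store, something like
"st2 {v0.h, v1.h}[3], [x0]", with no superfluous mov instructions shuffling
the two vectors into a register pair first.

#include <arm_neon.h>

/* Hypothetical example caller, not part of this commit.  */
void
store_two_lanes (int16_t *ptr, int16x8x2_t val)
{
  /* Store lane 3 of both vectors, interleaved, at ptr.  */
  vst2q_lane_s16 (ptr, val, 3);
}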
gcc/config/aarch64/arm_neon.h
@@ -9206,84 +9206,355 @@ __STRUCTN (float, 64, 4)
 #undef __STRUCTN
 
-#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
-			qmode, ptr_mode, funcsuffix, signedtype)	     \
-__extension__ extern __inline void					     \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	     \
-vst2_lane_ ## funcsuffix (ptrtype *__ptr,				     \
-			  intype __b, const int __c)			     \
-{									     \
-  __builtin_aarch64_simd_oi __o;					     \
-  largetype __temp;							     \
-  __temp.val[0]								     \
-    = vcombine_##funcsuffix (__b.val[0],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[1]								     \
-    = vcombine_##funcsuffix (__b.val[1],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
-					     (signedtype) __temp.val[0], 0); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
-					     (signedtype) __temp.val[1], 1); \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-				    __ptr, __o, __c);			     \
-}
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_f16 (float16_t *__ptr, float16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t __temp;
+  __temp.val[0] = vcombine_f16 (__val.val[0],
+				vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_f16 (__val.val[1],
+				vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o,
+				  __lane);
+}
 
-__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
-		 float16x8_t)
-__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
-		 float32x4_t)
-__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
-		 float64x2_t)
-__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
-		 int16x8_t)
-__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
-		 poly64x2_t)
-__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
-		 int16x8_t)
-__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
-		 int32x4_t)
-__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
-		 int64x2_t)
-
-#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	     \
-__extension__ extern __inline void					     \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	     \
-vst2q_lane_ ## funcsuffix (ptrtype *__ptr,				     \
-			   intype __b, const int __c)			     \
-{									     \
-  union { intype __i;							     \
-	  __builtin_aarch64_simd_oi __o; } __temp = { __b };		     \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-				    __ptr, __temp.__o, __c);		     \
-}
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_f32 (float32_t *__ptr, float32x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t __temp;
+  __temp.val[0] = vcombine_f32 (__val.val[0],
+				vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_f32 (__val.val[1],
+				vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o,
+				  __lane);
+}
 
-__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
-__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
-__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
-__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
-__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
-__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
-__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
-__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
-__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
-__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
-__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
-__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_f64 (float64_t *__ptr, float64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t __temp;
+  __temp.val[0] = vcombine_f64 (__val.val[0],
+				vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_f64 (__val.val[1],
+				vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_p8 (poly8_t *__ptr, poly8x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t __temp;
+  __temp.val[0] = vcombine_p8 (__val.val[0],
+			       vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_p8 (__val.val[1],
+			       vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_p16 (poly16_t *__ptr, poly16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t __temp;
+  __temp.val[0] = vcombine_p16 (__val.val[0],
+				vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_p16 (__val.val[1],
+				vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_p64 (poly64_t *__ptr, poly64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t __temp;
+  __temp.val[0] = vcombine_p64 (__val.val[0],
+				vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_p64 (__val.val[1],
+				vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s8 (int8_t *__ptr, int8x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t __temp;
+  __temp.val[0] = vcombine_s8 (__val.val[0],
+			       vcreate_s8 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_s8 (__val.val[1],
+			       vcreate_s8 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s16 (int16_t *__ptr, int16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t __temp;
+  __temp.val[0] = vcombine_s16 (__val.val[0],
+				vcreate_s16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_s16 (__val.val[1],
+				vcreate_s16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s32 (int32_t *__ptr, int32x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t __temp;
+  __temp.val[0] = vcombine_s32 (__val.val[0],
+				vcreate_s32 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_s32 (__val.val[1],
+				vcreate_s32 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s64 (int64_t *__ptr, int64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t __temp;
+  __temp.val[0] = vcombine_s64 (__val.val[0],
+				vcreate_s64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_s64 (__val.val[1],
+				vcreate_s64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u8 (uint8_t *__ptr, uint8x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t __temp;
+  __temp.val[0] = vcombine_u8 (__val.val[0],
+			       vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_u8 (__val.val[1],
+			       vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u16 (uint16_t *__ptr, uint16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t __temp;
+  __temp.val[0] = vcombine_u16 (__val.val[0],
+				vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_u16 (__val.val[1],
+				vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u32 (uint32_t *__ptr, uint32x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t __temp;
+  __temp.val[0] = vcombine_u32 (__val.val[0],
+				vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_u32 (__val.val[1],
+				vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u64 (uint64_t *__ptr, uint64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t __temp;
+  __temp.val[0] = vcombine_u64 (__val.val[0],
+				vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_u64 (__val.val[1],
+				vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_f16 (float16_t *__ptr, float16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_f32 (float32_t *__ptr, float32x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_f64 (float64_t *__ptr, float64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_p8 (poly8_t *__ptr, poly8x16x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				   __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_p16 (poly16_t *__ptr, poly16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_p64 (poly64_t *__ptr, poly64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s8 (int8_t *__ptr, int8x16x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				   __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s16 (int16_t *__ptr, int16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s32 (int32_t *__ptr, int32x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s64 (int64_t *__ptr, int64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u8 (uint8_t *__ptr, uint8x16x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				   __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u16 (uint16_t *__ptr, uint16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u32 (uint32_t *__ptr, uint32x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u64 (uint64_t *__ptr, uint64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
+				  __lane);
+}
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -34334,9 +34605,30 @@ __LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
 		 v8bf, bf, bf16, bfloat16x8_t)
 __LD4Q_LANE_FUNC (bfloat16x8x2_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
 
-__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf,
-		 bf16, bfloat16x8_t)
-__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  bfloat16x8x2_t __temp;
+  __temp.val[0] = vcombine_bf16 (__val.val[0],
+				 vcreate_bf16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1] = vcombine_bf16 (__val.val[1],
+				 vcreate_bf16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o,
+				  __lane);
+}
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -34613,7 +34905,5 @@ vaddq_p128 (poly128_t __a, poly128_t __b)
 #undef __LD3Q_LANE_FUNC
 #undef __LD4_LANE_FUNC
 #undef __LD4Q_LANE_FUNC
-#undef __ST2_LANE_FUNC
-#undef __ST2Q_LANE_FUNC
 
 #endif
gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -161,6 +161,22 @@ TEST_STX_LANE (vst4q_lane, uint64x2x4_t, uint64_t*, u64);
 TEST_STX_LANE (vst4q_lane, float64x2x4_t, float64_t*, f64);
 TEST_STX_LANE (vst4q_lane, poly64x2x4_t, poly64_t*, p64);
 
+TEST_STX_LANE (vst2q_lane, int8x16x2_t, int8_t*, s8);
+TEST_STX_LANE (vst2q_lane, uint8x16x2_t, uint8_t*, u8);
+TEST_STX_LANE (vst2q_lane, poly8x16x2_t, poly8_t*, p8);
+TEST_STX_LANE (vst2q_lane, int16x8x2_t, int16_t*, s16);
+TEST_STX_LANE (vst2q_lane, uint16x8x2_t, uint16_t*, u16);
+TEST_STX_LANE (vst2q_lane, poly16x8x2_t, poly16_t*, p16);
+TEST_STX_LANE (vst2q_lane, float16x8x2_t, float16_t*, f16);
+TEST_STX_LANE (vst2q_lane, bfloat16x8x2_t, bfloat16_t*, bf16);
+TEST_STX_LANE (vst2q_lane, int32x4x2_t, int32_t*, s32);
+TEST_STX_LANE (vst2q_lane, uint32x4x2_t, uint32_t*, u32);
+TEST_STX_LANE (vst2q_lane, float32x4x2_t, float32_t*, f32);
+TEST_STX_LANE (vst2q_lane, int64x2x2_t, int64_t*, s64);
+TEST_STX_LANE (vst2q_lane, uint64x2x2_t, uint64_t*, u64);
+TEST_STX_LANE (vst2q_lane, float64x2x2_t, float64_t*, f64);
+TEST_STX_LANE (vst2q_lane, poly64x2x2_t, poly64_t*, p64);
+
 #define TEST_ST3_LANE(name, tbltype, ptrtype, ts)	\
 void test_ ## name ## _ ## ts (ptrtype a, int8x8_t dummy, tbltype b) \
 { \
@@ -247,5 +263,5 @@ TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
 /* { dg-final { scan-assembler-times "tbx\\t" 18} } */
 /* { dg-final { scan-assembler-times "st4\\t" 29} } */
 /* { dg-final { scan-assembler-times "st3\\t" 29} } */
-/* { dg-final { scan-assembler-times "st2\\t" 14} } */
+/* { dg-final { scan-assembler-times "st2\\t" 29} } */
 /* { dg-final { scan-assembler-times "st1\\t" 42} } */
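Note on the testsuite changes: the definition of the TEST_STX_LANE macro used
by the new test lines is not part of this hunk. By analogy with the visible
start of TEST_ST3_LANE, it presumably wraps a single lane-store intrinsic call
in a small function, which is what lets the scan-assembler-times directives
above count exactly one st2/st3/st4 instruction per wrapper and catch any
superfluous moves. A minimal sketch of that shape (an assumption for
illustration, not the actual testsuite source):

/* Sketch only: body guessed by analogy with TEST_ST3_LANE.  */
#define TEST_STX_LANE(name, tbltype, ptrtype, ts) \
  void test_ ## name ## _ ## ts (ptrtype a, int8x8_t dummy, tbltype b) \
  { \
    name ## _ ## ts (a, b, 0); \
  }

/* Example expansion: TEST_STX_LANE (vst2q_lane, int16x8x2_t, int16_t*, s16)
   would define test_vst2q_lane_s16, whose only operation is
   vst2q_lane_s16 (a, b, 0) and which should emit a single st2.  */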
|
Loading…
Reference in New Issue
Block a user