|
|
|
@ -152,6 +152,16 @@ typedef struct uint64x2x2_t
|
|
|
|
|
uint64x2_t val[2];
|
|
|
|
|
} uint64x2x2_t;
|
|
|
|
|
|
|
|
|
|
/* Tuple of two float16x4_t vectors, addressed as val[0] and val[1].
   Returned by vld2_f16 / vld2_dup_f16 and consumed by vst2_f16.  */
typedef struct float16x4x2_t
|
|
|
|
|
{
|
|
|
|
|
float16x4_t val[2];
|
|
|
|
|
} float16x4x2_t;
|
|
|
|
|
|
|
|
|
|
/* Tuple of two float16x8_t vectors, addressed as val[0] and val[1].
   Returned by vld2q_f16 / vld2q_dup_f16 and consumed by vst2q_f16.  */
typedef struct float16x8x2_t
|
|
|
|
|
{
|
|
|
|
|
float16x8_t val[2];
|
|
|
|
|
} float16x8x2_t;
|
|
|
|
|
|
|
|
|
|
typedef struct float32x2x2_t
|
|
|
|
|
{
|
|
|
|
|
float32x2_t val[2];
|
|
|
|
@ -272,6 +282,16 @@ typedef struct uint64x2x3_t
|
|
|
|
|
uint64x2_t val[3];
|
|
|
|
|
} uint64x2x3_t;
|
|
|
|
|
|
|
|
|
|
/* Tuple of three float16x4_t vectors, addressed as val[0] .. val[2].
   Returned by vld3_f16 / vld3_dup_f16 and consumed by vst3_f16.  */
typedef struct float16x4x3_t
|
|
|
|
|
{
|
|
|
|
|
float16x4_t val[3];
|
|
|
|
|
} float16x4x3_t;
|
|
|
|
|
|
|
|
|
|
/* Tuple of three float16x8_t vectors, addressed as val[0] .. val[2].
   Returned by vld3q_f16 / vld3q_dup_f16 and consumed by vst3q_f16.  */
typedef struct float16x8x3_t
|
|
|
|
|
{
|
|
|
|
|
float16x8_t val[3];
|
|
|
|
|
} float16x8x3_t;
|
|
|
|
|
|
|
|
|
|
typedef struct float32x2x3_t
|
|
|
|
|
{
|
|
|
|
|
float32x2_t val[3];
|
|
|
|
@ -392,6 +412,16 @@ typedef struct uint64x2x4_t
|
|
|
|
|
uint64x2_t val[4];
|
|
|
|
|
} uint64x2x4_t;
|
|
|
|
|
|
|
|
|
|
/* Tuple of four float16x4_t vectors, addressed as val[0] .. val[3].
   Returned by vld4_f16 / vld4_dup_f16 and consumed by vst4_f16.  */
typedef struct float16x4x4_t
|
|
|
|
|
{
|
|
|
|
|
float16x4_t val[4];
|
|
|
|
|
} float16x4x4_t;
|
|
|
|
|
|
|
|
|
|
/* Tuple of four float16x8_t vectors, addressed as val[0] .. val[3].
   Returned by vld4q_f16 / vld4q_dup_f16 and consumed by vst4q_f16.  */
typedef struct float16x8x4_t
|
|
|
|
|
{
|
|
|
|
|
float16x8_t val[4];
|
|
|
|
|
} float16x8x4_t;
|
|
|
|
|
|
|
|
|
|
typedef struct float32x2x4_t
|
|
|
|
|
{
|
|
|
|
|
float32x2_t val[4];
|
|
|
|
@ -2643,6 +2673,12 @@ vcreate_s64 (uint64_t __a)
|
|
|
|
|
return (int64x1_t) {__a};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vcreate_f16 (uint64_t __a)
|
|
|
|
|
{
|
|
|
|
|
return (float16x4_t) __a;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vcreate_f32 (uint64_t __a)
|
|
|
|
|
{
|
|
|
|
@ -4779,6 +4815,12 @@ vcombine_s64 (int64x1_t __a, int64x1_t __b)
|
|
|
|
|
return __builtin_aarch64_combinedi (__a[0], __b[0]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
|
|
|
|
|
vcombine_f16 (float16x4_t __a, float16x4_t __b)
|
|
|
|
|
{
|
|
|
|
|
return __builtin_aarch64_combinev4hf (__a, __b);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vcombine_f32 (float32x2_t __a, float32x2_t __b)
|
|
|
|
|
{
|
|
|
|
@ -9907,7 +9949,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|uint | Y | Y | N | N |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|float | - | - | N | N |
|
|
|
|
|
|float | - | Y | N | N |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|poly | Y | Y | - | - |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
@ -9921,7 +9963,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|uint | Y | Y | Y | Y |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|float | - | - | Y | Y |
|
|
|
|
|
|float | - | Y | Y | Y |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|poly | Y | Y | - | - |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
@ -9935,7 +9977,7 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|uint | Y | N | N | Y |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|float | - | - | N | Y |
|
|
|
|
|
|float | - | N | N | Y |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
|
|poly | Y | N | - | - |
|
|
|
|
|
+------+----+----+----+----+
|
|
|
|
@ -9951,6 +9993,7 @@ __STRUCTN (int, 8, 2)
|
|
|
|
|
__STRUCTN (int, 16, 2)
|
|
|
|
|
__STRUCTN (uint, 8, 2)
|
|
|
|
|
__STRUCTN (uint, 16, 2)
|
|
|
|
|
__STRUCTN (float, 16, 2)
|
|
|
|
|
__STRUCTN (poly, 8, 2)
|
|
|
|
|
__STRUCTN (poly, 16, 2)
|
|
|
|
|
/* 3-element structs. */
|
|
|
|
@ -9962,6 +10005,7 @@ __STRUCTN (uint, 8, 3)
|
|
|
|
|
__STRUCTN (uint, 16, 3)
|
|
|
|
|
__STRUCTN (uint, 32, 3)
|
|
|
|
|
__STRUCTN (uint, 64, 3)
|
|
|
|
|
__STRUCTN (float, 16, 3)
|
|
|
|
|
__STRUCTN (float, 32, 3)
|
|
|
|
|
__STRUCTN (float, 64, 3)
|
|
|
|
|
__STRUCTN (poly, 8, 3)
|
|
|
|
@ -9999,6 +10043,8 @@ vst2_lane_ ## funcsuffix (ptrtype *__ptr, \
|
|
|
|
|
__ptr, __o, __c); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate __ST2_LANE_FUNC for the float16 and float32 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vst2_lane_<suffix> function
   (f16, f32 here) -- confirm against the macro definition.  */
__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
|
|
|
|
|
float16x8_t)
|
|
|
|
|
__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
|
|
|
|
|
float32x4_t)
|
|
|
|
|
__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
|
|
|
|
@ -10037,6 +10083,7 @@ vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
|
|
|
|
|
__ptr, __temp.__o, __c); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate the five-argument __ST2_LANE_FUNC for the 128-bit float16,
   float32, float64 and poly8 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vst2q_lane_<suffix> function
   -- confirm against the macro definition.  */
__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
|
|
|
|
|
__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
|
|
|
|
|
__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
|
|
|
|
|
__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
|
|
|
|
@ -10078,6 +10125,8 @@ vst3_lane_ ## funcsuffix (ptrtype *__ptr, \
|
|
|
|
|
__ptr, __o, __c); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate __ST3_LANE_FUNC for the float16 and float32 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vst3_lane_<suffix> function
   (f16, f32 here) -- confirm against the macro definition.  */
__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16,
|
|
|
|
|
float16x8_t)
|
|
|
|
|
__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32,
|
|
|
|
|
float32x4_t)
|
|
|
|
|
__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64,
|
|
|
|
@ -10116,6 +10165,7 @@ vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \
|
|
|
|
|
__ptr, __temp.__o, __c); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate the five-argument __ST3_LANE_FUNC for the 128-bit float16,
   float32, float64 and poly8 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vst3q_lane_<suffix> function
   -- confirm against the macro definition.  */
__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
|
|
|
|
|
__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
|
|
|
|
|
__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
|
|
|
|
|
__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
|
|
|
|
@ -10162,6 +10212,8 @@ vst4_lane_ ## funcsuffix (ptrtype *__ptr, \
|
|
|
|
|
__ptr, __o, __c); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate __ST4_LANE_FUNC for the float16 and float32 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vst4_lane_<suffix> function
   (f16, f32 here) -- confirm against the macro definition.  */
__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16,
|
|
|
|
|
float16x8_t)
|
|
|
|
|
__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32,
|
|
|
|
|
float32x4_t)
|
|
|
|
|
__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64,
|
|
|
|
@ -10200,6 +10252,7 @@ vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \
|
|
|
|
|
__ptr, __temp.__o, __c); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate the five-argument __ST4_LANE_FUNC for the 128-bit float16,
   float32, float64 and poly8 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vst4q_lane_<suffix> function
   -- confirm against the macro definition.  */
__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
|
|
|
|
|
__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
|
|
|
|
|
__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
|
|
|
|
|
__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
|
|
|
|
@ -15255,6 +15308,17 @@ vld2_u32 (const uint32_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x4x2_t ret;
|
|
|
|
|
__builtin_aarch64_simd_oi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld2v4hf (__a);
|
|
|
|
|
ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
|
|
|
|
|
ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -15376,6 +15440,17 @@ vld2q_u64 (const uint64_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2q_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x8x2_t ret;
|
|
|
|
|
__builtin_aarch64_simd_oi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld2v8hf (__a);
|
|
|
|
|
ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
|
|
|
|
|
ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2q_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -15530,6 +15605,18 @@ vld3_u32 (const uint32_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x4x3_t ret;
|
|
|
|
|
__builtin_aarch64_simd_ci __o;
|
|
|
|
|
__o = __builtin_aarch64_ld3v4hf (__a);
|
|
|
|
|
ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0);
|
|
|
|
|
ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1);
|
|
|
|
|
ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -15662,6 +15749,18 @@ vld3q_u64 (const uint64_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3q_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x8x3_t ret;
|
|
|
|
|
__builtin_aarch64_simd_ci __o;
|
|
|
|
|
__o = __builtin_aarch64_ld3v8hf (__a);
|
|
|
|
|
ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0);
|
|
|
|
|
ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1);
|
|
|
|
|
ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3q_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -15829,6 +15928,19 @@ vld4_u32 (const uint32_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x4x4_t ret;
|
|
|
|
|
__builtin_aarch64_simd_xi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld4v4hf (__a);
|
|
|
|
|
ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0);
|
|
|
|
|
ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1);
|
|
|
|
|
ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2);
|
|
|
|
|
ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -15972,6 +16084,19 @@ vld4q_u64 (const uint64_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4q_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x8x4_t ret;
|
|
|
|
|
__builtin_aarch64_simd_xi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld4v8hf (__a);
|
|
|
|
|
ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0);
|
|
|
|
|
ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1);
|
|
|
|
|
ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
|
|
|
|
|
ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4q_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -16033,6 +16158,17 @@ vld2_dup_s32 (const int32_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2_dup_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x4x2_t ret;
|
|
|
|
|
__builtin_aarch64_simd_oi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
|
|
|
|
|
ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
|
|
|
|
|
ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2_dup_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -16242,6 +16378,17 @@ vld2q_dup_u64 (const uint64_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2q_dup_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x8x2_t ret;
|
|
|
|
|
__builtin_aarch64_simd_oi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
|
|
|
|
|
ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
|
|
|
|
|
ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld2q_dup_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -16396,6 +16543,18 @@ vld3_dup_u32 (const uint32_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3_dup_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x4x3_t ret;
|
|
|
|
|
__builtin_aarch64_simd_ci __o;
|
|
|
|
|
__o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
|
|
|
|
|
ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
|
|
|
|
|
ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
|
|
|
|
|
ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3_dup_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -16528,6 +16687,18 @@ vld3q_dup_u64 (const uint64_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3q_dup_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x8x3_t ret;
|
|
|
|
|
__builtin_aarch64_simd_ci __o;
|
|
|
|
|
__o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
|
|
|
|
|
ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
|
|
|
|
|
ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
|
|
|
|
|
ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld3q_dup_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -16695,6 +16866,19 @@ vld4_dup_u32 (const uint32_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4_dup_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x4x4_t ret;
|
|
|
|
|
__builtin_aarch64_simd_xi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
|
|
|
|
|
ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
|
|
|
|
|
ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
|
|
|
|
|
ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
|
|
|
|
|
ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4_dup_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -16838,6 +17022,19 @@ vld4q_dup_u64 (const uint64_t * __a)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4q_dup_f16 (const float16_t * __a)
|
|
|
|
|
{
|
|
|
|
|
float16x8x4_t ret;
|
|
|
|
|
__builtin_aarch64_simd_xi __o;
|
|
|
|
|
__o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
|
|
|
|
|
ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
|
|
|
|
|
ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
|
|
|
|
|
ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
|
|
|
|
|
ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
|
|
|
|
|
vld4q_dup_f32 (const float32_t * __a)
|
|
|
|
|
{
|
|
|
|
@ -16890,6 +17087,8 @@ vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
|
|
|
|
|
return __b; \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate __LD2_LANE_FUNC for the float16 and float32 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vld2_lane_<suffix> function
   (f16, f32 here) -- confirm against the macro definition.  */
__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf,
|
|
|
|
|
v8hf, hf, f16, float16x8_t)
|
|
|
|
|
__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf,
|
|
|
|
|
sf, f32, float32x4_t)
|
|
|
|
|
__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df,
|
|
|
|
@ -16934,6 +17133,7 @@ vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
|
|
|
|
|
return ret; \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate the six-argument __LD2_LANE_FUNC for the 128-bit float16,
   float32, float64 and poly8 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vld2q_lane_<suffix> function
   -- confirm against the macro definition.  */
__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
|
|
|
|
|
__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
|
|
|
|
|
__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
|
|
|
|
|
__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
|
|
|
|
@ -16981,6 +17181,8 @@ vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
|
|
|
|
|
return __b; \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate __LD3_LANE_FUNC for the float16 and float32 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vld3_lane_<suffix> function
   (f16, f32 here) -- confirm against the macro definition.  */
__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
|
|
|
|
|
v8hf, hf, f16, float16x8_t)
|
|
|
|
|
__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf,
|
|
|
|
|
sf, f32, float32x4_t)
|
|
|
|
|
__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df,
|
|
|
|
@ -17027,6 +17229,7 @@ vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
|
|
|
|
|
return ret; \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate the six-argument __LD3_LANE_FUNC for the 128-bit float16,
   float32, float64 and poly8 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vld3q_lane_<suffix> function
   -- confirm against the macro definition.  */
__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
|
|
|
|
|
__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
|
|
|
|
|
__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
|
|
|
|
|
__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
|
|
|
|
@ -17082,6 +17285,8 @@ vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
|
|
|
|
|
|
|
|
|
|
/* vld4_lane */
|
|
|
|
|
|
|
|
|
|
/* Instantiate __LD4_LANE_FUNC for the float16 and float32 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vld4_lane_<suffix> function
   (f16, f32 here) -- confirm against the macro definition.  */
__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
|
|
|
|
|
v8hf, hf, f16, float16x8_t)
|
|
|
|
|
__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf,
|
|
|
|
|
sf, f32, float32x4_t)
|
|
|
|
|
__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df,
|
|
|
|
@ -17130,6 +17335,7 @@ vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
|
|
|
|
|
return ret; \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Instantiate the six-argument __LD4_LANE_FUNC for the 128-bit float16,
   float32, float64 and poly8 tuple types.
   NOTE(review): the macro body is outside this excerpt; the preceding
   context suggests each use defines a vld4q_lane_<suffix> function
   -- confirm against the macro definition.  */
__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
|
|
|
|
|
__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
|
|
|
|
|
__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
|
|
|
|
|
__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
|
|
|
|
@ -22490,6 +22696,18 @@ vst2_u32 (uint32_t * __a, uint32x2x2_t val)
|
|
|
|
|
__builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst2_f16 (float16_t * __a, float16x4x2_t val)
|
|
|
|
|
{
|
|
|
|
|
__builtin_aarch64_simd_oi __o;
|
|
|
|
|
float16x8x2_t temp;
|
|
|
|
|
temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
__o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
|
|
|
|
|
__o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
|
|
|
|
|
__builtin_aarch64_st2v4hf (__a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst2_f32 (float32_t * __a, float32x2x2_t val)
|
|
|
|
|
{
|
|
|
|
@ -22592,6 +22810,15 @@ vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
|
|
|
|
|
__builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst2q_f16 (float16_t * __a, float16x8x2_t val)
|
|
|
|
|
{
|
|
|
|
|
__builtin_aarch64_simd_oi __o;
|
|
|
|
|
__o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
|
|
|
|
|
__o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
|
|
|
|
|
__builtin_aarch64_st2v8hf (__a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst2q_f32 (float32_t * __a, float32x4x2_t val)
|
|
|
|
|
{
|
|
|
|
@ -22764,6 +22991,20 @@ vst3_u32 (uint32_t * __a, uint32x2x3_t val)
|
|
|
|
|
__builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst3_f16 (float16_t * __a, float16x4x3_t val)
|
|
|
|
|
{
|
|
|
|
|
__builtin_aarch64_simd_ci __o;
|
|
|
|
|
float16x8x3_t temp;
|
|
|
|
|
temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
__o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
|
|
|
|
|
__o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
|
|
|
|
|
__o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
|
|
|
|
|
__builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst3_f32 (float32_t * __a, float32x2x3_t val)
|
|
|
|
|
{
|
|
|
|
@ -22878,6 +23119,16 @@ vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
|
|
|
|
|
__builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst3q_f16 (float16_t * __a, float16x8x3_t val)
|
|
|
|
|
{
|
|
|
|
|
__builtin_aarch64_simd_ci __o;
|
|
|
|
|
__o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
|
|
|
|
|
__o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
|
|
|
|
|
__o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
|
|
|
|
|
__builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst3q_f32 (float32_t * __a, float32x4x3_t val)
|
|
|
|
|
{
|
|
|
|
@ -23074,6 +23325,22 @@ vst4_u32 (uint32_t * __a, uint32x2x4_t val)
|
|
|
|
|
__builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst4_f16 (float16_t * __a, float16x4x4_t val)
|
|
|
|
|
{
|
|
|
|
|
__builtin_aarch64_simd_xi __o;
|
|
|
|
|
float16x8x4_t temp;
|
|
|
|
|
temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0)));
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0);
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1);
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2);
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3);
|
|
|
|
|
__builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst4_f32 (float32_t * __a, float32x2x4_t val)
|
|
|
|
|
{
|
|
|
|
@ -23200,6 +23467,17 @@ vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
|
|
|
|
|
__builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst4q_f16 (float16_t * __a, float16x8x4_t val)
|
|
|
|
|
{
|
|
|
|
|
__builtin_aarch64_simd_xi __o;
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0);
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1);
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2);
|
|
|
|
|
__o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3);
|
|
|
|
|
__builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__extension__ static __inline void __attribute__ ((__always_inline__))
|
|
|
|
|
vst4q_f32 (float32_t * __a, float32x4x4_t val)
|
|
|
|
|
{
|
|
|
|
|