arm: Add vstN_lane_bf16 + vstNq_lane_bf16 intrisics

gcc/ChangeLog

2020-10-29  Andrea Corallo  <andrea.corallo@arm.com>

	* config/arm/arm_neon.h (vst2_lane_bf16, vst2q_lane_bf16)
	(vst3_lane_bf16, vst3q_lane_bf16, vst4_lane_bf16)
	(vst4q_lane_bf16): New intrinsics.
	* config/arm/arm_neon_builtins.def: Touch it for:
	__builtin_neon_vst2_lanev4bf, __builtin_neon_vst2_lanev8bf,
	__builtin_neon_vst3_lanev4bf, __builtin_neon_vst3_lanev8bf,
	__builtin_neon_vst4_lanev4bf,__builtin_neon_vst4_lanev8bf.

gcc/testsuite/ChangeLog

2020-10-29  Andrea Corallo  <andrea.corallo@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c:
	Run it also for arm-*-*.
	* gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c:
	Likewise.
	* gcc.target/arm/simd/vstn_lane_bf16_1.c: New test.
This commit is contained in:
Andrea Corallo 2020-10-29 11:20:23 +01:00
parent 1528f34341
commit ed62f3668b
9 changed files with 133 additions and 12 deletions

View File

@ -19783,6 +19783,54 @@ vld4q_lane_bf16 (const bfloat16_t * __a, bfloat16x8x4_t __b, const int __c)
return __rv.__i;
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst2_lane_bf16 (bfloat16_t * __a, bfloat16x4x2_t __b, const int __c)
{
union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
__builtin_neon_vst2_lanev4bf (__a, __bu.__o, __c);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst2q_lane_bf16 (bfloat16_t * __a, bfloat16x8x2_t __b, const int __c)
{
union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
__builtin_neon_vst2_lanev8bf (__a, __bu.__o, __c);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst3_lane_bf16 (bfloat16_t * __a, bfloat16x4x3_t __b, const int __c)
{
union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
__builtin_neon_vst3_lanev4bf (__a, __bu.__o, __c);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst3q_lane_bf16 (bfloat16_t * __a, bfloat16x8x3_t __b, const int __c)
{
union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
__builtin_neon_vst3_lanev8bf (__a, __bu.__o, __c);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst4_lane_bf16 (bfloat16_t * __a, bfloat16x4x4_t __b, const int __c)
{
union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
__builtin_neon_vst4_lanev4bf (__a, __bu.__o, __c);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst4q_lane_bf16 (bfloat16_t * __a, bfloat16x8x4_t __b, const int __c)
{
union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
__builtin_neon_vst4_lanev8bf (__a, __bu.__o, __c);
}
#pragma GCC pop_options
#ifdef __cplusplus

View File

@ -329,8 +329,8 @@ VAR11 (LOAD1LANE, vld2_lane,
VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf)
VAR13 (STORE1, vst2,
v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst2_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR11 (STORE1LANE, vst2_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR13 (LOAD1, vld3,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR11 (LOAD1LANE, vld3_lane,
@ -338,8 +338,8 @@ VAR11 (LOAD1LANE, vld3_lane,
VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf)
VAR13 (STORE1, vst3,
v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst3_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR11 (STORE1LANE, vst3_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR13 (LOAD1, vld4,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR11 (LOAD1LANE, vld4_lane,
@ -347,8 +347,8 @@ VAR11 (LOAD1LANE, vld4_lane,
VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf)
VAR13 (STORE1, vst4,
v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst4_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR11 (STORE1LANE, vst4_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR2 (TERNOP, sdot, v8qi, v16qi)
VAR2 (UTERNOP, udot, v8qi, v16qi)
VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)

View File

@ -1,4 +1,4 @@
/* { dg-do compile { target { aarch64*-*-* } } } */
/* { dg-do compile } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

View File

@ -1,4 +1,4 @@
/* { dg-do compile { target { aarch64*-*-* } } } */
/* { dg-do compile } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

View File

@ -1,4 +1,4 @@
/* { dg-do compile { target { aarch64*-*-* } } } */
/* { dg-do compile } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

View File

@ -1,4 +1,4 @@
/* { dg-do compile { target { aarch64*-*-* } } } */
/* { dg-do compile } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

View File

@ -1,4 +1,4 @@
/* { dg-do compile { target { aarch64*-*-* } } } */
/* { dg-do compile } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

View File

@ -1,4 +1,4 @@
/* { dg-do compile { target { aarch64*-*-* } } } */
/* { dg-do compile } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

View File

@ -0,0 +1,73 @@
/* { dg-do assemble } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/*
**test_vst2_lane_bf16:
** vst2.16 {d0\[2\], d1\[2\]}, \[r0\]
** bx lr
*/
void
test_vst2_lane_bf16 (bfloat16_t *a, bfloat16x4x2_t b)
{
return vst2_lane_bf16 (a, b, 2);
}
/*
**test_vst2q_lane_bf16:
** vst2.16 {d0\[2\], d2\[2\]}, \[r0\]
** bx lr
*/
void
test_vst2q_lane_bf16 (bfloat16_t *a, bfloat16x8x2_t b)
{
return vst2q_lane_bf16 (a, b, 2);
}
/*
**test_vst3_lane_bf16:
** vst3.16 {d0\[2\], d1\[2\], d2\[2\]}, \[r0\]
** bx lr
*/
void
test_vst3_lane_bf16 (bfloat16_t *a, bfloat16x4x3_t b)
{
return vst3_lane_bf16 (a, b, 2);
}
/*
**test_vst3q_lane_bf16:
** vst3.16 {d0\[2\], d2\[2\], d4\[2\]}, \[r0\]
** bx lr
*/
void
test_vst3q_lane_bf16 (bfloat16_t *a, bfloat16x8x3_t b)
{
return vst3q_lane_bf16 (a, b, 2);
}
/*
**test_vst4_lane_bf16:
** vst4.16 {d0\[2\], d1\[2\], d2\[2\], d3\[2\]}, \[r0\]
** bx lr
*/
void
test_vst4_lane_bf16 (bfloat16_t *a, bfloat16x4x4_t b)
{
return vst4_lane_bf16 (a, b, 2);
}
/*
**test_vst4q_lane_bf16:
** vst4.16 {d0\[2\], d2\[2\], d4\[2\], d6\[2\]}, \[r0\]
** bx lr
*/
void
test_vst4q_lane_bf16 (bfloat16_t *a, bfloat16x8x4_t b)
{
return vst4q_lane_bf16 (a, b, 2);
}