aarch64: ACLE intrinsics bfmmla and bfmlal<b/t>
This patch adds the ARMv8.6 ACLE intrinsics for bfmmla, bfmlalb and bfmlalt as part of the BFloat16 extension. (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) The intrinsics are declared in arm_neon.h and the RTL patterns are defined in aarch64-simd.md. Two new tests are added to check assembler output. 2020-02-06 Delia Burduv <delia.burduv@arm.com> gcc/ * config/aarch64/aarch64-simd-builtins.def (bfmlaq): New built-in function. (bfmlalb): New built-in function. (bfmlalt): New built-in function. (bfmlalb_lane): New built-in function. (bfmlalt_lane): New built-in function. * config/aarch64/aarch64-simd.md (aarch64_bfmmlaqv4sf): New pattern. (aarch64_bfmlal<bt>v4sf): New pattern. (aarch64_bfmlal<bt>_lane<q>v4sf): New pattern. * config/aarch64/arm_neon.h (vbfmmlaq_f32): New intrinsic. (vbfmlalbq_f32): New intrinsic. (vbfmlaltq_f32): New intrinsic. (vbfmlalbq_lane_f32): New intrinsic. (vbfmlaltq_lane_f32): New intrinsic. (vbfmlalbq_laneq_f32): New intrinsic. (vbfmlaltq_laneq_f32): New intrinsic. * config/aarch64/iterators.md (BF_MLA): New int iterator. (bt): New int attribute.
This commit is contained in:
parent
ad84548336
commit
f78335df69
|
@ -1,3 +1,25 @@
|
|||
2020-02-06 Delia Burduv <delia.burduv@arm.com>
|
||||
|
||||
* config/aarch64/aarch64-simd-builtins.def
|
||||
(bfmlaq): New built-in function.
|
||||
(bfmlalb): New built-in function.
|
||||
(bfmlalt): New built-in function.
|
||||
(bfmlalb_lane): New built-in function.
|
||||
(bfmlalt_lane): New built-in function.
|
||||
* config/aarch64/aarch64-simd.md
|
||||
(aarch64_bfmmlaqv4sf): New pattern.
|
||||
(aarch64_bfmlal<bt>v4sf): New pattern.
|
||||
(aarch64_bfmlal<bt>_lane<q>v4sf): New pattern.
|
||||
* config/aarch64/arm_neon.h (vbfmmlaq_f32): New intrinsic.
|
||||
(vbfmlalbq_f32): New intrinsic.
|
||||
(vbfmlaltq_f32): New intrinsic.
|
||||
(vbfmlalbq_lane_f32): New intrinsic.
|
||||
(vbfmlaltq_lane_f32): New intrinsic.
|
||||
(vbfmlalbq_laneq_f32): New intrinsic.
|
||||
(vbfmlaltq_laneq_f32): New intrinsic.
|
||||
* config/aarch64/iterators.md (BF_MLA): New int iterator.
|
||||
(bt): New int attribute.
|
||||
|
||||
2020-02-06 Uroš Bizjak <ubizjak@gmail.com>
|
||||
|
||||
* config/i386/i386.md (*pushtf): Emit "#" instead of
|
||||
|
|
|
@ -692,3 +692,14 @@
|
|||
VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
|
||||
VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
|
||||
VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
|
||||
|
||||
/* Implemented by aarch64_bfmmlaqv4sf */
|
||||
VAR1 (TERNOP, bfmmlaq, 0, v4sf)
|
||||
|
||||
/* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf */
|
||||
VAR1 (TERNOP, bfmlalb, 0, v4sf)
|
||||
VAR1 (TERNOP, bfmlalt, 0, v4sf)
|
||||
VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
|
||||
VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
|
||||
VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf)
|
||||
VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf)
|
||||
|
|
|
@ -7134,3 +7134,42 @@
|
|||
}
|
||||
[(set_attr "type" "neon_dot<VDQSF:q>")]
|
||||
)
|
||||
|
||||
;; bfmmla
|
||||
(define_insn "aarch64_bfmmlaqv4sf"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "=w")
|
||||
(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
|
||||
(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
|
||||
(match_operand:V8BF 3 "register_operand" "w")]
|
||||
UNSPEC_BFMMLA)))]
|
||||
"TARGET_BF16_SIMD"
|
||||
"bfmmla\\t%0.4s, %2.8h, %3.8h"
|
||||
[(set_attr "type" "neon_fp_mla_s_q")]
|
||||
)
|
||||
|
||||
;; bfmlal<bt>
|
||||
(define_insn "aarch64_bfmlal<bt>v4sf"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "=w")
|
||||
(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
|
||||
(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
|
||||
(match_operand:V8BF 3 "register_operand" "w")]
|
||||
BF_MLA)))]
|
||||
"TARGET_BF16_SIMD"
|
||||
"bfmlal<bt>\\t%0.4s, %2.8h, %3.8h"
|
||||
[(set_attr "type" "neon_fp_mla_s_q")]
|
||||
)
|
||||
|
||||
(define_insn "aarch64_bfmlal<bt>_lane<q>v4sf"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "=w")
|
||||
(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
|
||||
(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
|
||||
(match_operand:VBF 3 "register_operand" "w")
|
||||
(match_operand:SI 4 "const_int_operand" "n")]
|
||||
BF_MLA)))]
|
||||
"TARGET_BF16_SIMD"
|
||||
{
|
||||
operands[4] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[4]));
|
||||
return "bfmlal<bt>\\t%0.4s, %2.8h, %3.h[%4]";
|
||||
}
|
||||
[(set_attr "type" "neon_fp_mla_s_scalar_q")]
|
||||
)
|
||||
|
|
|
@ -34660,6 +34660,60 @@ vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
|
|||
return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
|
||||
|
||||
{
|
||||
return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b);
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
|
||||
{
|
||||
return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b);
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
|
||||
{
|
||||
return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b);
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
|
||||
const int __index)
|
||||
{
|
||||
return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index);
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
|
||||
const int __index)
|
||||
{
|
||||
return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index);
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
|
||||
const int __index)
|
||||
{
|
||||
return __builtin_aarch64_bfmlalb_lane_qv4sf (__r, __a, __b, __index);
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
|
||||
const int __index)
|
||||
{
|
||||
return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
|
||||
}
|
||||
|
||||
#pragma GCC pop_options
|
||||
|
||||
/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
|
||||
|
|
|
@ -2620,6 +2620,9 @@
|
|||
|
||||
(define_int_iterator FMMLA [UNSPEC_FMMLA])
|
||||
|
||||
(define_int_iterator BF_MLA [UNSPEC_BFMLALB
|
||||
UNSPEC_BFMLALT])
|
||||
|
||||
;; Iterators for atomic operations.
|
||||
|
||||
(define_int_iterator ATOMIC_LDOP
|
||||
|
@ -2871,6 +2874,8 @@
|
|||
(define_int_attr ab [(UNSPEC_CLASTA "a") (UNSPEC_CLASTB "b")
|
||||
(UNSPEC_LASTA "a") (UNSPEC_LASTB "b")])
|
||||
|
||||
(define_int_attr bt [(UNSPEC_BFMLALB "b") (UNSPEC_BFMLALT "t")])
|
||||
|
||||
(define_int_attr addsub [(UNSPEC_SHADD "add")
|
||||
(UNSPEC_UHADD "add")
|
||||
(UNSPEC_SRHADD "add")
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
/* { dg-do assemble { target { aarch64*-*-* } } } */
|
||||
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
|
||||
/* { dg-add-options arm_v8_2a_bf16_neon } */
|
||||
/* { dg-additional-options "-save-temps" } */
|
||||
/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
/*
|
||||
**test_bfmlalb:
|
||||
** bfmlalb v0.4s, v1.8h, v2.8h
|
||||
** ret
|
||||
*/
|
||||
float32x4_t test_bfmlalb (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
||||
{
|
||||
return vbfmlalbq_f32 (r, a, b);
|
||||
}
|
||||
|
||||
/*
|
||||
**test_bfmlalt:
|
||||
** bfmlalt v0.4s, v1.8h, v2.8h
|
||||
** ret
|
||||
*/
|
||||
float32x4_t test_bfmlalt (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
||||
{
|
||||
return vbfmlaltq_f32 (r, a, b);
|
||||
}
|
||||
|
||||
/*
|
||||
**test_bfmlalb_lane:
|
||||
** bfmlalb v0.4s, v1.8h, v2.h[0]
|
||||
** ret
|
||||
*/
|
||||
float32x4_t test_bfmlalb_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
|
||||
{
|
||||
return vbfmlalbq_lane_f32 (r, a, b, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
**test_bfmlalt_lane:
|
||||
** bfmlalt v0.4s, v1.8h, v2.h[2]
|
||||
** ret
|
||||
*/
|
||||
float32x4_t test_bfmlalt_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
|
||||
{
|
||||
return vbfmlaltq_lane_f32 (r, a, b, 2);
|
||||
}
|
||||
|
||||
/*
|
||||
**test_bfmlalb_laneq:
|
||||
** bfmlalb v0.4s, v1.8h, v2.h[4]
|
||||
** ret
|
||||
*/
|
||||
float32x4_t test_bfmlalb_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
||||
{
|
||||
return vbfmlalbq_laneq_f32 (r, a, b, 4);
|
||||
}
|
||||
|
||||
/*
|
||||
**test_bfmlalt_laneq:
|
||||
** bfmlalt v0.4s, v1.8h, v2.h[7]
|
||||
** ret
|
||||
*/
|
||||
float32x4_t test_bfmlalt_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
||||
{
|
||||
return vbfmlaltq_laneq_f32 (r, a, b, 7);
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
/* { dg-do assemble { target { aarch64*-*-* } } } */
|
||||
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
|
||||
/* { dg-add-options arm_v8_2a_bf16_neon } */
|
||||
/* { dg-additional-options "-save-temps" } */
|
||||
/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/*
|
||||
**test_bfmmla:
|
||||
** bfmmla v0.4s, v1.8h, v2.8h
|
||||
** ret
|
||||
*/
|
||||
float32x4_t test_bfmmla (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
|
||||
{
|
||||
return vbfmmlaq_f32 (r, x, y);
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
/* { dg-do compile { target { aarch64*-*-* } } } */
|
||||
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
|
||||
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
|
||||
/* { dg-add-options arm_v8_2a_bf16_neon } */
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
void
|
||||
f_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
|
||||
{
|
||||
/* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
|
||||
vbfmlaltq_lane_f32 (r, a, b, -1);
|
||||
/* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
|
||||
vbfmlaltq_lane_f32 (r, a, b, 4);
|
||||
return;
|
||||
}
|
||||
|
||||
void
|
||||
f_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
||||
{
|
||||
/* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
|
||||
vbfmlaltq_laneq_f32 (r, a, b, -1);
|
||||
/* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
|
||||
vbfmlaltq_laneq_f32 (r, a, b, 8);
|
||||
return;
|
||||
}
|
||||
|
||||
void
|
||||
f_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
|
||||
{
|
||||
/* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */
|
||||
vbfmlalbq_lane_f32 (r, a, b, -2);
|
||||
/* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */
|
||||
vbfmlalbq_lane_f32 (r, a, b, 5);
|
||||
return;
|
||||
}
|
||||
|
||||
void
|
||||
f_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
|
||||
{
|
||||
/* { dg-error "lane -2 out of range 0 - 7" "" { target *-*-* } 0 } */
|
||||
vbfmlalbq_laneq_f32 (r, a, b, -2);
|
||||
/* { dg-error "lane 9 out of range 0 - 7" "" { target *-*-* } 0 } */
|
||||
vbfmlalbq_laneq_f32 (r, a, b, 9);
|
||||
return;
|
||||
}
|
Loading…
Reference in New Issue