arm: ACLE BFloat16 convert intrinsics
This patch is part of a series adding support for Armv8.6-A features. It implements intrinsics to convert between bfloat16 and float32 formats. gcc/ChangeLog: * config/arm/arm_bf16.h (vcvtah_f32_bf16, vcvth_bf16_f32): New. * config/arm/arm_neon.h (vcvt_f32_bf16, vcvtq_low_f32_bf16): New. (vcvtq_high_f32_bf16, vcvt_bf16_f32): New. (vcvtq_low_bf16_f32, vcvtq_high_bf16_f32): New. * config/arm/arm_neon_builtins.def (vbfcvt, vbfcvt_high): New entries. (vbfcvtv4sf, vbfcvtv4sf_high): Likewise. * config/arm/iterators.md (VBFCVT, VBFCVTM): New mode iterators. (V_bf_low, V_bf_cvt_m): New mode attributes. * config/arm/neon.md (neon_vbfcvtv4sf<VBFCVT:mode>): New. (neon_vbfcvtv4sf_highv8bf, neon_vbfcvtsf): New. (neon_vbfcvt<VBFCVT:mode>, neon_vbfcvt_highv8bf): New. (neon_vbfcvtbf_cvtmode<mode>, neon_vbfcvtbf): New * config/arm/unspecs.md (UNSPEC_BFCVT, UNSPEC_BFCVT_HIG): New. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/bf16_cvt_1.c: New test.
This commit is contained in:
parent
9b4f00dd3f
commit
8e6d0dba16
|
@ -1,3 +1,19 @@
|
||||||
|
2020-03-03 Dennis Zhang <dennis.zhang@arm.com>
|
||||||
|
|
||||||
|
* config/arm/arm_bf16.h (vcvtah_f32_bf16, vcvth_bf16_f32): New.
|
||||||
|
* config/arm/arm_neon.h (vcvt_f32_bf16, vcvtq_low_f32_bf16): New.
|
||||||
|
(vcvtq_high_f32_bf16, vcvt_bf16_f32): New.
|
||||||
|
(vcvtq_low_bf16_f32, vcvtq_high_bf16_f32): New.
|
||||||
|
* config/arm/arm_neon_builtins.def (vbfcvt, vbfcvt_high): New entries.
|
||||||
|
(vbfcvtv4sf, vbfcvtv4sf_high): Likewise.
|
||||||
|
* config/arm/iterators.md (VBFCVT, VBFCVTM): New mode iterators.
|
||||||
|
(V_bf_low, V_bf_cvt_m): New mode attributes.
|
||||||
|
* config/arm/neon.md (neon_vbfcvtv4sf<VBFCVT:mode>): New.
|
||||||
|
(neon_vbfcvtv4sf_highv8bf, neon_vbfcvtsf): New.
|
||||||
|
(neon_vbfcvt<VBFCVT:mode>, neon_vbfcvt_highv8bf): New.
|
||||||
|
(neon_vbfcvtbf_cvtmode<mode>, neon_vbfcvtbf): New
|
||||||
|
* config/arm/unspecs.md (UNSPEC_BFCVT, UNSPEC_BFCVT_HIG): New.
|
||||||
|
|
||||||
2020-03-03 Jakub Jelinek <jakub@redhat.com>
|
2020-03-03 Jakub Jelinek <jakub@redhat.com>
|
||||||
|
|
||||||
PR tree-optimization/93582
|
PR tree-optimization/93582
|
||||||
|
|
|
@ -34,6 +34,20 @@ extern "C" {
|
||||||
typedef __bf16 bfloat16_t;
|
typedef __bf16 bfloat16_t;
|
||||||
typedef float float32_t;
|
typedef float float32_t;
|
||||||
|
|
||||||
|
__extension__ extern __inline float32_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvtah_f32_bf16 (bfloat16_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvtbf (__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
__extension__ extern __inline bfloat16_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvth_bf16_f32 (float32_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvtsf (__a);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -19379,6 +19379,55 @@ vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
|
||||||
|
|
||||||
#pragma GCC pop_options
|
#pragma GCC pop_options
|
||||||
|
|
||||||
|
#pragma GCC push_options
|
||||||
|
#pragma GCC target ("arch=armv8.2-a+bf16")
|
||||||
|
|
||||||
|
__extension__ extern __inline float32x4_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvt_f32_bf16 (bfloat16x4_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvtv4bf (__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
__extension__ extern __inline float32x4_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvtq_low_f32_bf16 (bfloat16x8_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvtv8bf (__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
__extension__ extern __inline float32x4_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvtq_high_f32_bf16 (bfloat16x8_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvt_highv8bf (__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
__extension__ extern __inline bfloat16x4_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvt_bf16_f32 (float32x4_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvtv4sfv4bf (__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
__extension__ extern __inline bfloat16x8_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvtq_low_bf16_f32 (float32x4_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvtv4sfv8bf (__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The 'inactive' operand is not converted but it provides the
|
||||||
|
low 64 bits to assemble the final 128-bit result. */
|
||||||
|
__extension__ extern __inline bfloat16x8_t
|
||||||
|
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||||
|
vcvtq_high_bf16_f32 (bfloat16x8_t inactive, float32x4_t __a)
|
||||||
|
{
|
||||||
|
return __builtin_neon_vbfcvtv4sf_highv8bf (inactive, __a);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma GCC pop_options
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -385,3 +385,9 @@ VAR1 (USTERNOP, usmmla, v16qi)
|
||||||
VAR2 (TERNOP, vbfdot, v2sf, v4sf)
|
VAR2 (TERNOP, vbfdot, v2sf, v4sf)
|
||||||
VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
|
VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
|
||||||
VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
|
VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
|
||||||
|
|
||||||
|
VAR2 (UNOP, vbfcvt, sf, bf)
|
||||||
|
VAR2 (UNOP, vbfcvt, v4bf, v8bf)
|
||||||
|
VAR1 (UNOP, vbfcvt_high, v8bf)
|
||||||
|
VAR2 (UNOP, vbfcvtv4sf, v4bf, v8bf)
|
||||||
|
VAR1 (BINOP, vbfcvtv4sf_high, v8bf)
|
||||||
|
|
|
@ -229,6 +229,10 @@
|
||||||
;; Modes for polynomial or float values.
|
;; Modes for polynomial or float values.
|
||||||
(define_mode_iterator VPF [V8QI V16QI V2SF V4SF])
|
(define_mode_iterator VPF [V8QI V16QI V2SF V4SF])
|
||||||
|
|
||||||
|
;; Modes for BF16 convert instructions.
|
||||||
|
(define_mode_iterator VBFCVT [V4BF V8BF])
|
||||||
|
(define_mode_iterator VBFCVTM [V2SI SF])
|
||||||
|
|
||||||
;;----------------------------------------------------------------------------
|
;;----------------------------------------------------------------------------
|
||||||
;; Code iterators
|
;; Code iterators
|
||||||
;;----------------------------------------------------------------------------
|
;;----------------------------------------------------------------------------
|
||||||
|
@ -747,6 +751,12 @@
|
||||||
(V2SF "") (V4SF "")
|
(V2SF "") (V4SF "")
|
||||||
(DI "_neon") (V2DI "")])
|
(DI "_neon") (V2DI "")])
|
||||||
|
|
||||||
|
;; To select the low 64 bits of a vector.
|
||||||
|
(define_mode_attr V_bf_low [(V4BF "P") (V8BF "e")])
|
||||||
|
|
||||||
|
;; To generate intermediate modes for BF16 scalar convert.
|
||||||
|
(define_mode_attr V_bf_cvt_m [(V2SI "BF") (SF "V2SI")])
|
||||||
|
|
||||||
|
|
||||||
;; Scalars to be presented to scalar multiplication instructions
|
;; Scalars to be presented to scalar multiplication instructions
|
||||||
;; must satisfy the following constraints.
|
;; must satisfy the following constraints.
|
||||||
|
|
|
@ -6660,3 +6660,80 @@ if (BYTES_BIG_ENDIAN)
|
||||||
}
|
}
|
||||||
[(set_attr "type" "neon_dot<q>")]
|
[(set_attr "type" "neon_dot<q>")]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
(define_insn "neon_vbfcvtv4sf<VBFCVT:mode>"
|
||||||
|
[(set (match_operand:VBFCVT 0 "register_operand" "=w")
|
||||||
|
(unspec:VBFCVT [(match_operand:V4SF 1 "register_operand" "w")]
|
||||||
|
UNSPEC_BFCVT))]
|
||||||
|
"TARGET_BF16_SIMD"
|
||||||
|
"vcvt.bf16.f32\\t%<V_bf_low>0, %q1"
|
||||||
|
[(set_attr "type" "neon_fp_cvt_narrow_s_q")]
|
||||||
|
)
|
||||||
|
|
||||||
|
(define_insn "neon_vbfcvtv4sf_highv8bf"
|
||||||
|
[(set (match_operand:V8BF 0 "register_operand" "=w")
|
||||||
|
(unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0")
|
||||||
|
(match_operand:V4SF 2 "register_operand" "w")]
|
||||||
|
UNSPEC_BFCVT_HIGH))]
|
||||||
|
"TARGET_BF16_SIMD"
|
||||||
|
"vcvt.bf16.f32\\t%f0, %q2"
|
||||||
|
[(set_attr "type" "neon_fp_cvt_narrow_s_q")]
|
||||||
|
)
|
||||||
|
|
||||||
|
(define_insn "neon_vbfcvtsf"
|
||||||
|
[(set (match_operand:BF 0 "register_operand" "=t")
|
||||||
|
(unspec:BF [(match_operand:SF 1 "register_operand" "t")]
|
||||||
|
UNSPEC_BFCVT))]
|
||||||
|
"TARGET_BF16_FP"
|
||||||
|
"vcvtb.bf16.f32\\t%0, %1"
|
||||||
|
[(set_attr "type" "f_cvt")]
|
||||||
|
)
|
||||||
|
|
||||||
|
(define_insn "neon_vbfcvt<VBFCVT:mode>"
|
||||||
|
[(set (match_operand:V4SF 0 "register_operand" "=w")
|
||||||
|
(unspec:V4SF [(match_operand:VBFCVT 1 "register_operand" "w")]
|
||||||
|
UNSPEC_BFCVT))]
|
||||||
|
"TARGET_BF16_SIMD"
|
||||||
|
"vshll.u32\\t%q0, %<V_bf_low>1, #16"
|
||||||
|
[(set_attr "type" "neon_shift_imm_q")]
|
||||||
|
)
|
||||||
|
|
||||||
|
(define_insn "neon_vbfcvt_highv8bf"
|
||||||
|
[(set (match_operand:V4SF 0 "register_operand" "=w")
|
||||||
|
(unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")]
|
||||||
|
UNSPEC_BFCVT_HIGH))]
|
||||||
|
"TARGET_BF16_SIMD"
|
||||||
|
"vshll.u32\\t%q0, %f1, #16"
|
||||||
|
[(set_attr "type" "neon_shift_imm_q")]
|
||||||
|
)
|
||||||
|
|
||||||
|
;; Convert a BF scalar operand to SF via VSHL.
|
||||||
|
;; VSHL doesn't accept 32-bit registers where the BF and SF scalar operands
|
||||||
|
;; would be allocated, therefore the operands must be converted to intermediate
|
||||||
|
;; vectors (i.e. V2SI) in order to apply 64-bit registers.
|
||||||
|
(define_expand "neon_vbfcvtbf"
|
||||||
|
[(match_operand:SF 0 "register_operand")
|
||||||
|
(unspec:SF [(match_operand:BF 1 "register_operand")] UNSPEC_BFCVT)]
|
||||||
|
"TARGET_BF16_FP"
|
||||||
|
{
|
||||||
|
rtx op0 = gen_reg_rtx (V2SImode);
|
||||||
|
rtx op1 = gen_reg_rtx (V2SImode);
|
||||||
|
emit_insn (gen_neon_vbfcvtbf_cvtmodev2si (op1, operands[1]));
|
||||||
|
emit_insn (gen_neon_vshl_nv2si (op0, op1, gen_int_mode(16, SImode)));
|
||||||
|
emit_insn (gen_neon_vbfcvtbf_cvtmodesf (operands[0], op0));
|
||||||
|
DONE;
|
||||||
|
})
|
||||||
|
|
||||||
|
;; Convert BF mode to V2SI and V2SI to SF.
|
||||||
|
;; Implement this by allocating a 32-bit operand in the low half of a 64-bit
|
||||||
|
;; register indexed by a 32-bit sub-register number.
|
||||||
|
;; This will generate reloads but compiler can optimize out the moves.
|
||||||
|
;; Use 'x' constraint to guarantee the 32-bit sub-registers in an indexable
|
||||||
|
;; range so that to avoid extra moves.
|
||||||
|
(define_insn "neon_vbfcvtbf_cvtmode<mode>"
|
||||||
|
[(set (match_operand:VBFCVTM 0 "register_operand" "=x")
|
||||||
|
(unspec:VBFCVTM [(match_operand:<V_bf_cvt_m> 1 "register_operand" "0")]
|
||||||
|
UNSPEC_BFCVT))]
|
||||||
|
"TARGET_BF16_FP"
|
||||||
|
""
|
||||||
|
)
|
||||||
|
|
|
@ -506,4 +506,6 @@
|
||||||
UNSPEC_MATMUL_S
|
UNSPEC_MATMUL_S
|
||||||
UNSPEC_MATMUL_U
|
UNSPEC_MATMUL_U
|
||||||
UNSPEC_MATMUL_US
|
UNSPEC_MATMUL_US
|
||||||
|
UNSPEC_BFCVT
|
||||||
|
UNSPEC_BFCVT_HIGH
|
||||||
])
|
])
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
2020-03-03 Dennis Zhang <dennis.zhang@arm.com>
|
||||||
|
|
||||||
|
* gcc.target/arm/simd/bf16_cvt_1.c: New test.
|
||||||
|
|
||||||
2020-03-03 Jakub Jelinek <jakub@redhat.com>
|
2020-03-03 Jakub Jelinek <jakub@redhat.com>
|
||||||
|
|
||||||
PR tree-optimization/93582
|
PR tree-optimization/93582
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
/* { dg-do assemble } */
|
||||||
|
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
|
||||||
|
/* { dg-options "-save-temps -O2" } */
|
||||||
|
/* { dg-add-options arm_v8_2a_bf16_neon } */
|
||||||
|
|
||||||
|
#include "arm_neon.h"
|
||||||
|
|
||||||
|
float32_t test_vcvtah_f32_bf16 (bfloat16_t a)
|
||||||
|
{
|
||||||
|
return vcvtah_f32_bf16 (a);
|
||||||
|
}
|
||||||
|
|
||||||
|
bfloat16_t test_vcvth_bf16_f32 (float32_t a)
|
||||||
|
{
|
||||||
|
return vcvth_bf16_f32 (a);
|
||||||
|
}
|
||||||
|
|
||||||
|
float32x4_t test_vcvt_f32_bf16 (bfloat16x4_t a)
|
||||||
|
{
|
||||||
|
return vcvt_f32_bf16 (a);
|
||||||
|
}
|
||||||
|
|
||||||
|
float32x4_t test_vcvtq_low_f32_bf16 (bfloat16x8_t a)
|
||||||
|
{
|
||||||
|
return vcvtq_low_f32_bf16 (a);
|
||||||
|
}
|
||||||
|
|
||||||
|
float32x4_t test_vcvtq_high_f32_bf16 (bfloat16x8_t a)
|
||||||
|
{
|
||||||
|
return vcvtq_high_f32_bf16 (a);
|
||||||
|
}
|
||||||
|
|
||||||
|
bfloat16x4_t test_vcvt_bf16_f32 (float32x4_t a)
|
||||||
|
{
|
||||||
|
return vcvt_bf16_f32 (a);
|
||||||
|
}
|
||||||
|
|
||||||
|
bfloat16x8_t test_vcvtq_low_bf16_f32 (float32x4_t a)
|
||||||
|
{
|
||||||
|
return vcvtq_low_bf16_f32 (a);
|
||||||
|
}
|
||||||
|
|
||||||
|
bfloat16x8_t test_vcvtq_high_bf16_f32 (bfloat16x8_t inactive, float32x4_t a)
|
||||||
|
{
|
||||||
|
return vcvtq_high_bf16_f32 (inactive, a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* { dg-final { scan-assembler-times {vcvtb.bf16.f32\ts[0-9]+, s[0-9]+\n} 1 } } */
|
||||||
|
/* { dg-final { scan-assembler-times {vcvt.bf16.f32\td[0-9]+, q[0-9]+\n} 3 } } */
|
||||||
|
/* { dg-final { scan-assembler-times {vshl.i32\td[0-9]+, d[0-9]+, #16} 1 } } */
|
||||||
|
/* { dg-final { scan-assembler-times {vshll.u32\tq[0-9]+, d[0-9]+, #16} 3 } } */
|
Loading…
Reference in New Issue