arm: Replace arm_builtin_vectorized_function [PR106253]

This patch extends the fix for PR106253 to AArch32.  As with AArch64,
we were using ACLE intrinsics to vectorise scalar built-ins, even
though the two sometimes have different ECF_* flags.  (That in turn
is because the ACLE intrinsics should follow the instruction semantics
as closely as possible, whereas the scalar built-ins follow language
specs.)

The patch also removes the copysignf built-in, which only existed
for this purpose and wasn't a “real” arm_neon.h built-in.

Doing this also has the side-effect of enabling vectorisation of
rint and roundeven.  Logically that should be a separate patch,
but making it one would have meant adding a new int iterator
for the original set of instructions and then removing it again
when adding the new functions.

I've restricted the bswap tests to little-endian because we end
up with excessive spilling on big-endian.  E.g.:

        sub     sp, sp, #8
        vstr    d1, [sp]
        vldr    d16, [sp]
        vrev16.8        d16, d16
        vstr    d16, [sp]
        vldr    d0, [sp]
        add     sp, sp, #8
        @ sp needed
        bx      lr

Similarly, the copysign tests require little-endian because on
big-endian we unnecessarily load the constant from the constant pool:

        vldr.32 s15, .L3
        vdup.32 d0, d7[1]
        vbsl    d0, d2, d1
        bx      lr
.L3:
        .word   -2147483648

gcc/
	PR target/106253
	* config/arm/arm-builtins.cc (arm_builtin_vectorized_function):
	Delete.
	* config/arm/arm-protos.h (arm_builtin_vectorized_function): Delete.
	* config/arm/arm.cc (TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION):
	Delete.
	* config/arm/arm_neon_builtins.def (copysignf): Delete.
	* config/arm/iterators.md (nvrint_pattern): New attribute.
	* config/arm/neon.md (<NEON_VRINT:nvrint_pattern><VCVTF:mode>2):
	New pattern.
	(l<NEON_VCVT:nvrint_pattern><su_optab><VCVTF:mode><v_cmp_result>2):
	Likewise.
	(neon_copysignf<mode>): Rename to...
	(copysign<mode>3): ...this.

gcc/testsuite/
	PR target/106253
	* gcc.target/arm/vect_unary_1.c: New test.
	* gcc.target/arm/vect_binary_1.c: Likewise.
This commit is contained in:
Richard Sandiford 2022-07-18 12:57:10 +01:00
parent 9c8349ee1a
commit 7313381d2c
8 changed files with 297 additions and 130 deletions

View File

@@ -4026,129 +4026,6 @@ arm_expand_builtin (tree exp,
return NULL_RTX;
}
tree
arm_builtin_vectorized_function (unsigned int fn, tree type_out, tree type_in)
{
machine_mode in_mode, out_mode;
int in_n, out_n;
bool out_unsigned_p = TYPE_UNSIGNED (type_out);
/* Can't provide any vectorized builtins when we can't use NEON. */
if (!TARGET_NEON)
return NULL_TREE;
if (TREE_CODE (type_out) != VECTOR_TYPE
|| TREE_CODE (type_in) != VECTOR_TYPE)
return NULL_TREE;
out_mode = TYPE_MODE (TREE_TYPE (type_out));
out_n = TYPE_VECTOR_SUBPARTS (type_out);
in_mode = TYPE_MODE (TREE_TYPE (type_in));
in_n = TYPE_VECTOR_SUBPARTS (type_in);
/* ARM_CHECK_BUILTIN_MODE and ARM_FIND_VRINT_VARIANT are used to find the
decl of the vectorized builtin for the appropriate vector mode.
NULL_TREE is returned if no such builtin is available. */
#undef ARM_CHECK_BUILTIN_MODE
#define ARM_CHECK_BUILTIN_MODE(C) \
(TARGET_VFP5 \
&& flag_unsafe_math_optimizations \
&& ARM_CHECK_BUILTIN_MODE_1 (C))
#undef ARM_CHECK_BUILTIN_MODE_1
#define ARM_CHECK_BUILTIN_MODE_1(C) \
(out_mode == SFmode && out_n == C \
&& in_mode == SFmode && in_n == C)
#undef ARM_FIND_VRINT_VARIANT
#define ARM_FIND_VRINT_VARIANT(N) \
(ARM_CHECK_BUILTIN_MODE (2) \
? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sf, false) \
: (ARM_CHECK_BUILTIN_MODE (4) \
? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sf, false) \
: NULL_TREE))
switch (fn)
{
CASE_CFN_FLOOR:
return ARM_FIND_VRINT_VARIANT (vrintm);
CASE_CFN_CEIL:
return ARM_FIND_VRINT_VARIANT (vrintp);
CASE_CFN_TRUNC:
return ARM_FIND_VRINT_VARIANT (vrintz);
CASE_CFN_ROUND:
return ARM_FIND_VRINT_VARIANT (vrinta);
#undef ARM_CHECK_BUILTIN_MODE_1
#define ARM_CHECK_BUILTIN_MODE_1(C) \
(out_mode == SImode && out_n == C \
&& in_mode == SFmode && in_n == C)
#define ARM_FIND_VCVT_VARIANT(N) \
(ARM_CHECK_BUILTIN_MODE (2) \
? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sfv2si, false) \
: (ARM_CHECK_BUILTIN_MODE (4) \
? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sfv4si, false) \
: NULL_TREE))
#define ARM_FIND_VCVTU_VARIANT(N) \
(ARM_CHECK_BUILTIN_MODE (2) \
? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv2sfv2si, false) \
: (ARM_CHECK_BUILTIN_MODE (4) \
? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv4sfv4si, false) \
: NULL_TREE))
CASE_CFN_LROUND:
return (out_unsigned_p
? ARM_FIND_VCVTU_VARIANT (vcvta)
: ARM_FIND_VCVT_VARIANT (vcvta));
CASE_CFN_LCEIL:
return (out_unsigned_p
? ARM_FIND_VCVTU_VARIANT (vcvtp)
: ARM_FIND_VCVT_VARIANT (vcvtp));
CASE_CFN_LFLOOR:
return (out_unsigned_p
? ARM_FIND_VCVTU_VARIANT (vcvtm)
: ARM_FIND_VCVT_VARIANT (vcvtm));
#undef ARM_CHECK_BUILTIN_MODE
#define ARM_CHECK_BUILTIN_MODE(C, N) \
(out_mode == N##mode && out_n == C \
&& in_mode == N##mode && in_n == C)
case CFN_BUILT_IN_BSWAP16:
if (ARM_CHECK_BUILTIN_MODE (4, HI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4hi, false);
else if (ARM_CHECK_BUILTIN_MODE (8, HI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv8hi, false);
else
return NULL_TREE;
case CFN_BUILT_IN_BSWAP32:
if (ARM_CHECK_BUILTIN_MODE (2, SI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2si, false);
else if (ARM_CHECK_BUILTIN_MODE (4, SI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4si, false);
else
return NULL_TREE;
case CFN_BUILT_IN_BSWAP64:
if (ARM_CHECK_BUILTIN_MODE (2, DI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2di, false);
else
return NULL_TREE;
CASE_CFN_COPYSIGN:
if (ARM_CHECK_BUILTIN_MODE (2, SF))
return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv2sf, false);
else if (ARM_CHECK_BUILTIN_MODE (4, SF))
return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv4sf, false);
else
return NULL_TREE;
default:
return NULL_TREE;
}
return NULL_TREE;
}
#undef ARM_FIND_VCVT_VARIANT
#undef ARM_FIND_VCVTU_VARIANT
#undef ARM_CHECK_BUILTIN_MODE
#undef ARM_FIND_VRINT_VARIANT
void
arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
{

View File

@@ -103,7 +103,6 @@ extern void neon_pairwise_reduce (rtx, rtx, machine_mode,
rtx (*) (rtx, rtx, rtx));
extern rtx mve_bool_vec_to_const (rtx const_vec);
extern rtx neon_make_constant (rtx, bool generate = true);
extern tree arm_builtin_vectorized_function (unsigned int, tree, tree);
extern void neon_expand_vector_init (rtx, rtx);
extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree);
extern void arm_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT);

View File

@@ -739,10 +739,6 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
arm_builtin_vectorized_function
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT arm_vector_alignment

View File

@@ -264,7 +264,6 @@ VAR1 (UNOP, vcvtv4hf, v4sf)
VAR10 (TERNOP, vbsl,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
VAR2 (TERNOP, vbsl, v8hf, v4hf)
VAR2 (UNOP, copysignf, v2sf, v4sf)
VAR2 (UNOP, vrintn, v2sf, v4sf)
VAR2 (UNOP, vrinta, v2sf, v4sf)
VAR2 (UNOP, vrintp, v2sf, v4sf)

View File

@@ -1150,6 +1150,13 @@
(UNSPEC_VRINTA "unconditional") (UNSPEC_VRINTM "unconditional")
(UNSPEC_VRINTR "nocond") (UNSPEC_VRINTX "nocond")])
(define_int_attr nvrint_pattern [(UNSPEC_NVRINTZ "btrunc")
(UNSPEC_NVRINTP "ceil")
(UNSPEC_NVRINTA "round")
(UNSPEC_NVRINTM "floor")
(UNSPEC_NVRINTX "rint")
(UNSPEC_NVRINTN "roundeven")])
(define_int_attr nvrint_variant [(UNSPEC_NVRINTZ "z") (UNSPEC_NVRINTP "p")
(UNSPEC_NVRINTA "a") (UNSPEC_NVRINTM "m")
(UNSPEC_NVRINTX "x") (UNSPEC_NVRINTN "n")])

View File

@@ -635,6 +635,13 @@
[(set_attr "type" "neon_fp_mla_s<q>")]
)
(define_expand "<NEON_VRINT:nvrint_pattern><VCVTF:mode>2"
[(set (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand")]
NEON_VRINT))]
"TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations"
)
(define_insn "neon_vrint<NEON_VRINT:nvrint_variant><VCVTF:mode>"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1
@@ -645,6 +652,14 @@
[(set_attr "type" "neon_fp_round_<V_elem_ch><q>")]
)
(define_expand "l<NEON_VCVT:nvrint_pattern><su_optab><VCVTF:mode><v_cmp_result>2"
[(set (match_operand:<V_cmp_result> 0 "register_operand")
(FIXUORS:<V_cmp_result>
(unspec:VCVTF [(match_operand:VCVTF 1 "register_operand")]
NEON_VCVT)))]
"TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations"
)
(define_insn "neon_vcvt<NEON_VCVT:nvrint_variant><su_optab><VCVTF:mode><v_cmp_result>"
[(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
(FIXUORS:<V_cmp_result> (unspec:VCVTF
@@ -3059,7 +3074,7 @@
"TARGET_I8MM"
)
(define_expand "neon_copysignf<mode>"
(define_expand "copysign<mode>3"
[(match_operand:VCVTF 0 "register_operand")
(match_operand:VCVTF 1 "register_operand")
(match_operand:VCVTF 2 "register_operand")]

View File

@@ -0,0 +1,50 @@
/* { dg-do compile { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_hard_ok } */
/* { dg-require-effective-target arm_v8_neon_ok } */
/* { dg-add-options arm_v8_neon } */
/* { dg-additional-options "-O3 -mfloat-abi=hard" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <stdint.h>
#define TEST2(OUT, NAME, IN) \
OUT __attribute__((vector_size(sizeof(OUT) * 2))) \
test2_##OUT##_##NAME##_##IN (float dummy, \
IN __attribute__((vector_size(sizeof(IN) * 2))) y, \
IN __attribute__((vector_size(sizeof(IN) * 2))) z) \
{ \
OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \
x[0] = __builtin_##NAME (y[0], z[0]); \
x[1] = __builtin_##NAME (y[1], z[1]); \
return x; \
}
#define TEST4(OUT, NAME, IN) \
OUT __attribute__((vector_size(sizeof(OUT) * 4))) \
test4_##OUT##_##NAME##_##IN (float dummy, \
IN __attribute__((vector_size(sizeof(OUT) * 4))) y, \
IN __attribute__((vector_size(sizeof(OUT) * 4))) z) \
{ \
OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \
x[0] = __builtin_##NAME (y[0], z[0]); \
x[1] = __builtin_##NAME (y[1], z[1]); \
x[2] = __builtin_##NAME (y[2], z[2]); \
x[3] = __builtin_##NAME (y[3], z[3]); \
return x; \
}
/*
** test2_float_copysignf_float: { target arm_little_endian }
** vmov.i32 d0, #(0x80000000|2147483648)(\s+.*)
** vbsl d0, d2, d1
** bx lr
*/
TEST2 (float, copysignf, float)
/*
** test4_float_copysignf_float: { target arm_little_endian }
** vmov.i32 q0, #(0x80000000|2147483648)(\s+.*)
** vbsl q0, q2, q1
** bx lr
*/
TEST4 (float, copysignf, float)

View File

@@ -0,0 +1,224 @@
/* { dg-do compile { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_hard_ok } */
/* { dg-require-effective-target arm_v8_neon_ok } */
/* { dg-add-options arm_v8_neon } */
/* { dg-additional-options "-Ofast -mfloat-abi=hard" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <stdint.h>
#define TEST2(OUT, NAME, IN) \
OUT __attribute__((vector_size(sizeof(OUT) * 2))) \
test2_##OUT##_##NAME##_##IN (float dummy, \
IN __attribute__((vector_size(sizeof(IN) * 2))) y) \
{ \
OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \
x[0] = __builtin_##NAME (y[0]); \
x[1] = __builtin_##NAME (y[1]); \
return x; \
}
#define TEST4(OUT, NAME, IN) \
OUT __attribute__((vector_size(sizeof(OUT) * 4))) \
test4_##OUT##_##NAME##_##IN (float dummy, \
IN __attribute__((vector_size(sizeof(OUT) * 4))) y) \
{ \
OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \
x[0] = __builtin_##NAME (y[0]); \
x[1] = __builtin_##NAME (y[1]); \
x[2] = __builtin_##NAME (y[2]); \
x[3] = __builtin_##NAME (y[3]); \
return x; \
}
#define TEST8(OUT, NAME, IN) \
OUT __attribute__((vector_size(sizeof(OUT) * 8))) \
test8_##OUT##_##NAME##_##IN (float dummy, \
IN __attribute__((vector_size(sizeof(OUT) * 8))) y) \
{ \
OUT __attribute__((vector_size(sizeof(OUT) * 8))) x; \
x[0] = __builtin_##NAME (y[0]); \
x[1] = __builtin_##NAME (y[1]); \
x[2] = __builtin_##NAME (y[2]); \
x[3] = __builtin_##NAME (y[3]); \
x[4] = __builtin_##NAME (y[4]); \
x[5] = __builtin_##NAME (y[5]); \
x[6] = __builtin_##NAME (y[6]); \
x[7] = __builtin_##NAME (y[7]); \
return x; \
}
/*
** test2_float_truncf_float:
** vrintz.f32 d0, d1
** bx lr
*/
TEST2 (float, truncf, float)
/*
** test4_float_truncf_float:
** vrintz.f32 q0, q1
** bx lr
*/
TEST4 (float, truncf, float)
/*
** test2_float_roundf_float:
** vrinta.f32 d0, d1
** bx lr
*/
TEST2 (float, roundf, float)
/*
** test4_float_roundf_float:
** vrinta.f32 q0, q1
** bx lr
*/
TEST4 (float, roundf, float)
/*
** test2_float_floorf_float:
** vrintm.f32 d0, d1
** bx lr
*/
TEST2 (float, floorf, float)
/*
** test4_float_floorf_float:
** vrintm.f32 q0, q1
** bx lr
*/
TEST4 (float, floorf, float)
/*
** test2_float_ceilf_float:
** vrintp.f32 d0, d1
** bx lr
*/
TEST2 (float, ceilf, float)
/*
** test4_float_ceilf_float:
** vrintp.f32 q0, q1
** bx lr
*/
TEST4 (float, ceilf, float)
/*
** test2_float_rintf_float:
** vrintx.f32 d0, d1
** bx lr
*/
TEST2 (float, rintf, float)
/*
** test4_float_rintf_float:
** vrintx.f32 q0, q1
** bx lr
*/
TEST4 (float, rintf, float)
/*
** test2_float_roundevenf_float:
** vrintn.f32 d0, d1
** bx lr
*/
TEST2 (float, roundevenf, float)
/*
** test4_float_roundevenf_float:
** vrintn.f32 q0, q1
** bx lr
*/
TEST4 (float, roundevenf, float)
/*
** test2_int_roundf_float:
** vcvta.s32.f32 d0, d1
** bx lr
*/
TEST2 (int, roundf, float)
/*
** test4_int_roundf_float:
** vcvta.s32.f32 q0, q1
** bx lr
*/
TEST4 (int, roundf, float)
/*
** test2_int_floorf_float:
** vcvtm.s32.f32 d0, d1
** bx lr
*/
TEST2 (int, floorf, float)
/*
** test4_int_floorf_float:
** vcvtm.s32.f32 q0, q1
** bx lr
*/
TEST4 (int, floorf, float)
/*
** test2_int_ceilf_float:
** vcvtp.s32.f32 d0, d1
** bx lr
*/
TEST2 (int, ceilf, float)
/*
** test4_int_ceilf_float:
** vcvtp.s32.f32 q0, q1
** bx lr
*/
TEST4 (int, ceilf, float)
/*
** test2_int_clz_int:
** vclz.i32 d0, d1
** bx lr
*/
TEST2 (int, clz, int)
/*
** test4_int_clz_int:
** vclz.i32 q0, q1
** bx lr
*/
TEST4 (int, clz, int)
/*
** test4_int16_t_bswap16_int16_t: { target arm_little_endian }
** vrev16.8 d0, d1
** bx lr
*/
TEST4 (int16_t, bswap16, int16_t)
/*
** test8_int16_t_bswap16_int16_t: { target arm_little_endian }
** vrev16.8 q0, q1
** bx lr
*/
TEST8 (int16_t, bswap16, int16_t)
/*
** test2_int_bswap32_int: { target arm_little_endian }
** vrev32.8 d0, d1
** bx lr
*/
TEST2 (int, bswap32, int)
/*
** test4_int_bswap32_int: { target arm_little_endian }
** vrev32.8 q0, q1
** bx lr
*/
TEST4 (int, bswap32, int)
/*
** test2_int64_t_bswap64_int64_t: { target arm_little_endian }
** vrev64.8 q0, q1
** bx lr
*/
TEST2 (int64_t, bswap64, int64_t)