aarch64: Reimplement vaddlv* intrinsics using builtins
This patch reimplements the vaddlv* intrinsics using builtins. The vaddlv_s32 and vaddlv_u32 intrinsics actually perform a pairwise SADDLP/UADDLP rather than a SADDLV/UADDLV, but because they operate on only two elements the semantics are the same.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def (saddlv, uaddlv):
	Define builtins.
	* config/aarch64/aarch64-simd.md (aarch64_<su>addlv<mode>): Define.
	* config/aarch64/arm_neon.h (vaddlv_s8): Reimplement using builtin.
	(vaddlv_s16): Likewise.
	(vaddlv_u8): Likewise.
	(vaddlv_u16): Likewise.
	(vaddlvq_s8): Likewise.
	(vaddlvq_s16): Likewise.
	(vaddlvq_s32): Likewise.
	(vaddlvq_u8): Likewise.
	(vaddlvq_u16): Likewise.
	(vaddlvq_u32): Likewise.
	(vaddlv_s32): Likewise.
	(vaddlv_u32): Likewise.
	* config/aarch64/iterators.md (VDQV_L): New mode iterator.
	(unspec): Add UNSPEC_SADDLV, UNSPEC_UADDLV.
	(Vwstype): New mode attribute.
	(Vwsuf): Likewise.
	(VWIDE_S): Likewise.
	(USADDLV): New int iterator.
	(su): Handle UNSPEC_SADDLV, UNSPEC_UADDLV.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vaddlv_1.c: New test.
Commit cb995de62a (parent e053f96a9f).
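
A quick illustration of the equivalence noted above (a hedged sketch, not part of the patch; the file and variable names are invented for the example): for a two-element vector, the across-lanes widening reduction is exactly one pairwise widening add, which is why SADDLP/UADDLP can implement vaddlv_s32/vaddlv_u32.

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Two lanes whose widened sum does not fit in 32 bits.  */
  int32x2_t v = vdup_n_s32 (0x40000000);

  /* Across-lanes widening add: the intrinsic this patch reimplements.  */
  int64_t across = vaddlv_s32 (v);

  /* Pairwise widening add of the same two lanes: what SADDLP/UADDLP compute.  */
  int64_t pairwise = vget_lane_s64 (vpaddl_s32 (v), 0);

  assert (across == pairwise);  /* Both are 0x80000000, i.e. 2^31.  */
  return 0;
}

Built at -O2 for AArch64, the vaddlv_s32 call is expected to become a single saddlp instruction, which is what the test added at the end of this commit checks.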
gcc/config/aarch64/aarch64-simd-builtins.def
@@ -149,6 +149,10 @@
   BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE)
   BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE)
 
+  /* Implemented by aarch64_<su>addlv<mode>.  */
+  BUILTIN_VDQV_L (UNOP, saddlv, 0, NONE)
+  BUILTIN_VDQV_L (UNOPU, uaddlv, 0, NONE)
+
   /* Implemented by aarch64_<su>abd<mode>.  */
   BUILTIN_VDQ_BHSI (BINOP, sabd, 0, NONE)
   BUILTIN_VDQ_BHSI (BINOPU, uabd, 0, NONE)
gcc/config/aarch64/aarch64-simd.md
@@ -2695,6 +2695,15 @@
   [(set_attr "type" "neon_reduc_add<q>")]
 )
 
+(define_insn "aarch64_<su>addlv<mode>"
+ [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
+       (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
+		    USADDLV))]
+ "TARGET_SIMD"
+ "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>"
+  [(set_attr "type" "neon_reduc_add<q>")]
+)
+
 ;; ADDV with result zero-extended to SI/DImode (for popcount).
 (define_insn "aarch64_zero_extend<GPI:mode>_reduc_plus_<VDQV_E:mode>"
   [(set (match_operand:GPI 0 "register_operand" "=w")
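
To make the template above concrete, here is a hedged sketch of how the single pattern selects its mnemonic and operand syntax per mode. The function names are invented for illustration; the attribute values come from the iterators.md hunk further down, and the pre-existing <vp> attribute is assumed to map V2SI to "p" and the other modes to "v", which matches the instructions checked by the new test.

#include <arm_neon.h>

/* V8QI:  <Vwstype> "h", <Vwsuf> "",    <vp> "v"  ->  saddlv h0, v0.8b     */
int16_t sum_s8 (int8x8_t a) { return vaddlv_s8 (a); }

/* V4SI:  <Vwstype> "d", <Vwsuf> "",    <vp> "v"  ->  saddlv d0, v0.4s     */
int64_t sumq_s32 (int32x4_t a) { return vaddlvq_s32 (a); }

/* V2SI:  <Vwstype> "",  <Vwsuf> ".1d", <vp> "p"  ->  saddlp v0.1d, v0.2s  */
int64_t sum_s32 (int32x2_t a) { return vaddlv_s32 (a); }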
gcc/config/aarch64/arm_neon.h
@@ -7077,120 +7077,70 @@ __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s8 (int8x8_t __a)
 {
-  int16_t __result;
-  __asm__ ("saddlv %h0,%1.8b"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv8qi (__a);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s16 (int16x4_t __a)
 {
-  int32_t __result;
-  __asm__ ("saddlv %s0,%1.4h"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv4hi (__a);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u8 (uint8x8_t __a)
 {
-  uint16_t __result;
-  __asm__ ("uaddlv %h0,%1.8b"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv8qi_uu (__a);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u16 (uint16x4_t __a)
 {
-  uint32_t __result;
-  __asm__ ("uaddlv %s0,%1.4h"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv4hi_uu (__a);
 }
 
 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s8 (int8x16_t __a)
 {
-  int16_t __result;
-  __asm__ ("saddlv %h0,%1.16b"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv16qi (__a);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s16 (int16x8_t __a)
 {
-  int32_t __result;
-  __asm__ ("saddlv %s0,%1.8h"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv8hi (__a);
 }
 
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s32 (int32x4_t __a)
 {
-  int64_t __result;
-  __asm__ ("saddlv %d0,%1.4s"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv4si (__a);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u8 (uint8x16_t __a)
 {
-  uint16_t __result;
-  __asm__ ("uaddlv %h0,%1.16b"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv16qi_uu (__a);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u16 (uint16x8_t __a)
 {
-  uint32_t __result;
-  __asm__ ("uaddlv %s0,%1.8h"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv8hi_uu (__a);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u32 (uint32x4_t __a)
 {
-  uint64_t __result;
-  __asm__ ("uaddlv %d0,%1.4s"
-	   : "=w"(__result)
-	   : "w"(__a)
-	   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv4si_uu (__a);
 }
 
 __extension__ extern __inline float32x2_t
@@ -10281,18 +10231,14 @@ __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s32 (int32x2_t __a)
 {
-  int64_t __result;
-  __asm__ ("saddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : );
-  return __result;
+  return __builtin_aarch64_saddlvv2si (__a);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u32 (uint32x2_t __a)
 {
-  uint64_t __result;
-  __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : );
-  return __result;
+  return __builtin_aarch64_uaddlvv2si_uu (__a);
 }
 
 __extension__ extern __inline int16x4_t
gcc/config/aarch64/iterators.md
@@ -215,6 +215,9 @@
 ;; Advanced SIMD modes for Integer reduction across lanes (zero/sign extended).
 (define_mode_iterator VDQV_E [V8QI V16QI V4HI V8HI])
 
+;; Advanced SIMD modes for Integer widening reduction across lanes.
+(define_mode_iterator VDQV_L [V8QI V16QI V4HI V8HI V4SI V2SI])
+
 ;; All double integer narrow-able modes.
 (define_mode_iterator VDN [V4HI V2SI DI])
@@ -492,6 +495,8 @@
     UNSPEC_FMINV	; Used in aarch64-simd.md.
     UNSPEC_FADDV	; Used in aarch64-simd.md.
     UNSPEC_ADDV		; Used in aarch64-simd.md.
+    UNSPEC_SADDLV	; Used in aarch64-simd.md.
+    UNSPEC_UADDLV	; Used in aarch64-simd.md.
     UNSPEC_SMAXV	; Used in aarch64-simd.md.
     UNSPEC_SMINV	; Used in aarch64-simd.md.
     UNSPEC_UMAXV	; Used in aarch64-simd.md.
@@ -1303,6 +1308,20 @@
 			  (V8HI "4s") (V4SI "2d")
 			  (V8HF "4s") (V4SF "2d")])
 
+;; Widened scalar register suffixes.
+(define_mode_attr Vwstype [(V8QI "h") (V4HI "s")
+			   (V2SI "") (V16QI "h")
+			   (V8HI "s") (V4SI "d")])
+;; Add a .1d for V2SI.
+(define_mode_attr Vwsuf [(V8QI "") (V4HI "")
+			 (V2SI ".1d") (V16QI "")
+			 (V8HI "") (V4SI "")])
+
+;; Scalar mode of widened vector reduction.
+(define_mode_attr VWIDE_S [(V8QI "HI") (V4HI "SI")
+			   (V2SI "DI") (V16QI "HI")
+			   (V8HI "SI") (V4SI "DI")])
+
 ;; Widened mode with half the element register suffixes for VD_BHSI/VQW/VQ_HSF.
 (define_mode_attr Vwhalf [(V8QI "4h") (V4HI "2s")
 			  (V2SI "1d") (V16QI "8h")
@@ -2184,6 +2203,8 @@
 
 (define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV])
 
+(define_int_iterator USADDLV [UNSPEC_SADDLV UNSPEC_UADDLV])
+
 (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF])
 
 (define_int_iterator HADDSUB [UNSPEC_SHADD UNSPEC_UHADD
@@ -2934,6 +2955,8 @@
 ;; "s" for signed operations and "u" for unsigned ones.
 (define_int_attr su [(UNSPEC_SADDV "s")
 		     (UNSPEC_UADDV "u")
+		     (UNSPEC_SADDLV "s")
+		     (UNSPEC_UADDLV "u")
 		     (UNSPEC_UNPACKSHI "s")
 		     (UNSPEC_UNPACKUHI "u")
 		     (UNSPEC_UNPACKSLO "s")
gcc/testsuite/gcc.target/aarch64/simd/vaddlv_1.c (new file, 56 lines)
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+
+#include <arm_neon.h>
+
+#define FUNC(IT, OT, S)    \
+OT                         \
+foo_##S (IT a)             \
+{                          \
+  return vaddlv_##S (a);   \
+}
+
+FUNC (int8x8_t, int16_t, s8)
+/* { dg-final { scan-assembler-times {saddlv\th0, v0\.8b} 1} } */
+
+FUNC (int16x4_t, int32_t, s16)
+/* { dg-final { scan-assembler-times {saddlv\ts0, v0\.4h} 1} } */
+
+FUNC (int32x2_t, int64_t, s32)
+/* { dg-final { scan-assembler-times {saddlp\tv0\.1d, v0\.2s} 1} } */
+
+FUNC (uint8x8_t, uint16_t, u8)
+/* { dg-final { scan-assembler-times {uaddlv\th0, v0\.8b} 1} } */
+
+FUNC (uint16x4_t, uint32_t, u16)
+/* { dg-final { scan-assembler-times {uaddlv\ts0, v0\.4h} 1} } */
+
+FUNC (uint32x2_t, uint64_t, u32)
+/* { dg-final { scan-assembler-times {uaddlp\tv0\.1d, v0\.2s} 1} } */
+
+#define FUNCQ(IT, OT, S)   \
+OT                         \
+fooq_##S (IT a)            \
+{                          \
+  return vaddlvq_##S (a);  \
+}
+
+FUNCQ (int8x16_t, int16_t, s8)
+/* { dg-final { scan-assembler-times {saddlv\th0, v0\.16b} 1} } */
+
+FUNCQ (int16x8_t, int32_t, s16)
+/* { dg-final { scan-assembler-times {saddlv\ts0, v0\.8h} 1} } */
+
+FUNCQ (int32x4_t, int64_t, s32)
+/* { dg-final { scan-assembler-times {saddlv\td0, v0\.4s} 1} } */
+
+FUNCQ (uint8x16_t, uint16_t, u8)
+/* { dg-final { scan-assembler-times {uaddlv\th0, v0\.16b} 1} } */
+
+FUNCQ (uint16x8_t, uint32_t, u16)
+/* { dg-final { scan-assembler-times {uaddlv\ts0, v0\.8h} 1} } */
+
+FUNCQ (uint32x4_t, uint64_t, u32)
+/* { dg-final { scan-assembler-times {uaddlv\td0, v0\.4s} 1} } */
+