aarch64: Reimplememnt vmovn/vmovl intrinsics with builtins instead
Turns out __builtin_convertvector is not as good a fit for the widening and narrowing intrinsics as I had hoped. During the veclower phase we lower most of it to bitfield operations and hope DCE cleans it back up into vector pack/unpack and extend operations. I received reports that in more complex cases GCC fails to do that and we're left with many vector extract operations that clutter the output. I think veclower can be improved on that front, but for GCC 10 I'd like to just implement these builtins with a good old RTL builtin rather than inline asm. gcc/ * config/aarch64/aarch64-simd.md (aarch64_<su>xtl<mode>): Define. (aarch64_xtn<mode>): Likewise. * config/aarch64/aarch64-simd-builtins.def (sxtl, uxtl, xtn): Define builtins. * config/aarch64/arm_neon.h (vmovl_s8): Reimplement using builtin. (vmovl_s16): Likewise. (vmovl_s32): Likewise. (vmovl_u8): Likewise. (vmovl_u16): Likewise. (vmovl_u32): Likewise. (vmovn_s16): Likewise. (vmovn_s32): Likewise. (vmovn_s64): Likewise. (vmovn_u16): Likewise. (vmovn_u32): Likewise. (vmovn_u64): Likewise.
This commit is contained in:
parent
52cd1cd1b6
commit
48f8d1d48f
@ -171,6 +171,13 @@
|
||||
BUILTIN_VQN (TERNOP, raddhn2, 0, NONE)
|
||||
BUILTIN_VQN (TERNOP, rsubhn2, 0, NONE)
|
||||
|
||||
/* Implemented by aarch64_<us>xtl<mode>. */
|
||||
BUILTIN_VQN (UNOP, sxtl, 0, NONE)
|
||||
BUILTIN_VQN (UNOPU, uxtl, 0, NONE)
|
||||
|
||||
/* Implemented by aarch64_xtn<mode>. */
|
||||
BUILTIN_VQN (UNOP, xtn, 0, NONE)
|
||||
|
||||
BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, ALL)
|
||||
/* Implemented by aarch64_<sur>qmovn<mode>. */
|
||||
BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, ALL)
|
||||
|
@ -7301,6 +7301,20 @@
|
||||
[(set_attr "type" "neon_shift_imm_long")]
|
||||
)
|
||||
|
||||
(define_expand "aarch64_<su>xtl<mode>"
|
||||
[(set (match_operand:VQN 0 "register_operand" "=w")
|
||||
(ANY_EXTEND:VQN (match_operand:<VNARROWQ> 1 "register_operand" "w")))]
|
||||
"TARGET_SIMD"
|
||||
""
|
||||
)
|
||||
|
||||
(define_expand "aarch64_xtn<mode>"
|
||||
[(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
|
||||
(truncate:<VNARROWQ> (match_operand:VQN 1 "register_operand" "w")))]
|
||||
"TARGET_SIMD"
|
||||
""
|
||||
)
|
||||
|
||||
;; Truncate a 128-bit integer vector to a 64-bit vector.
|
||||
(define_insn "trunc<mode><Vnarrowq>2"
|
||||
[(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
|
||||
|
@ -8709,42 +8709,42 @@ __extension__ extern __inline int16x8_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovl_s8 (int8x8_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, int16x8_t);
|
||||
return __builtin_aarch64_sxtlv8hi (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline int32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovl_s16 (int16x4_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, int32x4_t);
|
||||
return __builtin_aarch64_sxtlv4si (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline int64x2_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovl_s32 (int32x2_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, int64x2_t);
|
||||
return __builtin_aarch64_sxtlv2di (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint16x8_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovl_u8 (uint8x8_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, uint16x8_t);
|
||||
return __builtin_aarch64_uxtlv8hi_uu (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint32x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovl_u16 (uint16x4_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, uint32x4_t);
|
||||
return __builtin_aarch64_uxtlv4si_uu (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint64x2_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovl_u32 (uint32x2_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, uint64x2_t);
|
||||
return __builtin_aarch64_uxtlv2di_uu (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline int8x16_t
|
||||
@ -8796,42 +8796,42 @@ __extension__ extern __inline int8x8_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovn_s16 (int16x8_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, int8x8_t);
|
||||
return __builtin_aarch64_xtnv8hi (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline int16x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovn_s32 (int32x4_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, int16x4_t);
|
||||
return __builtin_aarch64_xtnv4si (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline int32x2_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovn_s64 (int64x2_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, int32x2_t);
|
||||
return __builtin_aarch64_xtnv2di (__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint8x8_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovn_u16 (uint16x8_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, uint8x8_t);
|
||||
return (uint8x8_t)__builtin_aarch64_xtnv8hi ((int16x8_t) __a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint16x4_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovn_u32 (uint32x4_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, uint16x4_t);
|
||||
return (uint16x4_t) __builtin_aarch64_xtnv4si ((int32x4_t )__a);
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint32x2_t
|
||||
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vmovn_u64 (uint64x2_t __a)
|
||||
{
|
||||
return __builtin_convertvector (__a, uint32x2_t);
|
||||
return (uint32x2_t) __builtin_aarch64_xtnv2di ((int64x2_t) __a);
|
||||
}
|
||||
|
||||
#define vmull_high_lane_s16(a, b, c) \
|
||||
|
Loading…
Reference in New Issue
Block a user