aarch64: Model zero-high-half semantics of ADDHN/SUBHN instructions

Model the zero-high-half semantics of the narrowing arithmetic Neon
instructions in the aarch64_<sur><addsub>hn<mode> RTL pattern.
Modeling these semantics allows for better RTL combinations while
also removing some register allocation issues as the compiler now
knows that the operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of
this change.

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>):
	Change to an expander that emits the correct instruction
	depending on endianness.
	(aarch64_<sur><addsub>hn<mode>_insn_le): Define.
	(aarch64_<sur><addsub>hn<mode>_insn_be): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.
This commit is contained in:
Jonathan Wright 2021-06-14 16:18:44 +01:00
parent d0889b5d37
commit dbfc149b63
2 changed files with 83 additions and 6 deletions

View File

@ -4661,16 +4661,53 @@
;; <r><addsub>hn<q>.
(define_insn "aarch64_<sur><addsub>hn<mode>"
[(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
(match_operand:VQN 2 "register_operand" "w")]
ADDSUBHN))]
"TARGET_SIMD"
;; Little-endian form of the narrowing ADDHN/SUBHN/RADDHN/RSUBHN pattern.
;; The narrow result occupies the low half of the destination vector and
;; operand 3 (constrained to constant zero by the
;; aarch64_simd_or_scalar_imm_zero predicate) fills the high half,
;; modeling the instruction's zero-high-half semantics so the RTL
;; optimizers know the full destination register is written.
(define_insn "aarch64_<sur><addsub>hn<mode>_insn_le"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
(match_operand:VQN 2 "register_operand" "w")]
ADDSUBHN)
(match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")))]
"TARGET_SIMD && !BYTES_BIG_ENDIAN"
"<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
[(set_attr "type" "neon_<addsub>_halve_narrow_q")]
)
;; Big-endian form of the narrowing ADDHN/SUBHN/RADDHN/RSUBHN pattern.
;; Identical to the _le variant except that the vec_concat operand order
;; is swapped to match big-endian lane numbering: the zero (operand 3)
;; comes first and the narrow unspec result second.
(define_insn "aarch64_<sur><addsub>hn<mode>_insn_be"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
(match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")
(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
(match_operand:VQN 2 "register_operand" "w")]
ADDSUBHN)))]
"TARGET_SIMD && BYTES_BIG_ENDIAN"
"<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
[(set_attr "type" "neon_<addsub>_halve_narrow_q")]
)
;; Expander for the narrowing ADDHN/SUBHN/RADDHN/RSUBHN intrinsics.
;; Emits the endianness-appropriate zero-high-half insn into a wide
;; (<VNARROWQ2>) temporary, then moves the low half out via a subreg so
;; the intrinsic still sees a narrow (<VNARROWQ>) result.
(define_expand "aarch64_<sur><addsub>hn<mode>"
[(set (match_operand:<VNARROWQ> 0 "register_operand")
(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")
(match_operand:VQN 2 "register_operand")]
ADDSUBHN))]
"TARGET_SIMD"
{
/* Wide temporary: narrow result in one half, explicit zero in the other.  */
rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
if (BYTES_BIG_ENDIAN)
emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_be (tmp, operands[1],
operands[2], CONST0_RTX (<VNARROWQ>mode)));
else
emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_le (tmp, operands[1],
operands[2], CONST0_RTX (<VNARROWQ>mode)));
/* The intrinsic expects a narrow result, so emit a subreg that will get
optimized away as appropriate. */
emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
<VNARROWQ2>mode));
DONE;
}
)
(define_insn "aarch64_<sur><addsub>hn2<mode>_insn_le"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>

View File

@ -74,6 +74,42 @@ TEST_UNARY (vqmovn, uint8x16_t, uint16x8_t, u16, u8)
TEST_UNARY (vqmovn, uint16x8_t, uint32x4_t, u32, u16)
TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
/* Define a function of the form:

     <rettype> test_<name>_<fs>_zero_high (<intype> a, <intype> b)

   that combines the result of the narrowing intrinsic <name>_<fs> with
   a zero vector in the high half.  If the compiler models the
   zero-high-half semantics of the underlying instruction, the
   vdup_n/vcombine should be optimized away entirely (verified by the
   scan-assembler-not "dup" directive below).  */
#define TEST_ARITH(name, rettype, intype, fs, rs) \
rettype test_ ## name ## _ ## fs ## _zero_high \
(intype a, intype b) \
{ \
return vcombine_ ## rs (name ## _ ## fs (a, b), \
vdup_n_ ## rs (0)); \
}
/* Add-high-narrow: ADDHN.  */
TEST_ARITH (vaddhn, int8x16_t, int16x8_t, s16, s8)
TEST_ARITH (vaddhn, int16x8_t, int32x4_t, s32, s16)
TEST_ARITH (vaddhn, int32x4_t, int64x2_t, s64, s32)
TEST_ARITH (vaddhn, uint8x16_t, uint16x8_t, u16, u8)
TEST_ARITH (vaddhn, uint16x8_t, uint32x4_t, u32, u16)
TEST_ARITH (vaddhn, uint32x4_t, uint64x2_t, u64, u32)
/* Rounding add-high-narrow: RADDHN.  */
TEST_ARITH (vraddhn, int8x16_t, int16x8_t, s16, s8)
TEST_ARITH (vraddhn, int16x8_t, int32x4_t, s32, s16)
TEST_ARITH (vraddhn, int32x4_t, int64x2_t, s64, s32)
TEST_ARITH (vraddhn, uint8x16_t, uint16x8_t, u16, u8)
TEST_ARITH (vraddhn, uint16x8_t, uint32x4_t, u32, u16)
TEST_ARITH (vraddhn, uint32x4_t, uint64x2_t, u64, u32)
/* Subtract-high-narrow: SUBHN.  */
TEST_ARITH (vsubhn, int8x16_t, int16x8_t, s16, s8)
TEST_ARITH (vsubhn, int16x8_t, int32x4_t, s32, s16)
TEST_ARITH (vsubhn, int32x4_t, int64x2_t, s64, s32)
TEST_ARITH (vsubhn, uint8x16_t, uint16x8_t, u16, u8)
TEST_ARITH (vsubhn, uint16x8_t, uint32x4_t, u32, u16)
TEST_ARITH (vsubhn, uint32x4_t, uint64x2_t, u64, u32)
/* Rounding subtract-high-narrow: RSUBHN.  */
TEST_ARITH (vrsubhn, int8x16_t, int16x8_t, s16, s8)
TEST_ARITH (vrsubhn, int16x8_t, int32x4_t, s32, s16)
TEST_ARITH (vrsubhn, int32x4_t, int64x2_t, s64, s32)
TEST_ARITH (vrsubhn, uint8x16_t, uint16x8_t, u16, u8)
TEST_ARITH (vrsubhn, uint16x8_t, uint32x4_t, u32, u16)
TEST_ARITH (vrsubhn, uint32x4_t, uint64x2_t, u64, u32)
/* { dg-final { scan-assembler-not "dup\\t" } } */
/* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} } */
@ -88,3 +124,7 @@ TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
/* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} } */
/* { dg-final { scan-assembler-times "\\tuqxtn\\tv" 3} } */
/* { dg-final { scan-assembler-times "\\tsqxtn\\tv" 3} } */
/* { dg-final { scan-assembler-times "\\taddhn\\tv" 6} } */
/* { dg-final { scan-assembler-times "\\tsubhn\\tv" 6} } */
/* { dg-final { scan-assembler-times "\\trsubhn\\tv" 6} } */
/* { dg-final { scan-assembler-times "\\traddhn\\tv" 6} } */