AArch64: Fix left fold sum reduction RTL patterns [PR104049]

As the discussion in the PR pointed out the RTL we have for the REDUC_PLUS patterns are wrong. The UNSPECs are modelled as returning a vector and then in an expand pattern we emit a vec_select of the 0th element to get the scalar. This is incorrect as the instruction itself already only returns a single scalar and by declaring it returns a vector it allows combine to push in a subreg into the pattern, which causes reload to make duplicate moves. This patch corrects this by removing the weird indirection and making the RTL pattern model the correct semantics of the instruction immediately. gcc/ChangeLog: PR target/104049 * config/aarch64/aarch64-simd.md (aarch64_reduc_plus_internal<mode>): Fix RTL and rename to... (reduc_plus_scal_<mode>): ... This. (reduc_plus_scal_v4sf): Moved. (aarch64_reduc_plus_internalv2si): Fix RTL and rename to... (reduc_plus_scal_v2si): ... This. gcc/testsuite/ChangeLog: PR target/104049 * gcc.target/aarch64/vadd_reduc-1.c: New test. * gcc.target/aarch64/vadd_reduc-2.c: New test.
2022-04-07 08:27:53 +01:00 · 2022-04-07 08:27:53 +01:00 · 024edf0895
parent fdd81afcf1
commit 024edf0895
3 changed files with 102 additions and 60 deletions
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@ -3385,20 +3385,6 @@

 ;; 'across lanes' add.

-(define_expand "reduc_plus_scal_<mode>"
-  [(match_operand:<VEL> 0 "register_operand")
-   (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")]
-	       UNSPEC_ADDV)]
-  "TARGET_SIMD"
-  {
-    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
-    rtx scratch = gen_reg_rtx (<MODE>mode);
-    emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
-    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
-    DONE;
-  }
-)
-
 (define_insn "aarch64_faddp<mode>"
 [(set (match_operand:VHSDF 0 "register_operand" "=w")
       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
@ -3409,15 +3395,58 @@
  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )

-(define_insn "aarch64_reduc_plus_internal<mode>"
- [(set (match_operand:VDQV 0 "register_operand" "=w")
-       (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
+(define_insn "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL> [(match_operand:VDQV 1 "register_operand" "w")]
 		    UNSPEC_ADDV))]
 "TARGET_SIMD"
 "add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>"
  [(set_attr "type" "neon_reduc_add<q>")]
 )

+(define_insn "reduc_plus_scal_v2si"
+ [(set (match_operand:SI 0 "register_operand" "=w")
+       (unspec:SI [(match_operand:V2SI 1 "register_operand" "w")]
+		    UNSPEC_ADDV))]
+ "TARGET_SIMD"
+ "addp\\t%0.2s, %1.2s, %1.2s"
+  [(set_attr "type" "neon_reduc_add")]
+)
+
+;; ADDV with result zero-extended to SI/DImode (for popcount).
+(define_insn "aarch64_zero_extend<GPI:mode>_reduc_plus_<VDQV_E:mode>"
+ [(set (match_operand:GPI 0 "register_operand" "=w")
+       (zero_extend:GPI
+	(unspec:<VDQV_E:VEL> [(match_operand:VDQV_E 1 "register_operand" "w")]
+			     UNSPEC_ADDV)))]
+ "TARGET_SIMD"
+ "add<VDQV_E:vp>\\t%<VDQV_E:Vetype>0, %1.<VDQV_E:Vtype>"
+  [(set_attr "type" "neon_reduc_add<VDQV_E:q>")]
+)
+
+(define_insn "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
+		   UNSPEC_FADDV))]
+ "TARGET_SIMD"
+ "faddp\\t%<Vetype>0, %1.<Vtype>"
+  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
+)
+
+(define_expand "reduc_plus_scal_v4sf"
+ [(set (match_operand:SF 0 "register_operand")
+       (unspec:SF [(match_operand:V4SF 1 "register_operand")]
+		    UNSPEC_FADDV))]
+ "TARGET_SIMD"
+{
+  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
+  rtx scratch = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
+  emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
+  DONE;
+})
+
 (define_insn "aarch64_<su>addlv<mode>"
 [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
       (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
@ -3436,49 +3465,6 @@
  [(set_attr "type" "neon_reduc_add<q>")]
 )

-;; ADDV with result zero-extended to SI/DImode (for popcount).
-(define_insn "aarch64_zero_extend<GPI:mode>_reduc_plus_<VDQV_E:mode>"
- [(set (match_operand:GPI 0 "register_operand" "=w")
-       (zero_extend:GPI
-	(unspec:<VDQV_E:VEL> [(match_operand:VDQV_E 1 "register_operand" "w")]
-			     UNSPEC_ADDV)))]
- "TARGET_SIMD"
- "add<VDQV_E:vp>\\t%<VDQV_E:Vetype>0, %1.<VDQV_E:Vtype>"
-  [(set_attr "type" "neon_reduc_add<VDQV_E:q>")]
-)
-
-(define_insn "aarch64_reduc_plus_internalv2si"
- [(set (match_operand:V2SI 0 "register_operand" "=w")
-       (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
-		    UNSPEC_ADDV))]
- "TARGET_SIMD"
- "addp\\t%0.2s, %1.2s, %1.2s"
-  [(set_attr "type" "neon_reduc_add")]
-)
-
-(define_insn "reduc_plus_scal_<mode>"
- [(set (match_operand:<VEL> 0 "register_operand" "=w")
-       (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
-		   UNSPEC_FADDV))]
- "TARGET_SIMD"
- "faddp\\t%<Vetype>0, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
-)
-
-(define_expand "reduc_plus_scal_v4sf"
- [(set (match_operand:SF 0 "register_operand")
-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
-		    UNSPEC_FADDV))]
- "TARGET_SIMD"
-{
-  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
-  rtx scratch = gen_reg_rtx (V4SFmode);
-  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
-  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
-  emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
-  DONE;
-})
-
 (define_insn "clrsb<mode>2"
  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
        (clrsb:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
--- a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**bar:
+**	...
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
--- a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+**test:
+**	...
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	and	w1, w0, 65535
+**	add	w0, w1, w0, lsr 16
+**	lsr	w0, w0, 1
+**	ret
+*/
+int test (uint8_t *p, uint32_t t[1][1], int n) {
+
+  int sum = 0;
+  uint32_t a0;
+  for (int i = 0; i < 4; i++, p++)
+    t[i][0] = p[0];
+
+  for (int i = 0; i < 4; i++) {
+    {
+      int t0 = t[0][i] + t[0][i];
+      a0 = t0;
+    };
+    sum += a0;
+  }
+  return (((uint16_t)sum) + ((uint32_t)sum >> 16)) >> 1;
+}