[AArch64] Canonicalise SVE predicate constants

This patch makes sure that we build all SVE predicate constants as
VNx16BI before RA, to encourage similar constants to be reused
between modes.  This is also useful for the ACLE, where the single
predicate type svbool_t is always a VNx16BI.
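
As a purely illustrative ACLE example (not part of this patch, and the
function names are invented): every intrinsic takes the same svbool_t
governing predicate regardless of element size, so the predicate type
itself always corresponds to VNx16BI:

    #include <arm_sve.h>

    /* The same svbool_t (VNx16BI) type governs both the .S operation
       and the .D operation.  */
    svfloat32_t add_f32 (svbool_t pg, svfloat32_t x, svfloat32_t y)
    {
      return svadd_x (pg, x, y);
    }

    svfloat64_t add_f64 (svbool_t pg, svfloat64_t x, svfloat64_t y)
    {
      return svadd_x (pg, x, y);
    }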

Also, and again to encourage reuse, the patch makes us use a .B PTRUE
for all ptrue-predicated operations, rather than (for example) using
a .S PTRUE for 32-bit operations and a .D PTRUE for 64-bit operations.

The only current case in which a .H, .S or .D operation needs to be
predicated by a "strict" .H/.S/.D PTRUE is the PTEST in a conditional
branch, which an earlier patch fixed to use an appropriate VNx16BI
constant.
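
For example (an illustrative sketch rather than actual output from this
patch; register allocation and scheduling will differ), instead of
materialising one all-true predicate per element size:

	ptrue	p0.s, all
	ptrue	p1.d, all
	fadd	z0.s, p0/m, z0.s, z1.s
	fadd	z2.d, p1/m, z2.d, z3.d

we can now share a single byte predicate, since a predicated .S or .D
operation reads only the low bit of each element's predicate field:

	ptrue	p0.b, all
	fadd	z0.s, p0/m, z0.s, z1.s
	fadd	z2.d, p0/m, z2.d, z3.d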

2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64.c (aarch64_target_reg): New function.
	(aarch64_emit_set_immediate): Likewise.
	(aarch64_ptrue_reg): Build a VNx16BI constant and then bitcast it.
	(aarch64_pfalse_reg): Likewise.
	(aarch64_convert_sve_data_to_pred): New function.
	(aarch64_sve_move_pred_via_while): Take an optional target register
	and the required register mode.
	(aarch64_expand_sve_const_pred_1): New function.
	(aarch64_expand_sve_const_pred): Likewise.
	(aarch64_expand_mov_immediate): Build an all-true predicate
	if the significant bits of the immediate are all true.  Use
	aarch64_expand_sve_const_pred for all compile-time predicate constants.
	(aarch64_mov_operand_p): Force predicate constants to be VNx16BI
	before register allocation.
	* config/aarch64/aarch64-sve.md (*vec_duplicate<mode>_reg): Use
	a VNx16BI PTRUE when splitting the memory alternative.
	(vec_duplicate<mode>): Update accordingly.
	(*pred_cmp<cmp_op><mode>): Rename to...
	(@aarch64_pred_cmp<cmp_op><mode>): ...this.

gcc/testsuite/
	* gcc.target/aarch64/sve/spill_4.c: Expect all ptrues to be .Bs.
	* gcc.target/aarch64/sve/single_1.c: Likewise.
	* gcc.target/aarch64/sve/single_2.c: Likewise.
	* gcc.target/aarch64/sve/single_3.c: Likewise.
	* gcc.target/aarch64/sve/single_4.c: Likewise.

From-SVN: r274415

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -846,7 +846,7 @@
   [(set (match_operand:SVE_ALL 0 "register_operand")
 	(vec_duplicate:SVE_ALL
 	  (match_operand:<VEL> 1 "aarch64_sve_dup_operand")))
-   (clobber (scratch:<VPRED>))])]
+   (clobber (scratch:VNx16BI))])]
   "TARGET_SVE"
   {
     if (MEM_P (operands[1]))
@@ -867,7 +867,7 @@
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
 	(vec_duplicate:SVE_ALL
 	  (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
-   (clobber (match_scratch:<VPRED> 2 "=X, X, Upl"))]
+   (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))]
   "TARGET_SVE"
   "@
    mov\t%0.<Vetype>, %<vwcore>1
@@ -877,9 +877,10 @@
   [(const_int 0)]
   {
     if (GET_CODE (operands[2]) == SCRATCH)
-      operands[2] = gen_reg_rtx (<VPRED>mode);
-    emit_move_insn (operands[2], CONSTM1_RTX (<VPRED>mode));
-    emit_insn (gen_sve_ld1r<mode> (operands[0], operands[2], operands[1],
+      operands[2] = gen_reg_rtx (VNx16BImode);
+    emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode));
+    rtx gp = gen_lowpart (<VPRED>mode, operands[2]);
+    emit_insn (gen_sve_ld1r<mode> (operands[0], gp, operands[1],
 				   CONST0_RTX (<MODE>mode)));
     DONE;
   }
@@ -2971,7 +2972,7 @@
 )
 
 ;; Predicated integer comparisons.
-(define_insn "*pred_cmp<cmp_op><mode>"
+(define_insn "@aarch64_pred_cmp<cmp_op><mode>"
   [(set (match_operand:<VPRED> 0 "register_operand" "=Upa, Upa")
 	(and:<VPRED>
 	  (SVE_INT_CMP:<VPRED>

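An aside on the rename above: an "@" prefix on a pattern name makes the
generators emit "parameterized" helpers, so that aarch64.c can look the
pattern up by rtx code and mode instead of hard-coding one gen_* function
per variant.  A sketch of the intended usage (as in
aarch64_convert_sve_data_to_pred below; the VNx16QImode argument here is
just an example):

    /* "@aarch64_pred_cmp<cmp_op><mode>" provides, among other helpers:  */
    insn_code icode = code_for_aarch64_pred_cmp (NE, VNx16QImode);
    /* ...which can then be used with create_*_operand and expand_insn.  */
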
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2546,6 +2546,36 @@ aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
 }
 
+/* Return TARGET if it is nonnull and a register of mode MODE.
+   Otherwise, return a fresh register of mode MODE if we can,
+   or TARGET reinterpreted as MODE if we can't.  */
+
+static rtx
+aarch64_target_reg (rtx target, machine_mode mode)
+{
+  if (target && REG_P (target) && GET_MODE (target) == mode)
+    return target;
+  if (!can_create_pseudo_p ())
+    {
+      gcc_assert (target);
+      return gen_lowpart (mode, target);
+    }
+  return gen_reg_rtx (mode);
+}
+
+/* Return a register that contains the constant in BUILDER, given that
+   the constant is a legitimate move operand.  Use TARGET as the register
+   if it is nonnull and convenient.  */
+
+static rtx
+aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
+{
+  rtx src = builder.build ();
+  target = aarch64_target_reg (target, GET_MODE (src));
+  emit_insn (gen_rtx_SET (target, src));
+  return target;
+}
+
 static rtx
 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
 {
@@ -2721,7 +2751,8 @@ rtx
 aarch64_ptrue_reg (machine_mode mode)
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
-  return force_reg (mode, CONSTM1_RTX (mode));
+  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
+  return gen_lowpart (mode, reg);
 }
 
 /* Return an all-false predicate register of mode MODE.  */
@@ -2730,7 +2761,26 @@ rtx
 aarch64_pfalse_reg (machine_mode mode)
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
-  return force_reg (mode, CONST0_RTX (mode));
+  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
+  return gen_lowpart (mode, reg);
+}
+
+/* Use a comparison to convert integer vector SRC into MODE, which is
+   the corresponding SVE predicate mode.  Use TARGET for the result
+   if it's nonnull and convenient.  */
+
+static rtx
+aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
+{
+  machine_mode src_mode = GET_MODE (src);
+  insn_code icode = code_for_aarch64_pred_cmp (NE, src_mode);
+  expand_operand ops[4];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], CONSTM1_RTX (mode), mode);
+  create_input_operand (&ops[2], src, src_mode);
+  create_input_operand (&ops[3], CONST0_RTX (src_mode), src_mode);
+  expand_insn (icode, 4, ops);
+  return ops[0].value;
 }
 
 /* Return true if we can move VALUE into a register using a single
@@ -3633,15 +3683,80 @@ aarch64_expand_sve_const_vector (rtx target, rtx src)
   return target;
 }
 
-/* Use WHILE to set predicate register DEST so that the first VL bits
-   are set and the rest are clear.  */
+/* Use WHILE to set a predicate register of mode MODE in which the first
+   VL bits are set and the rest are clear.  Use TARGET for the register
+   if it's nonnull and convenient.  */
 
-static void
-aarch64_sve_move_pred_via_while (rtx dest, unsigned int vl)
+static rtx
+aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
+				 unsigned int vl)
 {
   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
-  emit_insn (gen_while_ult (DImode, GET_MODE (dest),
-			    dest, const0_rtx, limit));
+  target = aarch64_target_reg (target, mode);
+  emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
+  return target;
+}
+
+/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
+   constant in BUILDER into an SVE predicate register.  Return the register
+   on success, otherwise return null.  Use TARGET for the register if
+   nonnull and convenient.  */
+
+static rtx
+aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder)
+{
+  if (builder.encoded_nelts () == 1)
+    /* A PFALSE or a PTRUE .B ALL.  */
+    return aarch64_emit_set_immediate (target, builder);
+
+  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
+    {
+      /* If we can load the constant using PTRUE, use it as-is.  */
+      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
+      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
+	return aarch64_emit_set_immediate (target, builder);
+
+      /* Otherwise use WHILE to set the first VL bits.  */
+      return aarch64_sve_move_pred_via_while (target, mode, vl);
+    }
+
+  return NULL_RTX;
+}
+
+/* Return an SVE predicate register that contains the VNx16BImode
+   constant in BUILDER, without going through the move expanders.
+
+   The returned register can have whatever mode seems most natural
+   given the contents of BUILDER.  Use TARGET for the result if
+   convenient.  */
+
+static rtx
+aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
+{
+  /* Try loading the constant using pure predicate operations.  */
+  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder))
+    return res;
+
+  /* Try forcing the constant to memory.  */
+  if (builder.full_nelts ().is_constant ())
+    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
+      {
+	target = aarch64_target_reg (target, VNx16BImode);
+	emit_move_insn (target, mem);
+	return target;
+      }
+
+  /* The last resort is to load the constant as an integer and then
+     compare it against zero.  Use -1 for set bits in order to increase
+     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
+  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
+				  builder.nelts_per_pattern ());
+  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
+    int_builder.quick_push (INTVAL (builder.elt (i))
+			    ? constm1_rtx : const0_rtx);
+  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
+					   int_builder.build ());
 }
 
 /* Set DEST to immediate IMM.  */
@@ -3770,6 +3885,32 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
   if (!CONST_INT_P (imm))
     {
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+	{
+	  /* Only the low bit of each .H, .S and .D element is defined,
+	     so we can set the upper bits to whatever we like.  If the
+	     predicate is all-true in MODE, prefer to set all the undefined
+	     bits as well, so that we can share a single .B predicate for
+	     all modes.  */
+	  if (imm == CONSTM1_RTX (mode))
+	    imm = CONSTM1_RTX (VNx16BImode);
+
+	  /* All methods for constructing predicate modes wider than VNx16BI
+	     will set the upper bits of each element to zero.  Expose this
+	     by moving such constants as a VNx16BI, so that all bits are
+	     significant and so that constants for different modes can be
+	     shared.  The wider constant will still be available as a
+	     REG_EQUAL note.  */
+	  rtx_vector_builder builder;
+	  if (aarch64_get_sve_pred_bits (builder, imm))
+	    {
+	      rtx res = aarch64_expand_sve_const_pred (dest, builder);
+	      if (dest != res)
+		emit_move_insn (dest, gen_lowpart (mode, res));
+	      return;
+	    }
+	}
+
       if (GET_CODE (imm) == HIGH
 	  || aarch64_simd_valid_immediate (imm, NULL))
 	{
@@ -3777,19 +3918,6 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	  return;
 	}
 
-      rtx_vector_builder builder;
-      if (GET_MODE_CLASS (GET_MODE (imm)) == MODE_VECTOR_BOOL
-	  && aarch64_get_sve_pred_bits (builder, imm))
-	{
-	  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
-	  int vl = aarch64_partial_ptrue_length (builder, elt_size);
-	  if (vl > 0)
-	    {
-	      aarch64_sve_move_pred_via_while (dest, vl);
-	      return;
-	    }
-	}
-
       if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
 	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
 	  {
@@ -15178,7 +15306,17 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
     return true;
 
   if (VECTOR_MODE_P (GET_MODE (x)))
-    return aarch64_simd_valid_immediate (x, NULL);
+    {
+      /* Require predicate constants to be VNx16BI before RA, so that we
+	 force everything to have a canonical form.  */
+      if (!lra_in_progress
+	  && !reload_completed
+	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
+	  && GET_MODE (x) != VNx16BImode)
+	return false;
+      return aarch64_simd_valid_immediate (x, NULL);
+    }
 
   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;

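To make the new expansion strategy concrete, here is a sketch of the main
paths through aarch64_expand_sve_const_pred (the constants, registers and
exact output are hypothetical; there is also a constant-pool fallback for
fixed-length modes):

    /* 1. Constants with a PTRUE encoding are emitted directly:
	      ptrue   p0.s, vl8	      // first 8 .S elements true

       2. A leading run of VL set bits with no matching PTRUE pattern
	  (e.g. VL == 11) uses a WHILE instead:
	      mov     x0, 11
	      whilelo p0.b, xzr, x0

       3. Anything else is loaded as a -1/0 byte mask and compared
	  against zero via aarch64_convert_sve_data_to_pred:
	      ptrue   p1.b, all
	      cmpne   p0.b, p1/z, z0.b, #0     // z0 holds the mask  */
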
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c
--- a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c
@@ -40,10 +40,7 @@ TEST_LOOP (double, 3.0)
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl16\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl8\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl4\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 11 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_2.c b/gcc/testsuite/gcc.target/aarch64/sve/single_2.c
--- a/gcc/testsuite/gcc.target/aarch64/sve/single_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/single_2.c
@@ -16,10 +16,7 @@
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl32\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl16\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl8\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 11 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_3.c b/gcc/testsuite/gcc.target/aarch64/sve/single_3.c
--- a/gcc/testsuite/gcc.target/aarch64/sve/single_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/single_3.c
@@ -16,10 +16,7 @@
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl64\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl32\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl16\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 11 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_4.c b/gcc/testsuite/gcc.target/aarch64/sve/single_4.c
--- a/gcc/testsuite/gcc.target/aarch64/sve/single_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/single_4.c
@@ -16,10 +16,7 @@
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl128\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl64\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl32\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 11 } } */
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c
@@ -24,9 +24,10 @@ TEST_LOOP (uint16_t, 0x1234);
 TEST_LOOP (uint32_t, 0x12345);
 TEST_LOOP (uint64_t, 0x123456);
 
 /* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.b,} 6 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s,} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d,} 3 } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.h,} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.d,} } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 3 } } */