[AArch64] Optimise aarch64_add_offset for SVE VL constants

aarch64_add_offset contains code to decompose all SVE VL-based constants
into native operations.  The worst-case fallback is to load the number
of SVE elements into a register and use a general multiplication.
This patch improves that fallback by reusing expand_mult if
can_create_pseudo_p, rather than emitting a MULT pattern directly.
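
As a rough illustration (not part of the commit), the kind of sequence expand_mult can pick for a small constant multiplier looks like the host-side C sketch below; the function name and the multiplier 10 are hypothetical:

    /* Hypothetical sketch: multiply by the constant 10 with one
       add-with-shift and one shift instead of a general multiply.
       expand_mult makes this kind of choice automatically when it
       is given a constant operand.  */
    static inline long
    multiply_by_10 (long x)
    {
      long x5 = x + (x << 2);   /* x * 5 */
      return x5 << 1;           /* x * 10 */
    }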

In order to increase the chances of being able to use a simple
add-and-shift, the patch also tries to compute VG * the lowest set
bit of the multiplier, rather than always using CNTD as the basis
for the multiplication path.
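
As a hypothetical worked example (the numbers are illustrative, not taken from the patch): for an offset of 48 * VG, the lowest set bit of 48 is 16, so VG * 16 can be materialised directly, leaving only a multiplication by the odd residue 3, which is a single add-with-shift.  A minimal host-side C sketch of the split itself, with made-up variable names:

    #include <stdio.h>

    int
    main (void)
    {
      long factor = 48;                  /* example multiplier of VG */
      long low_bit = factor & -factor;   /* lowest set bit: 16 */
      long residue = factor / low_bit;   /* odd residue: 3 */
      printf ("compute VG * %ld, then multiply by %ld\n", low_bit, residue);
      return 0;
    }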

This is tested by the ACLE patches but is really an independent
improvement.

2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64.c (aarch64_add_offset): In the fallback
	multiplication case, try to compute VG * (lowest set bit) directly
	rather than always basing the multiplication on VG.  Use
	expand_mult for the multiplication if we can.

gcc/testsuite/
	* gcc.target/aarch64/sve/loop_add_4.c: Expect 10 INCWs and
	INCDs rather than 8.

From-SVN: r274519
Richard Sandiford 2019-08-15 08:50:00 +00:00 committed by Richard Sandiford
parent 0fdc30bcf5
commit 7d8bdfa7e4
4 changed files with 43 additions and 12 deletions

--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog

@@ -1,3 +1,10 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+
+        * config/aarch64/aarch64.c (aarch64_add_offset): In the fallback
+        multiplication case, try to compute VG * (lowest set bit) directly
+        rather than always basing the multiplication on VG.  Use
+        expand_mult for the multiplication if we can.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
 
         * config/aarch64/aarch64-protos.h

--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c

@@ -73,6 +73,7 @@
 #include "selftest-rtl.h"
 #include "rtx-vector-builder.h"
 #include "intl.h"
+#include "expmed.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -3465,20 +3466,36 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
         }
       else
         {
-          /* Use CNTD, then multiply it by FACTOR.  */
-          val = gen_int_mode (poly_int64 (2, 2), mode);
+          /* Base the factor on LOW_BIT if we can calculate LOW_BIT
+             directly, since that should increase the chances of being
+             able to use a shift and add sequence.  If LOW_BIT itself
+             is out of range, just use CNTD.  */
+          if (low_bit <= 16 * 8)
+            factor /= low_bit;
+          else
+            low_bit = 1;
+
+          val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
           val = aarch64_force_temporary (mode, temp1, val);
 
-          /* Go back to using a negative multiplication factor if we have
-             no register from which to subtract.  */
-          if (code == MINUS && src == const0_rtx)
+          if (can_create_pseudo_p ())
             {
-              factor = -factor;
-              code = PLUS;
+              rtx coeff1 = gen_int_mode (factor, mode);
+              val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
             }
-          rtx coeff1 = gen_int_mode (factor, mode);
-          coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
-          val = gen_rtx_MULT (mode, val, coeff1);
+          else
+            {
+              /* Go back to using a negative multiplication factor if we have
+                 no register from which to subtract.  */
+              if (code == MINUS && src == const0_rtx)
+                {
+                  factor = -factor;
+                  code = PLUS;
+                }
+              rtx coeff1 = gen_int_mode (factor, mode);
+              coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
+              val = gen_rtx_MULT (mode, val, coeff1);
+            }
         }
 
       if (shift > 0)

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog

@@ -1,3 +1,8 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+
+        * gcc.target/aarch64/sve/loop_add_4.c: Expect 10 INCWs and
+        INCDs rather than 8.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
 
         * gcc.target/aarch64/sve/revb_1.c: Restrict to little-endian targets.

--- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c

@@ -68,7 +68,8 @@ TEST_ALL (LOOP)
 /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
 /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */
+/* 2 for the calculations of -17 and 17.  */
+/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */
 /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */
@@ -85,7 +86,8 @@ TEST_ALL (LOOP)
 /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
 /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */
+/* 2 for the calculations of -17 and 17.  */
+/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */
 /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */