aarch64: Add backend support for expanding __builtin_memset

This patch implements aarch64 backend expansion for __builtin_memset. Most of the implementation is based on the expansion of __builtin_memcpy. We change the values of CLEAR_RATIO and SET_RATIO for cases where we do not have to strictly align and where we can benefit from NEON instructions in the backend. gcc/ChangeLog: * config/aarch64/aarch64-protos.h (aarch64_expand_setmem): New declaration. * config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case for E_V16QImode. (aarch64_set_one_block_and_progress_pointer): New helper for aarch64_expand_setmem. (aarch64_expand_setmem): Define the expansion for memset. * config/aarch64/aarch64.h (CLEAR_RATIO): Tweak to favor aarch64_expand_setmem when allowed and profitable. (SET_RATIO): Likewise. * config/aarch64/aarch64.md: Define pattern for setmemdi. gcc/testsuite/ChangeLog: * g++.dg/tree-ssa/pr90883.C: Remove xfail for aarch64. * gcc.dg/tree-prof/stringop-2.c: Add xfail for aarch64. * gcc.target/aarch64/memset-corner-cases.c: New test. * gcc.target/aarch64/memset-q-reg.c: New test.
2020-11-13 10:48:27 +00:00 · 2020-11-13 10:48:27 +00:00 · 54bbde550e
commit 54bbde550e
parent 5e28fca09c
8 changed files with 334 additions and 11 deletions
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@ -510,6 +510,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
 bool aarch64_function_arg_regno_p (unsigned);
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@ -7030,6 +7030,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
    case E_V4SImode:
      return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);

+    case E_V16QImode:
+      return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
+
    default:
      gcc_unreachable ();
    }
@ -21276,6 +21279,135 @@ aarch64_expand_cpymem (rtx *operands)
  return true;
 }

+/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
+   SRC is a register we have created with the duplicated value to be set.
+
+   Store one block of MODE's width from SRC to the memory *DST and move
+   *DST forward past that block:
+     - 256 bits: a single store pair that writes SRC twice;
+     - 128 bits: a single move of SRC in its own mode;
+     - narrower: a move of the low MODE-sized part of SRC.  */
+static void
+aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
+					    machine_mode mode)
+{
+  /* If we are setting 128bits or 256bits, we can do that straight from
+     the SIMD register we prepared.  For 256bits MODE is replaced by the
+     mode of SRC and SRC is stored twice with a single store pair.  */
+  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+    {
+      mode = GET_MODE (src);
+      /* "Cast" the *dst to the correct mode.  */
+      *dst = adjust_address (*dst, mode, 0);
+      /* Emit the memset.  */
+      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
+					 aarch64_progress_pointer (*dst), src));
+
+      /* Move the pointer forward past the 32 bytes just set.  */
+      *dst = aarch64_move_pointer (*dst, 32);
+      return;
+    }
+  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+    {
+      /* "Cast" the *dst to the mode of SRC.  */
+      *dst = adjust_address (*dst, GET_MODE (src), 0);
+      /* Emit the memset.  */
+      emit_move_insn (*dst, src);
+      /* Move the pointer forward past the 16 bytes just set.  */
+      *dst = aarch64_move_pointer (*dst, 16);
+      return;
+    }
+  /* For setting less, we have to extract the right amount from src: take
+     its low MODE-sized part.  */
+  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
+
+  /* "Cast" the *dst to the correct mode.  */
+  *dst = adjust_address (*dst, mode, 0);
+  /* Emit the memset.  */
+  emit_move_insn (*dst, reg);
+  /* Move the pointer forward.  */
+  *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Expand setmem, as if from a __builtin_memset.  Return true if
+   we succeed, otherwise return false.
+
+   OPERANDS[0] is the destination memory, OPERANDS[1] the length in bytes
+   and OPERANDS[2] the value to store.  Only constant lengths no larger
+   than the maximum computed below are expanded; for anything else nothing
+   is emitted and false is returned.  */
+
+bool
+aarch64_expand_setmem (rtx *operands)
+{
+  int n, mode_bits;
+  unsigned HOST_WIDE_INT len;
+  rtx dst = operands[0];
+  rtx val = operands[2], src;
+  rtx base;
+  machine_mode cur_mode = BLKmode, next_mode;
+
+  /* We can't do anything smart if the amount to set is not constant.  */
+  if (!CONST_INT_P (operands[1]))
+    return false;
+
+  bool speed_p = !optimize_function_for_size_p (cfun);
+
+  /* Default the maximum to 256-bytes.  */
+  unsigned max_set_size = 256;
+
+  /* In case we are optimizing for size or if the core does not
+     want to use STP Q regs, lower the max_set_size (it is halved).  */
+  max_set_size = (!speed_p
+		  || (aarch64_tune_params.extra_tuning_flags
+		      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+		  ? max_set_size / 2 : max_set_size;
+
+  len = INTVAL (operands[1]);
+
+  /* Upper bound check.  */
+  if (len > max_set_size)
+    return false;
+
+  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
+
+  /* Prepare the val using a DUP/MOVI v0.16B, val.  */
+  src = expand_vector_broadcast (V16QImode, val);
+  src = force_reg (V16QImode, src);
+
+  /* Convert len to bits to make the rest of the code simpler.  */
+  n = len * BITS_PER_UNIT;
+
+  /* Maximum amount to set in one go.  Chunks are limited to the size of
+     TImode when we are optimizing for speed and the
+     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning flag is set; in every other
+     case (including optimizing for size, where the flag is not consulted
+     here) 256-bit chunks are allowed.  setmem expand pattern is only
+     turned on for TARGET_SIMD.  */
+  const int copy_limit = (speed_p
+			  && (aarch64_tune_params.extra_tuning_flags
+			      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+			  ? GET_MODE_BITSIZE (TImode) : 256;
+
+  while (n > 0)
+    {
+      /* Find the largest integer mode in which to do the set without
+	 over writing, i.e. the widest one that fits in both the remaining
+	 amount N and copy_limit.  */
+      opt_scalar_int_mode mode_iter;
+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	  cur_mode = mode_iter.require ();
+
+      gcc_assert (cur_mode != BLKmode);
+
+      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
+
+      n -= mode_bits;
+
+      /* Do certain trailing sets as overlapping if it's going to be
+	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
+	 byte set it's more efficient to do two overlapping 8 byte stores than
+	 8 + 4 + 2 + 1.  The remaining amount is rounded up to the smallest
+	 integer mode that holds it, DST is moved back by the difference so
+	 that the store ends exactly at the end of the region, and N becomes
+	 the size of that mode.  */
+      if (n > 0 && n < copy_limit / 2)
+	{
+	  next_mode = smallest_mode_for_size (n, MODE_INT);
+	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+	  gcc_assert (n_bits <= mode_bits);
+	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+	  n = n_bits;
+	}
+    }
+
+  return true;
+}
+
+
 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@ -1024,16 +1024,19 @@ typedef struct
 #define MOVE_RATIO(speed) \
  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))

-/* For CLEAR_RATIO, when optimizing for size, give a better estimate
-   of the length of a memset call, but use the default otherwise.  */
+/* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
+   we would use more than 3 scalar instructions.
+   Otherwise follow a sensible default: when optimizing for size, give a better
+   estimate of the length of a memset call, but use the default otherwise.  */
 #define CLEAR_RATIO(speed) \
-  ((speed) ? 15 : AARCH64_CALL_RATIO)
+  (!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)

-/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant, so when
-   optimizing for size adjust the ratio to account for the overhead of loading
-   the constant.  */
+/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant.  Without
+   -mstrict-align, make decisions in "setmem".  Otherwise follow a sensible
+   default: when optimizing for size adjust the ratio to account for the
+   overhead of loading the constant.  */
 #define SET_RATIO(speed) \
-  ((speed) ? 15 : AARCH64_CALL_RATIO - 2)
+  (!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)

 /* Disable auto-increment in move_by_pieces et al.  Use of auto-increment is
   rarely a good idea in straight-line code since it adds an extra address
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@ -1571,6 +1571,24 @@
 }
 )

+;; 0 is dst
+;; 1 is size of set in bytes
+;; 2 is val
+;; 3 is alignment
+
+(define_expand "setmemdi"
+  [(set (match_operand:BLK 0 "memory_operand")     ;; Dest
+        (match_operand:QI  2 "nonmemory_operand")) ;; Value
+   (use (match_operand:DI  1 "immediate_operand")) ;; Length
+   (match_operand          3 "immediate_operand")] ;; Align
+  "TARGET_SIMD"
+{
+  if (aarch64_expand_setmem (operands))
+    DONE;
+
+  FAIL;
+})
+
 ;; Operands 1 and 3 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
 (define_insn "load_pair_sw_<SX:mode><SX2:mode>"
--- a/gcc/testsuite/g++.dg/tree-ssa/pr90883.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr90883.C
@ -15,6 +15,6 @@

 // We want to match enough here to capture that we deleted an empty
 // constructor store
-// aarch64 and mips will expand to loop to clear because CLEAR_RATIO.
-// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { aarch64-*-* mips*-*-* } } } }
+// mips will expand to loop to clear because CLEAR_RATIO.
+// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { mips*-*-* } } } }

--- a/gcc/testsuite/gcc.dg/tree-prof/stringop-2.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/stringop-2.c
@ -20,6 +20,6 @@ main()
   return 0;
 }
 /* autofdo doesn't support value profiling for now: */
-/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile"} } */
+/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile" { target { ! aarch64*-*-* } } } } */
 /* The versioned memset of size 4 should be optimized to an assignment.
-   { dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" } } */
+   { dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" { target { ! aarch64*-*-* } } } } */
--- a/gcc/testsuite/gcc.target/aarch64/memset-corner-cases.c
+++ b/gcc/testsuite/gcc.target/aarch64/memset-corner-cases.c
@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/* One byte variable set should be scalar
+**set1byte:
+**	strb	w1, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set1byte (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 1);
+}
+
+/* Special cases for setting 0.  */
+/* 1-byte should be STRB with wzr
+**set0byte:
+**	strb	wzr, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set0byte (int64_t *src)
+{
+  __builtin_memset (src, 0, 1);
+}
+
+/* 35bytes would become 4 scalar instructions.  So favour NEON.
+**set0neon:
+**	movi	v0.4s, 0
+**	stp	q0, q0, \[x0\]
+**	str	wzr, \[x0, 31\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set0neon (int64_t *src)
+{
+  __builtin_memset (src, 0, 35);
+}
+
+/* 36bytes should be scalar however.
+**set0scalar:
+**	stp	xzr, xzr, \[x0\]
+**	stp	xzr, xzr, \[x0, 16\]
+**	str	wzr, \[x0, 32\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set0scalar (int64_t *src)
+{
+  __builtin_memset (src, 0, 36);
+}
+
+
+/* 256-bytes expanded
+**set256byte:
+**	dup	v0.16b, w1
+**	stp	q0, q0, \[x0\]
+**	stp	q0, q0, \[x0, 32\]
+**	stp	q0, q0, \[x0, 64\]
+**	stp	q0, q0, \[x0, 96\]
+**	stp	q0, q0, \[x0, 128\]
+**	stp	q0, q0, \[x0, 160\]
+**	stp	q0, q0, \[x0, 192\]
+**	stp	q0, q0, \[x0, 224\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256byte (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 256);
+}
+
+/* More than 256 bytes goes to memset
+**set257byte:
+**	mov	x2, 257
+**	mov	w1, 99
+**	b	memset
+*/
+void __attribute__((__noinline__))
+set257byte (int64_t *src)
+{
+  __builtin_memset (src, 'c', 257);
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
--- a/gcc/testsuite/gcc.target/aarch64/memset-q-reg.c
+++ b/gcc/testsuite/gcc.target/aarch64/memset-q-reg.c
@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+/*
+**set128bits:
+**	dup	v0.16b, w1
+**	str	q0, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set128bits (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 2*sizeof(int64_t));
+}
+
+/*
+**set128bitszero:
+**	stp	xzr, xzr, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set128bitszero (int64_t *src)
+{
+  __builtin_memset (src, 0, 2*sizeof(int64_t));
+}
+
+/*
+** set128bitsplus:
+**	dup	v0.16b, w1
+**	str	q0, \[x0\]
+**	str	q0, \[x0, 12\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set128bitsplus (int64_t *src, char c)
+{
+  __builtin_memset (src, c, 7*sizeof(int32_t));
+}
+
+/*
+** set256bits:
+**	movi	v0.16b, 0x63
+**	stp	q0, q0, \[x0\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256bits (int64_t *src)
+{
+  __builtin_memset (src, 'c', 4*sizeof(int64_t));
+}
+
+/*
+**set256bitszero:
+**	stp	xzr, xzr, \[x0\]
+**	stp	xzr, xzr, \[x0, 16\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256bitszero (int64_t *src)
+{
+  __builtin_memset (src, 0, 4*sizeof(int64_t));
+}
+
+/*
+** set256bitsplus:
+**	movi	v0.16b, 0x63
+**	stp	q0, q0, \[x0\]
+**	str	q0, \[x0, 32\]
+**	str	d0, \[x0, 48\]
+**	ret
+*/
+void __attribute__((__noinline__))
+set256bitsplus (int64_t *src)
+{
+  __builtin_memset (src, 'c', 7*sizeof(int64_t));
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */