aarch64: Add support for Armv8.8-a memory operations and memcpy expansion

This patch adds the +mops architecture extension flag from the 2021 Arm Architecture extensions, Armv8.8-a.
The +mops extension introduces instructions to accelerate the memcpy, memset and memmove standard functions.
This first patch uses those instructions in the inline memcpy expansion.
Further patches in the series will use similar instructions to inline memmove and memset.

A new param, aarch64-mops-memcpy-size-threshold, is introduced to control the size threshold above which to
emit the new sequence. Its default setting is 256 bytes, which is the same as the current threshold above
which we'd emit a libcall.
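
As an illustration (hypothetical function name; the compile flags are the ones used by the new test, -O2 -march=armv8.6-a+mops), a variable-length copy such as the one below is expected to expand inline into the cpyfp/cpyfm/cpyfe sequence instead of ending in a call to memcpy:

  #include <stddef.h>

  /* Expected to expand to the three MOPS instructions (cpyfp, cpyfm, cpyfe)
     operating on the destination, source and size registers, rather than
     emitting a call to memcpy.  */
  void
  copy_var (char *dst, const char *src, size_t size)
  {
    __builtin_memcpy (dst, src, size);
  }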

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/ChangeLog:

	* config/aarch64/aarch64-option-extensions.def (mops): Define.
	* config/aarch64/aarch64.c (aarch64_expand_cpymem_mops): Define.
	(aarch64_expand_cpymem): Adjust to use the MOPS expansion when available.
	* config/aarch64/aarch64.h (AARCH64_FL_MOPS): Define.
	(AARCH64_ISA_MOPS): Define.
	(TARGET_MOPS): Define.
	(MOVE_RATIO): Adjust for TARGET_MOPS.
	* config/aarch64/aarch64.md ("unspec"): Add UNSPEC_CPYMEM.
	(aarch64_cpymemdi): New pattern.
	(cpymemdi): Adjust for TARGET_MOPS.
	* config/aarch64/aarch64.opt (aarch64-mops-memcpy-size-threshold):
	New param.
	* doc/invoke.texi (AArch64 Options): Document +mops.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/mops_1.c: New test.
commit 0caf592d6a (parent 9eb8785b3f)
Author: Kyrylo Tkachov
Date:   2021-12-13 14:11:59 +00:00
7 changed files with 149 additions and 17 deletions

gcc/config/aarch64/aarch64-option-extensions.def

@@ -235,4 +235,7 @@ AARCH64_OPT_EXTENSION("pauth", AARCH64_FL_PAUTH, 0, 0, false, "paca pacg")
/* Enabling/Disabling "ls64" only changes "ls64". */
AARCH64_OPT_EXTENSION("ls64", AARCH64_FL_LS64, 0, 0, false, "")
/* Enabling/disabling "mops" only changes "mops". */
AARCH64_OPT_EXTENSION("mops", AARCH64_FL_MOPS, 0, 0, false, "")
#undef AARCH64_OPT_EXTENSION

gcc/config/aarch64/aarch64.c

@@ -23568,6 +23568,28 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
*dst = aarch64_progress_pointer (*dst);
}
/* Expand a cpymem using the MOPS extension. OPERANDS are taken
from the cpymem pattern. Return true iff we succeeded. */
static bool
aarch64_expand_cpymem_mops (rtx *operands)
{
if (!TARGET_MOPS)
return false;
rtx addr_dst = XEXP (operands[0], 0);
rtx addr_src = XEXP (operands[1], 0);
rtx sz_reg = operands[2];
if (!REG_P (sz_reg))
sz_reg = force_reg (DImode, sz_reg);
if (!REG_P (addr_dst))
addr_dst = force_reg (DImode, addr_dst);
if (!REG_P (addr_src))
addr_src = force_reg (DImode, addr_src);
emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg));
return true;
}
/* Expand cpymem, as if from a __builtin_memcpy. Return true if
we succeed, otherwise return false, indicating that a libcall to
memcpy should be emitted. */
@@ -23581,19 +23603,25 @@ aarch64_expand_cpymem (rtx *operands)
rtx base;
machine_mode cur_mode = BLKmode;
/* Only expand fixed-size copies. */
/* Variable-sized memcpy can go through the MOPS expansion if available. */
if (!CONST_INT_P (operands[2]))
return false;
return aarch64_expand_cpymem_mops (operands);
unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
/* Try to inline up to 256 bytes. */
unsigned HOST_WIDE_INT max_copy_size = 256;
/* Try to inline up to 256 bytes or use the MOPS threshold if available. */
unsigned HOST_WIDE_INT max_copy_size
= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
bool size_p = optimize_function_for_size_p (cfun);
/* Large constant-sized cpymem should go through MOPS when possible.
It should be a win even for size optimization in the general case.
For speed optimization the choice between MOPS and the SIMD sequence
depends on the size of the copy, rather than number of instructions,
alignment etc. */
if (size > max_copy_size)
return false;
return aarch64_expand_cpymem_mops (operands);
int copy_bits = 256;
@@ -23643,9 +23671,9 @@ aarch64_expand_cpymem (rtx *operands)
nops += 2;
n -= mode_bits;
/* Emit trailing copies using overlapping unaligned accesses - this is
smaller and faster. */
if (n > 0 && n < copy_bits / 2)
/* Emit trailing copies using overlapping unaligned accesses
(when !STRICT_ALIGNMENT) - this is smaller and faster. */
if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
{
machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
@@ -23657,9 +23685,25 @@ aarch64_expand_cpymem (rtx *operands)
}
rtx_insn *seq = get_insns ();
end_sequence ();
/* MOPS sequence requires 3 instructions for the memory copying + 1 to move
the constant size into a register. */
unsigned mops_cost = 3 + 1;
/* If MOPS is available at this point we don't consider the libcall as it's
not a win even on code size. At this point only consider MOPS if
optimizing for size. For speed optimizations we will have chosen between
the two based on copy size already. */
if (TARGET_MOPS)
{
if (size_p && mops_cost < nops)
return aarch64_expand_cpymem_mops (operands);
emit_insn (seq);
return true;
}
/* A memcpy libcall in the worst case takes 3 instructions to prepare the
arguments + 1 for the call. */
arguments + 1 for the call. When MOPS is not available and we're
optimizing for size a libcall may be preferable. */
unsigned libcall_cost = 4;
if (size_p && libcall_cost < nops)
return false;
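
For reference, a standalone sketch (illustrative only, not GCC code; the helper name and parameters are made up) of the strategy selection that aarch64_expand_cpymem now performs, assuming the thresholds and instruction costs described in the comments above:

  #include <stdbool.h>
  #include <stddef.h>

  enum cpy_strategy { CPY_MOPS, CPY_INLINE, CPY_LIBCALL };

  /* Illustrative model: size_known is false for variable-length copies,
     threshold mirrors aarch64-mops-memcpy-size-threshold (default 256)
     and inline_nops is the instruction count of the inline expansion.  */
  enum cpy_strategy
  choose_copy_strategy (bool have_mops, bool size_known, size_t size,
                        size_t threshold, bool optimize_size,
                        unsigned inline_nops)
  {
    if (!size_known)
      return have_mops ? CPY_MOPS : CPY_LIBCALL;

    size_t max_inline = have_mops ? threshold : 256;
    if (size > max_inline)
      return have_mops ? CPY_MOPS : CPY_LIBCALL;

    /* The inline expansion is acceptable; when optimizing for size compare
       it against the 4-instruction MOPS sequence or the 4-instruction
       libcall, as in the code above.  */
    const unsigned mops_cost = 3 + 1, libcall_cost = 4;
    if (optimize_size && have_mops && mops_cost < inline_nops)
      return CPY_MOPS;
    if (optimize_size && !have_mops && libcall_cost < inline_nops)
      return CPY_LIBCALL;
    return CPY_INLINE;
  }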

gcc/config/aarch64/aarch64.h

@@ -231,14 +231,17 @@ extern unsigned aarch64_architecture_version;
/* Pointer Authentication (PAUTH) extension. */
#define AARCH64_FL_PAUTH (1ULL << 40)
/* Armv9.0-A. */
#define AARCH64_FL_V9 (1ULL << 41) /* Armv9.0-A Architecture. */
/* 64-byte atomic load/store extensions. */
#define AARCH64_FL_LS64 (1ULL << 41)
#define AARCH64_FL_LS64 (1ULL << 42)
/* Armv8.7-a architecture extensions. */
#define AARCH64_FL_V8_7 (1ULL << 42)
#define AARCH64_FL_V8_7 (1ULL << 43)
/* Armv9.0-A. */
#define AARCH64_FL_V9 (1ULL << 43) /* Armv9.0-A Architecture. */
/* Hardware memory operation instructions. */
#define AARCH64_FL_MOPS (1ULL << 44)
/* Has FP and SIMD. */
#define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -310,6 +313,7 @@ extern unsigned aarch64_architecture_version;
#define AARCH64_ISA_V8_R (aarch64_isa_flags & AARCH64_FL_V8_R)
#define AARCH64_ISA_PAUTH (aarch64_isa_flags & AARCH64_FL_PAUTH)
#define AARCH64_ISA_V9 (aarch64_isa_flags & AARCH64_FL_V9)
#define AARCH64_ISA_MOPS (aarch64_isa_flags & AARCH64_FL_MOPS)
/* Crypto is an optional extension to AdvSIMD. */
#define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -401,6 +405,9 @@ extern unsigned aarch64_architecture_version;
/* PAUTH instructions are enabled through +pauth. */
#define TARGET_PAUTH (AARCH64_ISA_PAUTH)
/* MOPS instructions are enabled through +mops. */
#define TARGET_MOPS (AARCH64_ISA_MOPS)
/* Make sure this is always defined so we don't have to check for ifdefs
but rather use normal ifs. */
#ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
@@ -1046,9 +1053,10 @@ typedef struct
7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient
for both size and speed of copy, so we will instead use the "cpymem"
standard name to implement the copy. This logic does not apply when
targeting -mstrict-align, so keep a sensible default in that case. */
targeting -mstrict-align or TARGET_MOPS, so keep a sensible default in
that case. */
#define MOVE_RATIO(speed) \
(!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
((!STRICT_ALIGNMENT || TARGET_MOPS) ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
/* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
we would use more than 3 scalar instructions.
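
A rough illustration of why MOVE_RATIO stays low here (hypothetical type and size; actual code generation also depends on alignment and tuning): with MOVE_RATIO at 2, a block copy such as the struct assignment below is not split into individual scalar moves by the by-pieces machinery but is instead routed to the "cpymem" standard name, where the MOPS/inline/libcall choice above applies:

  /* Hypothetical example: with MOVE_RATIO == 2 this aggregate copy goes
     through the cpymem expander rather than being broken up into
     piecewise loads and stores.  */
  struct blob { char bytes[64]; };

  void
  copy_blob (struct blob *dst, const struct blob *src)
  {
    *dst = *src;
  }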

gcc/config/aarch64/aarch64.md

@@ -143,6 +143,7 @@
UNSPEC_AUTIBSP
UNSPEC_CALLEE_ABI
UNSPEC_CASESI
UNSPEC_CPYMEM
UNSPEC_CRC32B
UNSPEC_CRC32CB
UNSPEC_CRC32CH
@@ -1572,6 +1573,18 @@
}
)
(define_insn "aarch64_cpymemdi"
[(parallel [
(set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
(clobber (match_operand:DI 0 "register_operand" "+&r"))
(clobber (match_operand:DI 1 "register_operand" "+&r"))
(set (mem:BLK (match_dup 0))
(unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))])]
"TARGET_MOPS"
"cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, [%x1]!, %x2!"
[(set_attr "length" "12")]
)
;; 0 is dst
;; 1 is src
;; 2 is size of copy in bytes
@@ -1580,9 +1593,9 @@
(define_expand "cpymemdi"
[(match_operand:BLK 0 "memory_operand")
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "immediate_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
"!STRICT_ALIGNMENT"
"!STRICT_ALIGNMENT || TARGET_MOPS"
{
if (aarch64_expand_cpymem (operands))
DONE;

gcc/config/aarch64/aarch64.opt

@@ -280,3 +280,7 @@ Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4
-param=aarch64-loop-vect-issue-rate-niters=
Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param
-param=aarch64-mops-memcpy-size-threshold=
Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
Constant memcpy size in bytes above which to start using MOPS sequence.

gcc/doc/invoke.texi

@@ -19144,6 +19144,9 @@ prior to Armv8.2-A is not supported.
@item ls64
Enable the 64-byte atomic load and store instructions for accelerators.
This option is enabled by default for @option{-march=armv8.7-a}.
@item mops
Enable the instructions to accelerate memory operations like @code{memcpy},
@code{memmove}, @code{memset}.
@item flagm
Enable the Flag Manipulation instructions Extension.
@item pauth

gcc/testsuite/gcc.target/aarch64/mops_1.c

@@ -0,0 +1,57 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memcpy-size-threshold=0" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <stdlib.h>
/* We want to inline variable-sized memcpy.
** do_it_cpy:
** cpyfp \[x1\]\!, \[x0\]\!, x2\!
** cpyfm \[x1\]\!, \[x0\]\!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy (char * in, char * out, size_t size)
{
__builtin_memcpy (out, in, size);
}
/*
** do_it_cpy_large:
** mov x2, 1024
** cpyfp \[x1\]\!, \[x0\]!, x2\!
** cpyfm \[x1\]\!, \[x0\]!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy_large (char * in, char * out)
{
__builtin_memcpy (out, in, 1024);
}
/*
** do_it_cpy_127:
** mov x2, 127
** cpyfp \[x1\]\!, \[x0\]!, x2\!
** cpyfm \[x1\]\!, \[x0\]!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy_127 (char * in, char * out)
{
__builtin_memcpy (out, in, 127);
}
/*
** do_it_cpy_128:
** mov x2, 128
** cpyfp \[x1\]\!, \[x0\]!, x2\!
** cpyfm \[x1\]\!, \[x0\]!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy_128 (char * in, char * out)
{
__builtin_memcpy (out, in, 128);
}