aarch64: Add support for Armv8.8-a memory operations and memcpy expansion

This patch adds the +mops architecture extension flag from the 2021 Arm Architecture extensions, Armv8.8-a.
The +mops extension introduces instructions to accelerate the memcpy, memset and memmove standard functions.
This first patch uses those instructions in the inline memcpy expansion.
Further patches in the series will use similar instructions to inline memmove and memset.

A new param, aarch64-mops-memcpy-size-threshold, is introduced to control the size threshold above which to
emit the new sequence. Its default setting is 256 bytes, which is the same as the current threshold above
which we'd emit a libcall.
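
As an illustration (hypothetical function name; the compile flags are the ones used by the new test, -O2 -march=armv8.6-a+mops), a variable-length copy such as the one below is expected to expand inline into the cpyfp/cpyfm/cpyfe sequence instead of ending in a call to memcpy:

  #include <stddef.h>

  /* Expected to expand to the three MOPS instructions (cpyfp, cpyfm, cpyfe)
     operating on the destination, source and size registers, rather than
     emitting a call to memcpy.  */
  void
  copy_var (char *dst, const char *src, size_t size)
  {
    __builtin_memcpy (dst, src, size);
  }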

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/ChangeLog:

	* config/aarch64/aarch64-option-extensions.def (mops): Define.
	* config/aarch64/aarch64.c (aarch64_expand_cpymem_mops): Define.
	(aarch64_expand_cpymem): Adjust to use the MOPS expansion when available.
	* config/aarch64/aarch64.h (AARCH64_FL_MOPS): Define.
	(AARCH64_ISA_MOPS): Define.
	(TARGET_MOPS): Define.
	(MOVE_RATIO): Adjust for TARGET_MOPS.
	* config/aarch64/aarch64.md ("unspec"): Add UNSPEC_CPYMEM.
	(aarch64_cpymemdi): New pattern.
	(cpymemdi): Adjust for TARGET_MOPS.
	* config/aarch64/aarch64.opt (aarch64-mops-memcpy-size-threshold):
	New param.
	* doc/invoke.texi (AArch64 Options): Document +mops.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/mops_1.c: New test.
commit 0caf592d6a (parent 9eb8785b3f)
Author: Kyrylo Tkachov
Date:   2021-12-13 14:11:59 +00:00
7 changed files with 149 additions and 17 deletions

gcc/config/aarch64/aarch64-option-extensions.def

@@ -235,4 +235,7 @@ AARCH64_OPT_EXTENSION("pauth", AARCH64_FL_PAUTH, 0, 0, false, "paca pacg")
/* Enabling/Disabling "ls64" only changes "ls64". */
AARCH64_OPT_EXTENSION("ls64", AARCH64_FL_LS64, 0, 0, false, "")
/* Enabling/disabling "mops" only changes "mops". */
AARCH64_OPT_EXTENSION("mops", AARCH64_FL_MOPS, 0, 0, false, "")
#undef AARCH64_OPT_EXTENSION

gcc/config/aarch64/aarch64.c

@@ -23568,6 +23568,28 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
*dst = aarch64_progress_pointer (*dst);
}
/* Expand a cpymem using the MOPS extension. OPERANDS are taken
from the cpymem pattern. Return true iff we succeeded. */
static bool
aarch64_expand_cpymem_mops (rtx *operands)
{
if (!TARGET_MOPS)
return false;
rtx addr_dst = XEXP (operands[0], 0);
rtx addr_src = XEXP (operands[1], 0);
rtx sz_reg = operands[2];
if (!REG_P (sz_reg))
sz_reg = force_reg (DImode, sz_reg);
if (!REG_P (addr_dst))
addr_dst = force_reg (DImode, addr_dst);
if (!REG_P (addr_src))
addr_src = force_reg (DImode, addr_src);
emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg));
return true;
}
/* Expand cpymem, as if from a __builtin_memcpy. Return true if
we succeed, otherwise return false, indicating that a libcall to
memcpy should be emitted. */
@@ -23581,19 +23603,25 @@ aarch64_expand_cpymem (rtx *operands)
rtx base;
machine_mode cur_mode = BLKmode;
/* Only expand fixed-size copies. */
/* Variable-sized memcpy can go through the MOPS expansion if available. */
if (!CONST_INT_P (operands[2]))
return false;
return aarch64_expand_cpymem_mops (operands);
unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
/* Try to inline up to 256 bytes. */
unsigned HOST_WIDE_INT max_copy_size = 256;
/* Try to inline up to 256 bytes or use the MOPS threshold if available. */
unsigned HOST_WIDE_INT max_copy_size
= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
bool size_p = optimize_function_for_size_p (cfun);
/* Large constant-sized cpymem should go through MOPS when possible.
It should be a win even for size optimization in the general case.
For speed optimization the choice between MOPS and the SIMD sequence
depends on the size of the copy, rather than number of instructions,
alignment etc. */
if (size > max_copy_size)
return false;
return aarch64_expand_cpymem_mops (operands);
int copy_bits = 256;
@@ -23643,9 +23671,9 @@ aarch64_expand_cpymem (rtx *operands)
nops += 2;
n -= mode_bits;
/* Emit trailing copies using overlapping unaligned accesses - this is
smaller and faster. */
if (n > 0 && n < copy_bits / 2)
/* Emit trailing copies using overlapping unaligned accesses
(when !STRICT_ALIGNMENT) - this is smaller and faster. */
if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
{
machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
@@ -23657,9 +23685,25 @@ aarch64_expand_cpymem (rtx *operands)
}
rtx_insn *seq = get_insns ();
end_sequence ();
/* MOPS sequence requires 3 instructions for the memory copying + 1 to move
the constant size into a register. */
unsigned mops_cost = 3 + 1;
/* If MOPS is available at this point we don't consider the libcall as it's
not a win even on code size. At this point only consider MOPS if
optimizing for size. For speed optimizations we will have chosen between
the two based on copy size already. */
if (TARGET_MOPS)
{
if (size_p && mops_cost < nops)
return aarch64_expand_cpymem_mops (operands);
emit_insn (seq);
return true;
}
/* A memcpy libcall in the worst case takes 3 instructions to prepare the
arguments + 1 for the call. */
arguments + 1 for the call. When MOPS is not available and we're
optimizing for size a libcall may be preferable. */
unsigned libcall_cost = 4;
if (size_p && libcall_cost < nops)
return false;
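
For reference, a standalone sketch (illustrative only, not GCC code; the helper name and parameters are made up) of the strategy selection that aarch64_expand_cpymem now performs, assuming the thresholds and instruction costs described in the comments above:

  #include <stdbool.h>
  #include <stddef.h>

  enum cpy_strategy { CPY_MOPS, CPY_INLINE, CPY_LIBCALL };

  /* Illustrative model: size_known is false for variable-length copies,
     threshold mirrors aarch64-mops-memcpy-size-threshold (default 256)
     and inline_nops is the instruction count of the inline expansion.  */
  enum cpy_strategy
  choose_copy_strategy (bool have_mops, bool size_known, size_t size,
                        size_t threshold, bool optimize_size,
                        unsigned inline_nops)
  {
    if (!size_known)
      return have_mops ? CPY_MOPS : CPY_LIBCALL;

    size_t max_inline = have_mops ? threshold : 256;
    if (size > max_inline)
      return have_mops ? CPY_MOPS : CPY_LIBCALL;

    /* The inline expansion is acceptable; when optimizing for size compare
       it against the 4-instruction MOPS sequence or the 4-instruction
       libcall, as in the code above.  */
    const unsigned mops_cost = 3 + 1, libcall_cost = 4;
    if (optimize_size && have_mops && mops_cost < inline_nops)
      return CPY_MOPS;
    if (optimize_size && !have_mops && libcall_cost < inline_nops)
      return CPY_LIBCALL;
    return CPY_INLINE;
  }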

gcc/config/aarch64/aarch64.h

@@ -231,14 +231,17 @@ extern unsigned aarch64_architecture_version;
/* Pointer Authentication (PAUTH) extension. */
#define AARCH64_FL_PAUTH (1ULL << 40)
/* Armv9.0-A. */
#define AARCH64_FL_V9 (1ULL << 41) /* Armv9.0-A Architecture. */
/* 64-byte atomic load/store extensions. */
#define AARCH64_FL_LS64 (1ULL << 41)
#define AARCH64_FL_LS64 (1ULL << 42)
/* Armv8.7-a architecture extensions. */
#define AARCH64_FL_V8_7 (1ULL << 42)
#define AARCH64_FL_V8_7 (1ULL << 43)
/* Armv9.0-A. */
#define AARCH64_FL_V9 (1ULL << 43) /* Armv9.0-A Architecture. */
/* Hardware memory operation instructions. */
#define AARCH64_FL_MOPS (1ULL << 44)
/* Has FP and SIMD. */
#define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -310,6 +313,7 @@ extern unsigned aarch64_architecture_version;
#define AARCH64_ISA_V8_R (aarch64_isa_flags & AARCH64_FL_V8_R)
#define AARCH64_ISA_PAUTH (aarch64_isa_flags & AARCH64_FL_PAUTH)
#define AARCH64_ISA_V9 (aarch64_isa_flags & AARCH64_FL_V9)
#define AARCH64_ISA_MOPS (aarch64_isa_flags & AARCH64_FL_MOPS)
/* Crypto is an optional extension to AdvSIMD. */
#define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -401,6 +405,9 @@ extern unsigned aarch64_architecture_version;
/* PAUTH instructions are enabled through +pauth. */
#define TARGET_PAUTH (AARCH64_ISA_PAUTH)
/* MOPS instructions are enabled through +mops. */
#define TARGET_MOPS (AARCH64_ISA_MOPS)
/* Make sure this is always defined so we don't have to check for ifdefs
but rather use normal ifs. */
#ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
@@ -1046,9 +1053,10 @@ typedef struct
7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient
for both size and speed of copy, so we will instead use the "cpymem"
standard name to implement the copy. This logic does not apply when
targeting -mstrict-align, so keep a sensible default in that case. */
targeting -mstrict-align or TARGET_MOPS, so keep a sensible default in
that case. */
#define MOVE_RATIO(speed) \
(!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
((!STRICT_ALIGNMENT || TARGET_MOPS) ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
/* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
we would use more than 3 scalar instructions.
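
A rough illustration of why MOVE_RATIO stays low here (hypothetical type and size; actual code generation also depends on alignment and tuning): with MOVE_RATIO at 2, a block copy such as the struct assignment below is not split into individual scalar moves by the by-pieces machinery but is instead routed to the "cpymem" standard name, where the MOPS/inline/libcall choice above applies:

  /* Hypothetical example: with MOVE_RATIO == 2 this aggregate copy goes
     through the cpymem expander rather than being broken up into
     piecewise loads and stores.  */
  struct blob { char bytes[64]; };

  void
  copy_blob (struct blob *dst, const struct blob *src)
  {
    *dst = *src;
  }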

gcc/config/aarch64/aarch64.md

@@ -143,6 +143,7 @@
UNSPEC_AUTIBSP
UNSPEC_CALLEE_ABI
UNSPEC_CASESI
UNSPEC_CPYMEM
UNSPEC_CRC32B
UNSPEC_CRC32CB
UNSPEC_CRC32CH
@@ -1572,6 +1573,18 @@
}
)
(define_insn "aarch64_cpymemdi"
[(parallel [
(set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
(clobber (match_operand:DI 0 "register_operand" "+&r"))
(clobber (match_operand:DI 1 "register_operand" "+&r"))
(set (mem:BLK (match_dup 0))
(unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))])]
"TARGET_MOPS"
"cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, [%x1]!, %x2!"
[(set_attr "length" "12")]
)
;; 0 is dst
;; 1 is src
;; 2 is size of copy in bytes
@@ -1580,9 +1593,9 @@
(define_expand "cpymemdi"
[(match_operand:BLK 0 "memory_operand")
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "immediate_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
"!STRICT_ALIGNMENT"
"!STRICT_ALIGNMENT || TARGET_MOPS"
{
if (aarch64_expand_cpymem (operands))
DONE;

gcc/config/aarch64/aarch64.opt

@@ -280,3 +280,7 @@ Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4
-param=aarch64-loop-vect-issue-rate-niters=
Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param
-param=aarch64-mops-memcpy-size-threshold=
Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
Constant memcpy size in bytes above which to start using MOPS sequence.

gcc/doc/invoke.texi

@@ -19144,6 +19144,9 @@ prior to Armv8.2-A is not supported.
@item ls64
Enable the 64-byte atomic load and store instructions for accelerators.
This option is enabled by default for @option{-march=armv8.7-a}.
@item mops
Enable the instructions to accelerate memory operations like @code{memcpy},
@code{memmove}, @code{memset}.
@item flagm
Enable the Flag Manipulation instructions Extension.
@item pauth

gcc/testsuite/gcc.target/aarch64/mops_1.c

@@ -0,0 +1,57 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memcpy-size-threshold=0" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <stdlib.h>
/* We want to inline variable-sized memcpy.
** do_it_cpy:
** cpyfp \[x1\]\!, \[x0\]\!, x2\!
** cpyfm \[x1\]\!, \[x0\]\!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy (char * in, char * out, size_t size)
{
__builtin_memcpy (out, in, size);
}
/*
** do_it_cpy_large:
** mov x2, 1024
** cpyfp \[x1\]\!, \[x0\]!, x2\!
** cpyfm \[x1\]\!, \[x0\]!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy_large (char * in, char * out)
{
__builtin_memcpy (out, in, 1024);
}
/*
** do_it_cpy_127:
** mov x2, 127
** cpyfp \[x1\]\!, \[x0\]!, x2\!
** cpyfm \[x1\]\!, \[x0\]!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy_127 (char * in, char * out)
{
__builtin_memcpy (out, in, 127);
}
/*
** do_it_cpy_128:
** mov x2, 128
** cpyfp \[x1\]\!, \[x0\]!, x2\!
** cpyfm \[x1\]\!, \[x0\]!, x2\!
** cpyfe \[x1\]\!, \[x0\]\!, x2\!
** ret
*/
void do_it_cpy_128 (char * in, char * out)
{
__builtin_memcpy (out, in, 128);
}