diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index b61f1df9019..3f449fba415 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -235,4 +235,7 @@ AARCH64_OPT_EXTENSION("pauth", AARCH64_FL_PAUTH, 0, 0, false, "paca pacg")
 /* Enabling/Disabling "ls64" only changes "ls64".  */
 AARCH64_OPT_EXTENSION("ls64", AARCH64_FL_LS64, 0, 0, false, "")
 
+/* Enabling/disabling "mops" only changes "mops".  */
+AARCH64_OPT_EXTENSION("mops", AARCH64_FL_MOPS, 0, 0, false, "")
+
 #undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index be24b7320d2..bd754e4e2c2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23568,6 +23568,28 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
   *dst = aarch64_progress_pointer (*dst);
 }
 
+/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
+   from the cpymem pattern.  Return true iff we succeeded.  */
+static bool
+aarch64_expand_cpymem_mops (rtx *operands)
+{
+  if (!TARGET_MOPS)
+    return false;
+  rtx addr_dst = XEXP (operands[0], 0);
+  rtx addr_src = XEXP (operands[1], 0);
+  rtx sz_reg = operands[2];
+
+  if (!REG_P (sz_reg))
+    sz_reg = force_reg (DImode, sz_reg);
+  if (!REG_P (addr_dst))
+    addr_dst = force_reg (DImode, addr_dst);
+  if (!REG_P (addr_src))
+    addr_src = force_reg (DImode, addr_src);
+  emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg));
+
+  return true;
+}
+
 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
    we succeed, otherwise return false, indicating that a libcall to
    memcpy should be emitted.  */
@@ -23581,19 +23603,25 @@ aarch64_expand_cpymem (rtx *operands)
   rtx base;
   machine_mode cur_mode = BLKmode;
 
-  /* Only expand fixed-size copies.  */
+  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
   if (!CONST_INT_P (operands[2]))
-    return false;
+    return aarch64_expand_cpymem_mops (operands);
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes.  */
-  unsigned HOST_WIDE_INT max_copy_size = 256;
+  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
+  unsigned HOST_WIDE_INT max_copy_size
+    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
 
   bool size_p = optimize_function_for_size_p (cfun);
 
+  /* Large constant-sized cpymem should go through MOPS when possible.
+     It should be a win even for size optimization in the general case.
+     For speed optimization the choice between MOPS and the SIMD sequence
+     depends on the size of the copy, rather than number of instructions,
+     alignment etc.  */
   if (size > max_copy_size)
-    return false;
+    return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
 
@@ -23643,9 +23671,9 @@ aarch64_expand_cpymem (rtx *operands)
       nops += 2;
       n -= mode_bits;
 
-      /* Emit trailing copies using overlapping unaligned accesses - this is
-         smaller and faster.  */
-      if (n > 0 && n < copy_bits / 2)
+      /* Emit trailing copies using overlapping unaligned accesses
+         (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
 	{
 	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
@@ -23657,9 +23685,25 @@ aarch64_expand_cpymem (rtx *operands)
     }
   rtx_insn *seq = get_insns ();
   end_sequence ();
+  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
+     the constant size into a register.  */
+  unsigned mops_cost = 3 + 1;
+
+  /* If MOPS is available at this point we don't consider the libcall as it's
+     not a win even on code size.  At this point only consider MOPS if
+     optimizing for size.  For speed optimizations we will have chosen between
+     the two based on copy size already.  */
+  if (TARGET_MOPS)
+    {
+      if (size_p && mops_cost < nops)
+	return aarch64_expand_cpymem_mops (operands);
+      emit_insn (seq);
+      return true;
+    }
 
   /* A memcpy libcall in the worst case takes 3 instructions to prepare the
-     arguments + 1 for the call.  */
+     arguments + 1 for the call.  When MOPS is not available and we're
+     optimizing for size a libcall may be preferable.  */
   unsigned libcall_cost = 4;
   if (size_p && libcall_cost < nops)
     return false;
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2792bb29adb..79d0bcd357f 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -231,14 +231,17 @@ extern unsigned aarch64_architecture_version;
 /* Pointer Authentication (PAUTH) extension.  */
 #define AARCH64_FL_PAUTH      (1ULL << 40)
 
+/* Armv9.0-A.  */
+#define AARCH64_FL_V9         (1ULL << 41)  /* Armv9.0-A Architecture.  */
+
 /* 64-byte atomic load/store extensions.  */
-#define AARCH64_FL_LS64       (1ULL << 41)
+#define AARCH64_FL_LS64       (1ULL << 42)
 
 /* Armv8.7-a architecture extensions.  */
-#define AARCH64_FL_V8_7       (1ULL << 42)
+#define AARCH64_FL_V8_7       (1ULL << 43)
 
-/* Armv9.0-A.  */
-#define AARCH64_FL_V9         (1ULL << 43)  /* Armv9.0-A Architecture.  */
+/* Hardware memory operation instructions.  */
+#define AARCH64_FL_MOPS       (1ULL << 44)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -310,6 +313,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_V8_R	   (aarch64_isa_flags & AARCH64_FL_V8_R)
 #define AARCH64_ISA_PAUTH	   (aarch64_isa_flags & AARCH64_FL_PAUTH)
 #define AARCH64_ISA_V9		   (aarch64_isa_flags & AARCH64_FL_V9)
+#define AARCH64_ISA_MOPS	   (aarch64_isa_flags & AARCH64_FL_MOPS)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -401,6 +405,9 @@ extern unsigned aarch64_architecture_version;
 /* PAUTH instructions are enabled through +pauth.  */
 #define TARGET_PAUTH (AARCH64_ISA_PAUTH)
 
+/* MOPS instructions are enabled through +mops.  */
+#define TARGET_MOPS (AARCH64_ISA_MOPS)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
@@ -1046,9 +1053,10 @@ typedef struct
    7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
    for both size and speed of copy, so we will instead use the "cpymem"
    standard name to implement the copy.  This logic does not apply when
-   targeting -mstrict-align, so keep a sensible default in that case.  */
+   targeting -mstrict-align or TARGET_MOPS, so keep a sensible default in
+   that case.  */
 #define MOVE_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
+  ((!STRICT_ALIGNMENT || TARGET_MOPS) ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
 
 /* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
    we would use more than 3 scalar instructions.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5297b2d3f95..d623c1b00bf 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -143,6 +143,7 @@
     UNSPEC_AUTIBSP
     UNSPEC_CALLEE_ABI
     UNSPEC_CASESI
+    UNSPEC_CPYMEM
     UNSPEC_CRC32B
     UNSPEC_CRC32CB
     UNSPEC_CRC32CH
@@ -1572,6 +1573,18 @@
 }
 )
 
+(define_insn "aarch64_cpymemdi"
+    [(parallel [
+	(set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
+	(clobber (match_operand:DI 0 "register_operand" "+&r"))
+	(clobber (match_operand:DI 1 "register_operand" "+&r"))
+	(set (mem:BLK (match_dup 0))
+	     (unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))])]
+   "TARGET_MOPS"
+   "cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, [%x1]!, %x2!"
+   [(set_attr "length" "12")]
+)
+
 ;; 0 is dst
 ;; 1 is src
 ;; 2 is size of copy in bytes
@@ -1580,9 +1593,9 @@
 (define_expand "cpymemdi"
   [(match_operand:BLK 0 "memory_operand")
    (match_operand:BLK 1 "memory_operand")
-   (match_operand:DI 2 "immediate_operand")
+   (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT"
+   "!STRICT_ALIGNMENT || TARGET_MOPS"
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 32191cf1acf..7445ed106cc 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -280,3 +280,7 @@ Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4
 
 -param=aarch64-loop-vect-issue-rate-niters=
 Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param
+
+-param=aarch64-mops-memcpy-size-threshold=
+Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
+Constant memcpy size in bytes above which to start using MOPS sequence.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9b4371b9213..2424a5bf3e0 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -19144,6 +19144,9 @@ prior to Armv8.2-A is not supported.
 @item ls64
 Enable the 64-byte atomic load and store instructions for accelerators.
 This option is enabled by default for @option{-march=armv8.7-a}.
+@item mops
+Enable the instructions to accelerate memory operations like @code{memcpy},
+@code{memmove}, @code{memset}.
 @item flagm
 Enable the Flag Manipulation instructions Extension.
 @item pauth
diff --git a/gcc/testsuite/gcc.target/aarch64/mops_1.c b/gcc/testsuite/gcc.target/aarch64/mops_1.c
new file mode 100644
index 00000000000..661c14192e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mops_1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memcpy-size-threshold=0" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdlib.h>
+
+/* We want to inline variable-sized memcpy.
+** do_it_cpy:
+**	cpyfp	\[x1\]\!, \[x0\]\!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]\!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy (char * in, char * out, size_t size)
+{
+  __builtin_memcpy (out, in, size);
+}
+
+/*
+** do_it_cpy_large:
+**	mov	x2, 1024
+**	cpyfp	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy_large (char * in, char * out)
+{
+  __builtin_memcpy (out, in, 1024);
+}
+
+/*
+** do_it_cpy_127:
+**	mov	x2, 127
+**	cpyfp	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy_127 (char * in, char * out)
+{
+  __builtin_memcpy (out, in, 127);
+}
+
+/*
+** do_it_cpy_128:
+**	mov	x2, 128
+**	cpyfp	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy_128 (char * in, char * out)
+{
+  __builtin_memcpy (out, in, 128);
+}
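
Not part of the patch: a minimal usage sketch of the heuristic the patch introduces, assuming -O2 -march=armv8.6-a+mops and the default --param=aarch64-mops-memcpy-size-threshold=256 (unlike the test above, which sets the threshold to 0). Function names and sizes below are illustrative only; constant copies above the threshold are expected to expand to the cpyfp/cpyfm/cpyfe sequence, while smaller constant copies keep the existing inline expansion.

/* Illustrative sketch only, not part of the patch.  Compile with
   -O2 -march=armv8.6-a+mops and the default
   --param=aarch64-mops-memcpy-size-threshold=256.  */
#include <string.h>

/* 128 bytes is below the default threshold, so the existing inline
   load/store-pair expansion is expected here.  */
void copy_small (char *dst, const char *src)
{
  memcpy (dst, src, 128);
}

/* 1024 bytes exceeds the default threshold, so this is expected to
   expand to the MOPS sequence (cpyfp/cpyfm/cpyfe).  */
void copy_big (char *dst, const char *src)
{
  memcpy (dst, src, 1024);
}

/* A runtime-sized copy also goes through the MOPS expansion when
   +mops is enabled, as in the do_it_cpy test above.  */
void copy_any (char *dst, const char *src, size_t n)
{
  memcpy (dst, src, n);
}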