aarch64: Add backend support for expanding __builtin_memset
This patch implements aarch64 backend expansion for __builtin_memset. Most of the implementation is based on the expansion of __builtin_memcpy. We change the values of SET_RATIO and MOVE_RATIO for cases where we do not have to strictly align and where we can benefit from NEON instructions in the backend. gcc/ChangeLog: * config/aarch64/aarch64-protos.h (aarch64_expand_setmem): New declaration. * config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case for E_V16QImode. (aarch64_set_one_block_and_progress_pointer): New helper for aarch64_expand_setmem. (aarch64_expand_setmem): Define the expansion for memset. * config/aarch64/aarch64.h (CLEAR_RATIO): Tweak to favor aarch64_expand_setmem when allowed and profitable. (SET_RATIO): Likewise. * config/aarch64/aarch64.md: Define pattern for setmemdi. gcc/testsuite/ChangeLog: * g++.dg/tree-ssa/pr90883.C: Remove xfail for aarch64. * gcc.dg/tree-prof/stringop-2.c: Add xfail for aarch64. * gcc.target/aarch64/memset-corner-cases.c: New test. * gcc.target/aarch64/memset-q-reg.c: New test.
This commit is contained in:
parent
5e28fca09c
commit
54bbde550e
@ -510,6 +510,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
|
||||
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
|
||||
void aarch64_expand_call (rtx, rtx, rtx, bool);
|
||||
bool aarch64_expand_cpymem (rtx *);
|
||||
bool aarch64_expand_setmem (rtx *);
|
||||
bool aarch64_float_const_zero_rtx_p (rtx);
|
||||
bool aarch64_float_const_rtx_p (rtx);
|
||||
bool aarch64_function_arg_regno_p (unsigned);
|
||||
|
@ -7030,6 +7030,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
|
||||
case E_V4SImode:
|
||||
return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
|
||||
|
||||
case E_V16QImode:
|
||||
return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
|
||||
|
||||
default:
|
||||
gcc_unreachable ();
|
||||
}
|
||||
@ -21276,6 +21279,135 @@ aarch64_expand_cpymem (rtx *operands)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
|
||||
SRC is a register we have created with the duplicated value to be set.
|
||||
Store one block of MODE bits from SRC at *DST, then move *DST forward.
|
||||
A 256-bit MODE is emitted as a store pair of SRC and a 128-bit MODE as a
|
||||
single store of SRC, both in GET_MODE (SRC); any other MODE stores the
|
||||
lowpart of SRC in MODE. */
|
||||
static void
|
||||
aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
|
||||
machine_mode mode)
|
||||
{
|
||||
/* If we are setting 128 bits or 256 bits, we can do that straight from
|
||||
the SIMD register we prepared. */
|
||||
if (known_eq (GET_MODE_BITSIZE (mode), 256))
|
||||
{
|
||||
mode = GET_MODE (src);
|
||||
/* "Cast" the *dst to the correct mode, i.e. the mode of SRC. */
|
||||
*dst = adjust_address (*dst, mode, 0);
|
||||
/* Emit the memset: store SRC at *dst and again at
|
||||
aarch64_progress_pointer (*dst). */
|
||||
emit_insn (aarch64_gen_store_pair (mode, *dst, src,
|
||||
aarch64_progress_pointer (*dst), src));
|
||||

||||
/* Move the pointer forward by 32 bytes. */
|
||||
*dst = aarch64_move_pointer (*dst, 32);
|
||||
return;
|
||||
}
|
||||
if (known_eq (GET_MODE_BITSIZE (mode), 128))
|
||||
{
|
||||
/* "Cast" the *dst to the correct mode, i.e. the mode of SRC. */
|
||||
*dst = adjust_address (*dst, GET_MODE (src), 0);
|
||||
/* Emit the memset as a single store of SRC. */
|
||||
emit_move_insn (*dst, src);
|
||||
/* Move the pointer forward by 16 bytes. */
|
||||
*dst = aarch64_move_pointer (*dst, 16);
|
||||
return;
|
||||
}
|
||||
/* For setting less, we have to extract the right amount from src. */
|
||||
rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
|
||||

||||
/* "Cast" the *dst to the correct mode. */
|
||||
*dst = adjust_address (*dst, mode, 0);
|
||||
/* Emit the memset. */
|
||||
emit_move_insn (*dst, reg);
|
||||
/* Move the pointer forward. */
|
||||
*dst = aarch64_progress_pointer (*dst);
|
||||
}
|
||||
|
||||
/* Expand setmem, as if from a __builtin_memset. Return true if
|
||||
we succeed, otherwise return false.
|
||||
OPERANDS[0] is the destination MEM, OPERANDS[1] the length in bytes and
|
||||
OPERANDS[2] the value to set. We fail when the length is not a CONST_INT
|
||||
or is larger than max_set_size, computed below. */
|
||||

||||
bool
|
||||
aarch64_expand_setmem (rtx *operands)
|
||||
{
|
||||
int n, mode_bits;
|
||||
unsigned HOST_WIDE_INT len;
|
||||
rtx dst = operands[0];
|
||||
rtx val = operands[2], src;
|
||||
rtx base;
|
||||
machine_mode cur_mode = BLKmode, next_mode;
|
||||

||||
/* We can't do anything smart if the amount to set is not constant. */
|
||||
if (!CONST_INT_P (operands[1]))
|
||||
return false;
|
||||

||||
bool speed_p = !optimize_function_for_size_p (cfun);
|
||||

||||
/* Default the maximum to 256-bytes. */
|
||||
unsigned max_set_size = 256;
|
||||

||||
/* In case we are optimizing for size or if the core does not
|
||||
want to use STP Q regs, lower the max_set_size (it is halved). */
|
||||
max_set_size = (!speed_p
|
||||
|| (aarch64_tune_params.extra_tuning_flags
|
||||
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
|
||||
? max_set_size / 2 : max_set_size;
|
||||

||||
len = INTVAL (operands[1]);
|
||||

||||
/* Upper bound check. */
|
||||
if (len > max_set_size)
|
||||
return false;
|
||||

||||
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
|
||||
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
|
||||

||||
/* Prepare the val using a DUP/MOVI v0.16B, val. */
|
||||
src = expand_vector_broadcast (V16QImode, val);
|
||||
src = force_reg (V16QImode, src);
|
||||

||||
/* Convert len to bits to make the rest of the code simpler. */
|
||||
n = len * BITS_PER_UNIT;
|
||||

||||
/* Maximum amount to set in one go. We use TImode-sized chunks when
|
||||
optimizing for speed and the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning
|
||||
flag is set, and 256-bit chunks otherwise. setmem expand
|
||||
pattern is only turned on for TARGET_SIMD. */
|
||||
const int copy_limit = (speed_p
|
||||
&& (aarch64_tune_params.extra_tuning_flags
|
||||
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
|
||||
? GET_MODE_BITSIZE (TImode) : 256;
|
||||

||||
while (n > 0)
|
||||
{
|
||||
/* Find the largest integer mode, no wider than the remaining N bits
|
||||
or copy_limit, in which to do the set without over writing. */
|
||||
opt_scalar_int_mode mode_iter;
|
||||
FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
|
||||
if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
|
||||
cur_mode = mode_iter.require ();
|
||||

||||
gcc_assert (cur_mode != BLKmode);
|
||||

||||
mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
|
||||
aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
|
||||

||||
n -= mode_bits;
|
||||

||||
/* Do certain trailing sets as overlapping if it's going to be
|
||||
cheaper, i.e. fewer instructions to do so. For instance doing a 15
|
||||
byte set it's more efficient to do two overlapping 8 byte sets than
|
||||
8 + 4 + 2 + 1. To do this we round the remainder up to the smallest
|
||||
integer mode that holds it and move DST by the (n - n_bits) difference,
|
||||
so that the next store covers the remaining N bits. */
|
||||
if (n > 0 && n < copy_limit / 2)
|
||||
{
|
||||
next_mode = smallest_mode_for_size (n, MODE_INT);
|
||||
int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
|
||||
gcc_assert (n_bits <= mode_bits);
|
||||
dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
|
||||
n = n_bits;
|
||||
}
|
||||
}
|
||||

||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
|
||||
SImode stores. Handle the case when the constant has identical
|
||||
bottom and top halves. This is beneficial when the two stores can be
|
||||
|
@ -1024,16 +1024,19 @@ typedef struct
|
||||
#define MOVE_RATIO(speed) \
|
||||
(!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
|
||||
|
||||
/* For CLEAR_RATIO, when optimizing for size, give a better estimate
|
||||
of the length of a memset call, but use the default otherwise. */
|
||||
/* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
|
||||
we would use more than 3 scalar instructions.
|
||||
Otherwise follow a sensible default: when optimizing for size, give a better
|
||||
estimate of the length of a memset call, but use the default otherwise. */
|
||||
#define CLEAR_RATIO(speed) \
|
||||
((speed) ? 15 : AARCH64_CALL_RATIO)
|
||||
(!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
|
||||
|
||||
/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant, so when
|
||||
optimizing for size adjust the ratio to account for the overhead of loading
|
||||
the constant. */
|
||||
/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant. Without
|
||||
-mstrict-align, make decisions in "setmem". Otherwise follow a sensible
|
||||
default: when optimizing for size adjust the ratio to account for the
|
||||
overhead of loading the constant. */
|
||||
#define SET_RATIO(speed) \
|
||||
((speed) ? 15 : AARCH64_CALL_RATIO - 2)
|
||||
(!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
|
||||
|
||||
/* Disable auto-increment in move_by_pieces et al. Use of auto-increment is
|
||||
rarely a good idea in straight-line code since it adds an extra address
|
||||
|
@ -1571,6 +1571,24 @@
|
||||
}
|
||||
)
|
||||
|
||||
;; Operands of the setmem expander below: 0 is dst
|
||||
;; 1 is size of the set in bytes
|
||||
;; 2 is val
|
||||
;; 3 is alignment
|
||||

||||
(define_expand "setmemdi"
|
||||
[(set (match_operand:BLK 0 "memory_operand") ;; Dest
|
||||
(match_operand:QI 2 "nonmemory_operand")) ;; Value
|
||||
(use (match_operand:DI 1 "immediate_operand")) ;; Length
|
||||
(match_operand 3 "immediate_operand")] ;; Align
|
||||
"TARGET_SIMD"
|
||||
{
|
||||
if (aarch64_expand_setmem (operands))
|
||||
DONE;
|
||||

||||
FAIL;
|
||||
})
|
||||
|
||||
;; Operands 1 and 3 are tied together by the final condition; so we allow
|
||||
;; fairly lax checking on the second memory operation.
|
||||
(define_insn "load_pair_sw_<SX:mode><SX2:mode>"
|
||||
|
@ -15,6 +15,6 @@
|
||||
|
||||
// We want to match enough here to capture that we deleted an empty
|
||||
// constructor store
|
||||
// aarch64 and mips will expand to loop to clear because CLEAR_RATIO.
|
||||
// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { aarch64-*-* mips*-*-* } } } }
|
||||
// mips will expand to loop to clear because CLEAR_RATIO.
|
||||
// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { mips*-*-* } } } }
|
||||
|
||||
|
@ -20,6 +20,6 @@ main()
|
||||
return 0;
|
||||
}
|
||||
/* autofdo doesn't support value profiling for now: */
|
||||
/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile"} } */
|
||||
/* { dg-final-use-not-autofdo { scan-ipa-dump "Transformation done: single value 4 stringop" "profile" { target { ! aarch64*-*-* } } } } */
|
||||
/* The versioned memset of size 4 should be optimized to an assignment.
|
||||
{ dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" } } */
|
||||
{ dg-final-use-not-autofdo { scan-tree-dump "MEM <\[a-z \]+> \\\[\\(void .\\)&a\\\] = 168430090" "optimized" { target { ! aarch64*-*-* } } } } */
|
||||
|
88
gcc/testsuite/gcc.target/aarch64/memset-corner-cases.c
Normal file
88
gcc/testsuite/gcc.target/aarch64/memset-corner-cases.c
Normal file
@ -0,0 +1,88 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
/* { dg-require-effective-target lp64 } */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* One byte variable set should be scalar
|
||||
**set1byte:
|
||||
** strb w1, \[x0\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set1byte (int64_t *src, char c)
|
||||
{
|
||||
__builtin_memset (src, c, 1);
|
||||
}
|
||||
|
||||
/* Special cases for setting 0. */
|
||||
/* 1-byte should be STRB with wzr
|
||||
**set0byte:
|
||||
** strb wzr, \[x0\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set0byte (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 0, 1);
|
||||
}
|
||||
|
||||
/* 35 bytes would become 4 scalar instructions, so favour NEON.
|
||||
**set0neon:
|
||||
** movi v0.4s, 0
|
||||
** stp q0, q0, \[x0\]
|
||||
** str wzr, \[x0, 31\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set0neon (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 0, 35);
|
||||
}
|
||||
|
||||
/* 36 bytes should be scalar, however.
|
||||
**set0scalar:
|
||||
** stp xzr, xzr, \[x0\]
|
||||
** stp xzr, xzr, \[x0, 16\]
|
||||
** str wzr, \[x0, 32\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set0scalar (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 0, 36);
|
||||
}
|
||||
|
||||
|
||||
/* 256-bytes expanded
|
||||
**set256byte:
|
||||
** dup v0.16b, w1
|
||||
** stp q0, q0, \[x0\]
|
||||
** stp q0, q0, \[x0, 32\]
|
||||
** stp q0, q0, \[x0, 64\]
|
||||
** stp q0, q0, \[x0, 96\]
|
||||
** stp q0, q0, \[x0, 128\]
|
||||
** stp q0, q0, \[x0, 160\]
|
||||
** stp q0, q0, \[x0, 192\]
|
||||
** stp q0, q0, \[x0, 224\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set256byte (int64_t *src, char c)
|
||||
{
|
||||
__builtin_memset (src, c, 256);
|
||||
}
|
||||
|
||||
/* More than 256 bytes goes to memset
|
||||
**set257byte:
|
||||
** mov x2, 257
|
||||
** mov w1, 99
|
||||
** b memset
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set257byte (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 'c', 257);
|
||||
}
|
||||
|
||||
/* { dg-final { check-function-bodies "**" "" "" } } */
|
81
gcc/testsuite/gcc.target/aarch64/memset-q-reg.c
Normal file
81
gcc/testsuite/gcc.target/aarch64/memset-q-reg.c
Normal file
@ -0,0 +1,81 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2" } */
|
||||
/* { dg-require-effective-target lp64 } */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/*
|
||||
**set128bits:
|
||||
** dup v0.16b, w1
|
||||
** str q0, \[x0\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set128bits (int64_t *src, char c)
|
||||
{
|
||||
__builtin_memset (src, c, 2*sizeof(int64_t));
|
||||
}
|
||||
|
||||
/*
|
||||
**set128bitszero:
|
||||
** stp xzr, xzr, \[x0\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set128bitszero (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 0, 2*sizeof(int64_t));
|
||||
}
|
||||
|
||||
/*
|
||||
** set128bitsplus:
|
||||
** dup v0.16b, w1
|
||||
** str q0, \[x0\]
|
||||
** str q0, \[x0, 12\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set128bitsplus (int64_t *src, char c)
|
||||
{
|
||||
__builtin_memset (src, c, 7*sizeof(int32_t));
|
||||
}
|
||||
|
||||
/*
|
||||
** set256bits:
|
||||
** movi v0.16b, 0x63
|
||||
** stp q0, q0, \[x0\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set256bits (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 'c', 4*sizeof(int64_t));
|
||||
}
|
||||
|
||||
/*
|
||||
**set256bitszero:
|
||||
** stp xzr, xzr, \[x0\]
|
||||
** stp xzr, xzr, \[x0, 16\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set256bitszero (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 0, 4*sizeof(int64_t));
|
||||
}
|
||||
|
||||
/*
|
||||
** set256bitsplus:
|
||||
** movi v0.16b, 0x63
|
||||
** stp q0, q0, \[x0\]
|
||||
** str q0, \[x0, 32\]
|
||||
** str d0, \[x0, 48\]
|
||||
** ret
|
||||
*/
|
||||
void __attribute__((__noinline__))
|
||||
set256bitsplus (int64_t *src)
|
||||
{
|
||||
__builtin_memset (src, 'c', 7*sizeof(int64_t));
|
||||
}
|
||||
|
||||
/* { dg-final { check-function-bodies "**" "" "" } } */
|
Loading…
Reference in New Issue
Block a user