[AArch64] Implement movmem for the benefit of inline memcpy

gcc/

	* config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
	* config/aarch64/aarch64.c (aarch64_move_pointer): New.
	(aarch64_progress_pointer): Likewise.
	(aarch64_copy_one_block_and_progress_pointers): Likewise.
	(aarch64_expand_movmem): Likewise.
	* config/aarch64/aarch64.h (MOVE_RATIO): Set low.
	* config/aarch64/aarch64.md (movmemdi): New.

gcc/testsuite/

	* gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
	* gcc.dg/tree-ssa/sra-12.c: Likewise.

From-SVN: r211314
James Greenhalgh 2014-06-06 13:16:40 +00:00 committed by James Greenhalgh
parent 4ed6899575
commit e2c75eea66
8 changed files with 202 additions and 8 deletions
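As an illustration of the intended effect (an editor's sketch, not part of the committed patch; the function name is hypothetical), a small fixed-size copy such as the one below is now expanded inline on aarch64 at -O2, provided -mstrict-align is not in effect, instead of being emitted as a call to memcpy:

#include <string.h>

/* A 15-byte copy: the new expander emits an 8-byte load/store pair for
   bytes 0-7, then an overlapping 8-byte pair for bytes 7-14, so no call
   to memcpy remains.  */
void
copy15 (char *dst, const char *src)
{
  memcpy (dst, src, 15);
}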

gcc/ChangeLog

@@ -1,3 +1,13 @@
2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
	* config/aarch64/aarch64.c (aarch64_move_pointer): New.
	(aarch64_progress_pointer): Likewise.
	(aarch64_copy_one_block_and_progress_pointers): Likewise.
	(aarch64_expand_movmem): Likewise.
	* config/aarch64/aarch64.h (MOVE_RATIO): Set low.
	* config/aarch64/aarch64.md (movmemdi): New.

2014-06-06  Bingfeng Mei  <bmei@broadcom.com>

	* targhooks.c (default_add_stmt_cost): Call target specific

gcc/config/aarch64/aarch64-protos.h

@@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
bool aarch64_constant_address_p (rtx);
bool aarch64_expand_movmem (rtx *);
bool aarch64_float_const_zero_rtx_p (rtx);
bool aarch64_function_arg_regno_p (unsigned);
bool aarch64_gen_movmemqi (rtx *);

gcc/config/aarch64/aarch64.c

@@ -9409,6 +9409,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
  return false;
}

/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, int amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));

  return aarch64_move_pointer (pointer, amount);
}

/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      enum machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}

/* Expand movmem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_movmem (rtx *operands)
{
  unsigned int n;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  */
  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = UINTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For cases below 16 bytes we
     need to make at most two moves.  For cases above 16 bytes it will be one
     move for each 16 byte chunk, then at most two additional moves.  */
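  /* For instance, a 31-byte copy is one 16-byte chunk plus at most two
     further moves: (31 / 16) + (31 % 16 ? 2 : 0) == 3, well within the
     speed budget of 15 / 2 == 7 computed above.  */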
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
     1-byte chunk.  */
  if (n < 4)
    {
      if (n >= 2)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
	  n -= 2;
	}

      if (n == 1)
	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);

      return true;
    }

  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
     4-byte chunk, partially overlapping with the previously copied chunk.  */
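  /* For instance, a 7-byte copy is emitted as bytes 0-3 followed by
     bytes 3-6; the one-byte overlap is cheaper than separate 2-byte and
     1-byte tail copies.  */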
  if (n < 8)
    {
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
      n -= 4;
      if (n > 0)
	{
	  int move = n - 4;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      return true;
    }

  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
     them, then (if applicable) an 8-byte chunk.  */
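  /* For instance, a 24-byte copy is one 16-byte chunk followed by one
     8-byte chunk, leaving n == 0 for the tail code below.  */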
  while (n >= 8)
    {
      if (n / 16)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
	  n -= 16;
	}
      else
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	  n -= 8;
	}
    }

  /* Finish the final bytes of the copy.  We can always do this in one
     instruction.  We either copy the exact amount we need, or partially
     overlap with the previous chunk we copied and copy 8-bytes.  */
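  /* For instance, after a 22-byte copy's 16-byte chunk, n == 6: we move
     both pointers back 2 bytes and copy bytes 14-21 as a single 8-byte
     move.  */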
  if (n == 0)
    return true;
  else if (n == 1)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
  else if (n == 2)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
  else if (n == 4)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
  else
    {
      if (n == 3)
	{
	  src = aarch64_move_pointer (src, -1);
	  dst = aarch64_move_pointer (dst, -1);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      else
	{
	  int move = n - 8;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	}
    }

  return true;
}
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

gcc/config/aarch64/aarch64.h

@@ -672,12 +672,14 @@ do { \
 /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
 #define AARCH64_CALL_RATIO 8

-/* When optimizing for size, give a better estimate of the length of a memcpy
-   call, but use the default otherwise.  But move_by_pieces_ninsns() counts
-   memory-to-memory moves, and we'll have to generate a load & store for each,
-   so halve the value to take that into account.  */
+/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
+   move_by_pieces will continually copy the largest safe chunks.  So a
+   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
+   for both size and speed of copy, so we will instead use the "movmem"
+   standard name to implement the copy.  This logic does not apply when
+   targeting -mstrict-align, so keep a sensible default in that case.  */
 #define MOVE_RATIO(speed) \
-  (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
+  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))

 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */

gcc/config/aarch64/aarch64.md

@@ -883,6 +883,24 @@
  }
)

;; 0 is dst
;; 1 is src
;; 2 is size of move in bytes
;; 3 is alignment

(define_expand "movmemdi"
  [(match_operand:BLK 0 "memory_operand")
   (match_operand:BLK 1 "memory_operand")
   (match_operand:DI 2 "immediate_operand")
   (match_operand:DI 3 "immediate_operand")]
  "!STRICT_ALIGNMENT"
{
  if (aarch64_expand_movmem (operands))
    DONE;
  FAIL;
}
)
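;; For instance, a memcpy of 15 constant bytes arrives here with
;; operands[2] == (const_int 15); aarch64_expand_movmem emits an 8-byte
;; copy of bytes 0-7, then an overlapping 8-byte copy of bytes 7-14.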
;; Operands 1 and 3 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
(define_insn "load_pair<mode>"

gcc/testsuite/ChangeLog

@@ -1,3 +1,8 @@
2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
	* gcc.dg/tree-ssa/sra-12.c: Likewise.

2014-06-06  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	* gcc.c-torture/execute/bswap-2.c: Add alignment constraints to

gcc/testsuite/gcc.dg/tree-ssa/pr42585.c

@@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
 /* Whether the structs are totally scalarized or not depends on the
    MOVE_RATIO macro definition in the back end.  The scalarization will
    not take place when using small values for MOVE_RATIO.  */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */

gcc/testsuite/gcc.dg/tree-ssa/sra-12.c

@@ -21,5 +21,5 @@ int foo (struct S *p)
   *p = l;
 }

-/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "release_ssa" } } */