[AArch64] Implement movmem for the benefit of inline memcpy
gcc/ * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New. * config/aarch64/aarch64.c (aarch64_move_pointer): New. (aarch64_progress_pointer): Likewise. (aarch64_copy_one_block_and_progress_pointers): Likewise. (aarch64_expand_movmem): Likewise. * config/aarch64/aarch64.h (MOVE_RATIO): Set low. * config/aarch64/aarch64.md (movmem<mode>): New. gcc/testsuite/ * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64. * gcc.dg/tree-ssa/sra-12.c: Likewise. From-SVN: r211314
This commit is contained in:
parent
4ed6899575
commit
e2c75eea66
@ -1,3 +1,13 @@
|
||||
2014-06-06 James Greenhalgh <james.greenhalgh@arm.com>
|
||||
|
||||
* config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
|
||||
* config/aarch64/aarch64.c (aarch64_move_pointer): New.
|
||||
(aarch64_progress_pointer): Likewise.
|
||||
(aarch64_copy_one_block_and_progress_pointers): Likewise.
|
||||
(aarch64_expand_movmem): Likewise.
|
||||
* config/aarch64/aarch64.h (MOVE_RATIO): Set low.
|
||||
* config/aarch64/aarch64.md (movmem<mode>): New.
|
||||
|
||||
2014-06-06 Bingfeng Mei <bmei@broadcom.com>
|
||||
|
||||
* targhooks.c (default_add_stmt_cost): Call target specific
|
||||
|
@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
|
||||
enum aarch64_symbol_type
|
||||
aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
|
||||
bool aarch64_constant_address_p (rtx);
|
||||
bool aarch64_expand_movmem (rtx *);
|
||||
bool aarch64_float_const_zero_rtx_p (rtx);
|
||||
bool aarch64_function_arg_regno_p (unsigned);
|
||||
bool aarch64_gen_movmemqi (rtx *);
|
||||
|
@ -9409,6 +9409,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Return a new RTX holding the result of moving POINTER forward by
|
||||
AMOUNT bytes. */
|
||||
|
||||
static rtx
|
||||
aarch64_move_pointer (rtx pointer, int amount)
|
||||
{
|
||||
rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
|
||||
|
||||
return adjust_automodify_address (pointer, GET_MODE (pointer),
|
||||
next, amount);
|
||||
}
|
||||
|
||||
/* Return a new RTX holding the result of moving POINTER forward by the
|
||||
size of the mode it points to. */
|
||||
|
||||
static rtx
|
||||
aarch64_progress_pointer (rtx pointer)
|
||||
{
|
||||
HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
|
||||
|
||||
return aarch64_move_pointer (pointer, amount);
|
||||
}
|
||||
|
||||
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
|
||||
MODE bytes. */
|
||||
|
||||
static void
|
||||
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
|
||||
enum machine_mode mode)
|
||||
{
|
||||
rtx reg = gen_reg_rtx (mode);
|
||||
|
||||
/* "Cast" the pointers to the correct mode. */
|
||||
*src = adjust_address (*src, mode, 0);
|
||||
*dst = adjust_address (*dst, mode, 0);
|
||||
/* Emit the memcpy. */
|
||||
emit_move_insn (reg, *src);
|
||||
emit_move_insn (*dst, reg);
|
||||
/* Move the pointers forward. */
|
||||
*src = aarch64_progress_pointer (*src);
|
||||
*dst = aarch64_progress_pointer (*dst);
|
||||
}
|
||||
|
||||
/* Expand movmem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.

   OPERANDS come from the movmem pattern: operands[0] is the destination
   BLKmode MEM, operands[1] the source BLKmode MEM, operands[2] the byte
   count (must be a CONST_INT for us to expand inline).  The alignment
   operand is not consulted; this expander is only enabled when
   !STRICT_ALIGNMENT, so unaligned accesses are assumed cheap and we
   freely emit overlapping, possibly misaligned chunk copies.  */

bool
aarch64_expand_movmem (rtx *operands)
{
  unsigned int n;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  */
  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = UINTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For cases below 16 bytes we
     need to make at most two moves.  For cases above 16 bytes it will be one
     move for each 16 byte chunk, then at most two additional moves.
     (Each "move" here becomes a load and a store once expanded.)  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
    return false;

  /* Copy both base addresses into fresh pseudos so the chunk copies below
     can address relative to a plain REG, keeping MEM attributes valid.  */
  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
     1-byte chunk.  */
  if (n < 4)
    {
      if (n >= 2)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
	  n -= 2;
	}

      if (n == 1)
	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);

      return true;
    }

  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
     4-byte chunk, partially overlapping with the previously copied chunk.  */
  if (n < 8)
    {
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
      n -= 4;
      if (n > 0)
	{
	  /* MOVE is negative here (n is 1-3): step the pointers back so
	     that the second 4-byte copy ends exactly at the last byte,
	     overlapping the first copy instead of overrunning the buffer.  */
	  int move = n - 4;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      return true;
    }

  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
     them, then (if applicable) an 8-byte chunk.  */
  while (n >= 8)
    {
      if (n / 16)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
	  n -= 16;
	}
      else
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	  n -= 8;
	}
    }

  /* Finish the final bytes of the copy.  We can always do this in one
     instruction.  We either copy the exact amount we need, or partially
     overlap with the previous chunk we copied and copy 8-bytes.  */
  if (n == 0)
    return true;
  else if (n == 1)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
  else if (n == 2)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
  else if (n == 4)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
  else
    {
      if (n == 3)
	{
	  /* Back up one byte and emit a 4-byte copy that overlaps the
	     previous chunk by one byte.  */
	  src = aarch64_move_pointer (src, -1);
	  dst = aarch64_move_pointer (dst, -1);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      else
	{
	  /* n is 5, 6 or 7: back up by (8 - n) bytes and emit one 8-byte
	     copy overlapping the previous chunk.  */
	  int move = n - 8;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	}
    }

  return true;
}
|
||||
|
||||
#undef TARGET_ADDRESS_COST
|
||||
#define TARGET_ADDRESS_COST aarch64_address_cost
|
||||
|
||||
|
@ -672,12 +672,14 @@ do { \
|
||||
/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends. */
|
||||
#define AARCH64_CALL_RATIO 8
|
||||
|
||||
/* When optimizing for size, give a better estimate of the length of a memcpy
|
||||
call, but use the default otherwise. But move_by_pieces_ninsns() counts
|
||||
memory-to-memory moves, and we'll have to generate a load & store for each,
|
||||
so halve the value to take that into account. */
|
||||
/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
|
||||
move_by_pieces will continually copy the largest safe chunks. So a
|
||||
7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient
|
||||
for both size and speed of copy, so we will instead use the "movmem"
|
||||
standard name to implement the copy. This logic does not apply when
|
||||
targeting -mstrict-align, so keep a sensible default in that case. */
|
||||
#define MOVE_RATIO(speed) \
|
||||
(((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
|
||||
(!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
|
||||
|
||||
/* For CLEAR_RATIO, when optimizing for size, give a better estimate
|
||||
of the length of a memset call, but use the default otherwise. */
|
||||
|
@ -883,6 +883,24 @@
|
||||
}
|
||||
)
|
||||
|
||||
;; Expand an inline block copy (memcpy) of a constant size, delegating the
;; chunking strategy to aarch64_expand_movmem; falls back to a library call
;; (FAIL) when that declines.  Only enabled when unaligned accesses are
;; permitted (!STRICT_ALIGNMENT), since the expansion emits overlapping,
;; possibly misaligned chunk copies.
;; 0 is dst
;; 1 is src
;; 2 is size of move in bytes
;; 3 is alignment

(define_expand "movmemdi"
  [(match_operand:BLK 0 "memory_operand")
   (match_operand:BLK 1 "memory_operand")
   (match_operand:DI 2 "immediate_operand")
   (match_operand:DI 3 "immediate_operand")]
   "!STRICT_ALIGNMENT"
{
  if (aarch64_expand_movmem (operands))
    DONE;
  FAIL;
}
)
|
||||
|
||||
;; Operands 1 and 3 are tied together by the final condition; so we allow
|
||||
;; fairly lax checking on the second memory operation.
|
||||
(define_insn "load_pair<mode>"
|
||||
|
@ -1,3 +1,8 @@
|
||||
2014-06-06 James Greenhalgh <james.greenhalgh@arm.com>
|
||||
|
||||
* gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
|
||||
* gcc.dg/tree-ssa/sra-12.c: Likewise.
|
||||
|
||||
2014-06-06 Thomas Preud'homme <thomas.preudhomme@arm.com>
|
||||
|
||||
* gcc.c-torture/execute/bswap-2.c: Add alignment constraints to
|
||||
|
@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
|
||||
/* Whether the structs are totally scalarized or not depends on the
|
||||
MOVE_RATIO macro definition in the back end. The scalarization will
|
||||
not take place when using small values for MOVE_RATIO. */
|
||||
/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
|
||||
/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
|
||||
/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
|
||||
/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
|
||||
/* { dg-final { cleanup-tree-dump "optimized" } } */
|
||||
|
@ -21,5 +21,5 @@ int foo (struct S *p)
|
||||
*p = l;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
|
||||
/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
|
||||
/* { dg-final { cleanup-tree-dump "release_ssa" } } */
|
||||
|
Loading…
Reference in New Issue
Block a user