S/390: Unroll mvc loop for memcpy with small constant lengths.

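This follows the memset unrolling patch; the same reasoning applies to
memcpy calls with small constant lengths.  A memcpy loop (without pfd)
takes 36 bytes of code, the size of 6 MVC instructions, so constant
lengths of up to 6 * 256 bytes are now expanded as straight-line MVCs
instead of a loop.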

2017-01-05  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* config/s390/s390.c (s390_expand_movmem): Unroll MVC loop for
	small constant length operands.

gcc/testsuite/ChangeLog:

2017-01-05  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* gcc.target/s390/memcpy-1.c: New test.

From-SVN: r244098
This commit is contained in:
Andreas Krebbel 2017-01-05 10:03:01 +00:00 committed by Andreas Krebbel
parent 8597cd335e
commit f5a537e390
3 changed files with 75 additions and 3 deletions

View File

@ -5246,10 +5246,25 @@ s390_expand_movmem (rtx dst, rtx src, rtx len)
&& (GET_CODE (len) != CONST_INT || INTVAL (len) > (1<<16)))
return false;
if (GET_CODE (len) == CONST_INT && INTVAL (len) >= 0 && INTVAL (len) <= 256)
/* Expand memcpy for constant length operands without a loop if it
is shorter that way.
With a constant length argument a
memcpy loop (without pfd) is 36 bytes -> 6 * mvc */
if (GET_CODE (len) == CONST_INT
&& INTVAL (len) >= 0
&& INTVAL (len) <= 256 * 6
&& (!TARGET_MVCLE || INTVAL (len) <= 256))
{
if (INTVAL (len) > 0)
emit_insn (gen_movmem_short (dst, src, GEN_INT (INTVAL (len) - 1)));
HOST_WIDE_INT o, l;
for (l = INTVAL (len), o = 0; l > 0; l -= 256, o += 256)
{
rtx newdst = adjust_address (dst, BLKmode, o);
rtx newsrc = adjust_address (src, BLKmode, o);
emit_insn (gen_movmem_short (newdst, newsrc,
GEN_INT (l > 256 ? 255 : l - 1)));
}
}
else if (TARGET_MVCLE)

View File

@ -1,3 +1,7 @@
2017-01-05 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
* gcc.target/s390/memcpy-1.c: New test.
2017-01-04 Jeff Law <law@redhat.com>
PR tree-optimization/78812

View File

@ -0,0 +1,53 @@
/* Make sure that short memcpy's with constant length are emitted
without loop statements. */
/* { dg-do compile } */
/* { dg-options "-O3 -mzarch" } */
/* 700 bytes is within the unrolling limit and not a multiple of 256:
   expected to be emitted as 3 MVCs (256 + 256 + 188 bytes), no loop.  */
void *
memcpy1 (void *dest, const void *src)
{
  void *result = __builtin_memcpy (dest, src, 700);

  return result;
}
/* A zero-length copy: expected to emit no MVC at all (NOP).  */
void *
memcpy2 (void *dest, const void *src)
{
  void *result = __builtin_memcpy (dest, src, 0);

  return result;
}
/* Exactly 256 bytes: expected to be emitted as a single MVC.  */
void *
memcpy3 (void *dest, const void *src)
{
  void *result = __builtin_memcpy (dest, src, 256);

  return result;
}
/* 512 bytes, i.e. two full 256-byte chunks: expected to be emitted as
   2 MVCs without a loop.  */
void *
memcpy4 (void *dest, const void *src)
{
  void *result = __builtin_memcpy (dest, src, 512);

  return result;
}
/* 768 bytes, i.e. three full 256-byte chunks: expected to be emitted
   as 3 MVCs without a loop.  */
void *
memcpy5 (void *dest, const void *src)
{
  void *result = __builtin_memcpy (dest, src, 768);

  return result;
}
/* 1537 bytes is one byte past the 6 * 256 unrolling limit: expected
   to be emitted as a loop, with 2 MVCs in total.  */
void *
memcpy6 (void *dest, const void *src)
{
  void *result = __builtin_memcpy (dest, src, 1537);

  return result;
}
/* memcpy6 uses a loop - check for the two load address instructions
used to increment src and dest. */
/* { dg-final { scan-assembler-times "la" 2 } } */
/* { dg-final { scan-assembler-times "mvc" 11 } } */