Optimize tile (mostly tilegx) memcpy and memmove performance.

- Override <memcopy.h> so we use full 8-byte word copies on tilegx32
  for memmove, then use op_t in memcpy instead of the previous
  locally-defined word_t just to avoid proliferating identical types.
- Fix bug in memcpy prefetch that caused us to never prefetch past
  the first cache line.
- Optimize misaligned memcpy by inlining _wordcopy_fwd_dest_aligned
  instead of just doing a dumb word-at-a-time copy.
- Make memcpy safe for forward copies by doing all the loads from
  a given cache line prior to doing a wh64 (cache line zero-fill)
  on the destination.  Remove now-redundant src == dst check.
- Copy and optimize the generic wordcopy.c routines to use the tile
  "double align" instruction instead of the MERGE macro; to avoid
  offset addressing mode (which tile doesn't have) by rewriting the
  pointer math to load and store with a zero index; and to use
  post-increment addresses in the inner loops to improve scheduling.
This commit is contained in:
Chris Metcalf 2012-11-02 12:53:57 -04:00
parent 82477c28f4
commit cd84016efe
4 changed files with 613 additions and 65 deletions

View File

@ -1,3 +1,9 @@
2012-11-02 Chris Metcalf <cmetcalf@tilera.com>
* sysdeps/tile/tilegx/memcpy.c (__memcpy): Optimize.
* sysdeps/tile/memcopy.h: New file.
* sysdeps/tile/wordcopy.c: New file.
2012-11-03 Joseph Myers <joseph@codesourcery.com>
[BZ #3439]

View File

@ -0,0 +1,27 @@
/* memcopy.h -- definitions for memory copy functions. Tile version.
Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdeps/generic/memcopy.h>
#include <bits/wordsize.h>
/* Support more efficient copying on tilegx32, which supports
long long as a native 64-bit type.  Whatever op_t the generic header
included above provided is dropped and replaced with an 8-byte type,
so word-at-a-time copies move 8 bytes per operation even though
__WORDSIZE is 32.  Every other configuration keeps the generic op_t. */
#if defined (__tilegx__) && __WORDSIZE == 32
# undef op_t
# define op_t unsigned long long int
#endif

View File

@ -19,11 +19,9 @@
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <memcopy.h>
#include <arch/chip.h>
/* Must be 8 bytes in size. */
#define word_t uint64_t
/* How many cache lines ahead should we prefetch? */
#define PREFETCH_LINES_AHEAD 3
@ -34,8 +32,8 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
const char *__restrict src1 = (const char *) srcv;
const char *__restrict src1_end;
const char *__restrict prefetch;
word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
word_t final; /* Final bytes to write to trailing word, if any */
op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
op_t final; /* Final bytes to write to trailing word, if any */
long i;
if (n < 16)
@ -55,101 +53,169 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
{
__insn_prefetch (prefetch);
prefetch += CHIP_L2_LINE_SIZE ();
prefetch = (prefetch > src1_end) ? prefetch : src1;
prefetch = (prefetch < src1_end) ? prefetch : src1;
}
/* Copy bytes until dst is word-aligned. */
for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
*dst1++ = *src1++;
/* 8-byte pointer to destination memory. */
dst8 = (word_t *) dst1;
dst8 = (op_t *) dst1;
if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
{
/* Misaligned copy. Copy 8 bytes at a time, but don't bother
with other fanciness.
TODO: Consider prefetching and using wh64 as well. */
/* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but
inline it to avoid prologue/epilogue. TODO: Consider
prefetching and using wh64 as well. */
void * srci;
op_t a0, a1, a2, a3;
long int dstp = (long int) dst1;
long int srcp = (long int) src1;
long int len = n / OPSIZ;
/* Create an aligned src8. */
const word_t *__restrict src8 =
(const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
word_t b;
/* Save the initial source pointer so we know the number of
bytes to shift for merging two unaligned results. */
srci = (void *) srcp;
word_t a = *src8++;
for (; n >= sizeof (word_t); n -= sizeof (word_t))
{
b = *src8++;
a = __insn_dblalign (a, b, src1);
*dst8++ = a;
a = b;
}
/* Make SRCP aligned by rounding it down to the beginning of the
`op_t' it points in the middle of. */
srcp &= -OPSIZ;
switch (len % 4)
{
case 2:
a1 = ((op_t *) srcp)[0];
a2 = ((op_t *) srcp)[1];
len += 2;
srcp += 2 * OPSIZ;
goto do1;
case 3:
a0 = ((op_t *) srcp)[0];
a1 = ((op_t *) srcp)[1];
len += 1;
srcp += 2 * OPSIZ;
goto do2;
case 0:
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
return dstv;
a3 = ((op_t *) srcp)[0];
a0 = ((op_t *) srcp)[1];
len += 0;
srcp += 2 * OPSIZ;
goto do3;
case 1:
a2 = ((op_t *) srcp)[0];
a3 = ((op_t *) srcp)[1];
srcp += 2 * OPSIZ;
len -= 1;
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
goto do0;
goto do4; /* No-op. */
}
do
{
do4:
a0 = ((op_t *) srcp)[0];
a2 = __insn_dblalign (a2, a3, srci);
((op_t *) dstp)[0] = a2;
srcp += OPSIZ;
dstp += OPSIZ;
do3:
a1 = ((op_t *) srcp)[0];
a3 = __insn_dblalign (a3, a0, srci);
((op_t *) dstp)[0] = a3;
srcp += OPSIZ;
dstp += OPSIZ;
do2:
a2 = ((op_t *) srcp)[0];
a0 = __insn_dblalign (a0, a1, srci);
((op_t *) dstp)[0] = a0;
srcp += OPSIZ;
dstp += OPSIZ;
do1:
a3 = ((op_t *) srcp)[0];
a1 = __insn_dblalign (a1, a2, srci);
((op_t *) dstp)[0] = a1;
srcp += OPSIZ;
dstp += OPSIZ;
len -= 4;
}
while (len != 0);
/* This is the right position for do0. Please don't move
it into the loop. */
do0:
((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
n = n % OPSIZ;
if (n == 0)
return dstv;
return dstv;
b = ((const char *) src8 <= src1_end) ? *src8 : 0;
a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
/* Final source bytes to write to trailing partial word, if any. */
final = __insn_dblalign (a, b, src1);
final = __insn_dblalign (a3, a0, srci);
dst8 = (op_t *)(dstp + OPSIZ);
}
else
{
/* Aligned copy. */
const word_t *__restrict src8 = (const word_t *) src1;
const op_t *__restrict src8 = (const op_t *) src1;
/* src8 and dst8 are both word-aligned. */
if (n >= CHIP_L2_LINE_SIZE ())
{
/* Copy until 'dst' is cache-line-aligned. */
for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
n -= sizeof (word_t))
n -= sizeof (op_t))
*dst8++ = *src8++;
/* If copying to self, return. The test is cheap enough
that we do it despite the fact that the memcpy() contract
doesn't require us to support overlapping dst and src.
This is the most common case of overlap, and any close
overlap will cause corruption due to the wh64 below.
This case is particularly important since the compiler
will emit memcpy() calls for aggregate copies even if it
can't prove that src != dst. */
if (__builtin_expect (dst8 == src8, 0))
return dstv;
for (; n >= CHIP_L2_LINE_SIZE ();)
{
__insn_wh64 (dst8);
{
op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
/* Prefetch and advance to next line to prefetch, but
don't go past the end. */
__insn_prefetch (prefetch);
prefetch += CHIP_L2_LINE_SIZE ();
prefetch = (prefetch > src1_end) ? prefetch :
(const char *) src8;
/* Prefetch and advance to next line to prefetch, but
don't go past the end. */
__insn_prefetch (prefetch);
prefetch += CHIP_L2_LINE_SIZE ();
prefetch = (prefetch < src1_end) ? prefetch :
(const char *) src8;
/* Copy an entire cache line. Manually unrolled to
avoid idiosyncrasies of compiler unrolling. */
#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
COPY_WORD (0);
COPY_WORD (1);
COPY_WORD (2);
COPY_WORD (3);
COPY_WORD (4);
COPY_WORD (5);
COPY_WORD (6);
COPY_WORD (7);
/* Do all the loads before wh64. This is necessary if
[src8, src8+7] and [dst8, dst8+7] share the same
cache line and dst8 <= src8, as can be the case when
called from memmove, or with code tested on x86 whose
memcpy always works with forward copies. */
tmp0 = *src8++;
tmp1 = *src8++;
tmp2 = *src8++;
tmp3 = *src8++;
tmp4 = *src8++;
tmp5 = *src8++;
tmp6 = *src8++;
tmp7 = *src8++;
__insn_wh64 (dst8);
*dst8++ = tmp0;
*dst8++ = tmp1;
*dst8++ = tmp2;
*dst8++ = tmp3;
*dst8++ = tmp4;
*dst8++ = tmp5;
*dst8++ = tmp6;
*dst8++ = tmp7;
n -= 64;
}
#if CHIP_L2_LINE_SIZE() != 64
# error "Fix code that assumes particular L2 cache line size."
#endif
dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
}
}
for (; n >= sizeof (word_t); n -= sizeof (word_t))
for (; n >= sizeof (op_t); n -= sizeof (op_t))
*dst8++ = *src8++;
if (__builtin_expect (n == 0, 1))

View File

@ -0,0 +1,449 @@
/* wordcopy.c -- subroutines for memory copy functions. Tile version.
Copyright (C) 1991-2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* To optimize for tile, we make the following changes from the
default glibc version:
- Use the double align instruction instead of the MERGE macro.
- Since we don't have offset addressing mode, make sure the loads /
stores in the inner loop always have indices of 0.
- Use post-increment addresses in the inner loops, which yields
better scheduling. */
/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...! */
#include <stddef.h>
#include <memcopy.h>
/* Provide the appropriate dblalign builtin to shift two registers
based on the alignment of a pointer held in a third register.
DBLALIGN (LO, HI, PTR) is used below with LO being the word loaded
from the lower address and HI the word loaded from the next higher
address.  tilegx uses __insn_dblalign; any other target building this
file (presumably tilepro -- TODO confirm) uses __insn_dword_align. */
#ifdef __tilegx__
#define DBLALIGN __insn_dblalign
#else
#define DBLALIGN __insn_dword_align
#endif
/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
Both SRCP and DSTP should be aligned for memory operations on `op_t's.

The copy proceeds in ascending address order through an eight-way
unrolled loop that keeps one loaded word in flight: every step loads
the next source word and stores the word loaded by the step before
it, alternating between A0 and A1.  The switch on LEN % 8 loads the
first word and jumps to the loop entry that makes the `len -= 8' at
the bottom of the loop land exactly on zero.  Exactly LEN words are
loaded and LEN words are stored. */
void
_wordcopy_fwd_aligned (dstp, srcp, len)
long int dstp;
long int srcp;
size_t len;
{
op_t a0, a1;
/* Preload the first word into the register that the target step
stores, then bias LEN so the loop count comes out even. */
switch (len % 8)
{
case 2:
a0 = ((op_t *) srcp)[0];
srcp += OPSIZ;
len += 6;
goto do1;
case 3:
a1 = ((op_t *) srcp)[0];
srcp += OPSIZ;
len += 5;
goto do2;
case 4:
a0 = ((op_t *) srcp)[0];
srcp += OPSIZ;
len += 4;
goto do3;
case 5:
a1 = ((op_t *) srcp)[0];
srcp += OPSIZ;
len += 3;
goto do4;
case 6:
a0 = ((op_t *) srcp)[0];
srcp += OPSIZ;
len += 2;
goto do5;
case 7:
a1 = ((op_t *) srcp)[0];
srcp += OPSIZ;
len += 1;
goto do6;
case 0:
/* The zero-length test is only active when OP_T_THRES <= 3 * OPSIZ;
presumably callers otherwise guarantee a larger LEN -- TODO confirm. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
return;
a0 = ((op_t *) srcp)[0];
srcp += OPSIZ;
goto do7;
case 1:
a1 = ((op_t *) srcp)[0];
srcp += OPSIZ;
len -= 1;
/* A single word is already in A1: skip the loop and store it. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
goto do0;
goto do8; /* No-op. */
}
/* Each step: load the next word, store the previously loaded one, and
advance both pointers.  All accesses use index 0. */
do
{
do8:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp += OPSIZ;
dstp += OPSIZ;
do7:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp += OPSIZ;
dstp += OPSIZ;
do6:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp += OPSIZ;
dstp += OPSIZ;
do5:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp += OPSIZ;
dstp += OPSIZ;
do4:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp += OPSIZ;
dstp += OPSIZ;
do3:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp += OPSIZ;
dstp += OPSIZ;
do2:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp += OPSIZ;
dstp += OPSIZ;
do1:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp += OPSIZ;
dstp += OPSIZ;
len -= 8;
}
while (len != 0);
/* This is the right position for do0. Please don't move
it into the loop.  It stores the final word, which the last step
(or the LEN == 1 preload) left in A1. */
do0:
((op_t *) dstp)[0] = a1;
}
/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
DSTP should be aligned for memory operations on `op_t's, but SRCP must
*not* be aligned.

The source is read as aligned words starting at SRCP rounded down;
each destination word is produced by DBLALIGN from two neighbouring
aligned source words, using the original SRCP to select the shift.
Two words are preloaded before the four-way unrolled loop is entered,
so producing LEN destination words reads LEN + 1 aligned source
words. */
void
_wordcopy_fwd_dest_aligned (dstp, srcp, len)
long int dstp;
long int srcp;
size_t len;
{
void * srci;
op_t a0, a1, a2, a3;
/* Save the initial source pointer so we know the number of bytes to
shift for merging two unaligned results. */
srci = (void *) srcp;
/* Make SRCP aligned by rounding it down to the beginning of the `op_t'
it points in the middle of. */
srcp &= -OPSIZ;
/* Preload the first two aligned words into the register pair that the
target step merges, and bias LEN so the `len -= 4' at the bottom of
the loop lands exactly on zero. */
switch (len % 4)
{
case 2:
a1 = ((op_t *) srcp)[0];
a2 = ((op_t *) srcp)[1];
len += 2;
srcp += 2 * OPSIZ;
goto do1;
case 3:
a0 = ((op_t *) srcp)[0];
a1 = ((op_t *) srcp)[1];
len += 1;
srcp += 2 * OPSIZ;
goto do2;
case 0:
/* The zero-length test is only active when OP_T_THRES <= 3 * OPSIZ;
presumably callers otherwise guarantee a larger LEN -- TODO confirm. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
return;
a3 = ((op_t *) srcp)[0];
a0 = ((op_t *) srcp)[1];
len += 0;
srcp += 2 * OPSIZ;
goto do3;
case 1:
a2 = ((op_t *) srcp)[0];
a3 = ((op_t *) srcp)[1];
srcp += 2 * OPSIZ;
len -= 1;
/* A single word: both halves are already in A2/A3, so skip the loop
and let do0 merge and store them. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
goto do0;
goto do4; /* No-op. */
}
/* Each step: load the next aligned source word, merge the two words
loaded before it (lower address first), store the result, and
advance both pointers.  The merged value overwrites the older of the
two registers, which is not needed again. */
do
{
do4:
a0 = ((op_t *) srcp)[0];
a2 = DBLALIGN (a2, a3, srci);
((op_t *) dstp)[0] = a2;
srcp += OPSIZ;
dstp += OPSIZ;
do3:
a1 = ((op_t *) srcp)[0];
a3 = DBLALIGN (a3, a0, srci);
((op_t *) dstp)[0] = a3;
srcp += OPSIZ;
dstp += OPSIZ;
do2:
a2 = ((op_t *) srcp)[0];
a0 = DBLALIGN (a0, a1, srci);
((op_t *) dstp)[0] = a0;
srcp += OPSIZ;
dstp += OPSIZ;
do1:
a3 = ((op_t *) srcp)[0];
a1 = DBLALIGN (a1, a2, srci);
((op_t *) dstp)[0] = a1;
srcp += OPSIZ;
dstp += OPSIZ;
len -= 4;
}
while (len != 0);
/* This is the right position for do0. Please don't move
it into the loop.  It merges and stores the final pair, which the
last step (or the LEN == 1 preload) left in A2 and A3. */
do0:
((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci);
}
/* _wordcopy_bwd_aligned -- Copy block finishing right before
SRCP to block finishing right before DSTP with LEN `op_t' words
(not LEN bytes!). Both SRCP and DSTP should be aligned for memory
operations on `op_t's.

The copy proceeds in descending address order through an eight-way
unrolled loop that keeps one loaded word in flight: every step loads
the next lower source word and stores the word loaded by the step
before it, alternating between A0 and A1.  The switch on LEN % 8
loads the last source word and jumps to the loop entry that makes the
`len -= 8' at the bottom of the loop land exactly on zero. */
void
_wordcopy_bwd_aligned (dstp, srcp, len)
long int dstp;
long int srcp;
size_t len;
{
op_t a0, a1;
long int srcp1;
/* SRCP and DSTP arrive pointing one past the end of their blocks.
SRCP1 addresses the last source word (used only for the preload),
SRCP the word below it (the first one the loop loads), and DSTP the
last destination word. */
srcp1 = srcp - 1 * OPSIZ;
srcp -= 2 * OPSIZ;
dstp -= 1 * OPSIZ;
/* Preload the last source word into the register that the target step
stores, then bias LEN so the loop count comes out even. */
switch (len % 8)
{
case 2:
a0 = ((op_t *) srcp1)[0];
len += 6;
goto do1;
case 3:
a1 = ((op_t *) srcp1)[0];
len += 5;
goto do2;
case 4:
a0 = ((op_t *) srcp1)[0];
len += 4;
goto do3;
case 5:
a1 = ((op_t *) srcp1)[0];
len += 3;
goto do4;
case 6:
a0 = ((op_t *) srcp1)[0];
len += 2;
goto do5;
case 7:
a1 = ((op_t *) srcp1)[0];
len += 1;
goto do6;
case 0:
/* The zero-length test is only active when OP_T_THRES <= 3 * OPSIZ;
presumably callers otherwise guarantee a larger LEN -- TODO confirm. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
return;
a0 = ((op_t *) srcp1)[0];
goto do7;
case 1:
a1 = ((op_t *) srcp1)[0];
len -= 1;
/* A single word is already in A1: skip the loop and store it. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
goto do0;
goto do8; /* No-op. */
}
/* Each step: load the next lower word, store the previously loaded
one, and move both pointers down.  All accesses use index 0. */
do
{
do8:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp -= OPSIZ;
dstp -= OPSIZ;
do7:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp -= OPSIZ;
dstp -= OPSIZ;
do6:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp -= OPSIZ;
dstp -= OPSIZ;
do5:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp -= OPSIZ;
dstp -= OPSIZ;
do4:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp -= OPSIZ;
dstp -= OPSIZ;
do3:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp -= OPSIZ;
dstp -= OPSIZ;
do2:
a0 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a1;
srcp -= OPSIZ;
dstp -= OPSIZ;
do1:
a1 = ((op_t *) srcp)[0];
((op_t *) dstp)[0] = a0;
srcp -= OPSIZ;
dstp -= OPSIZ;
len -= 8;
}
while (len != 0);
/* This is the right position for do0. Please don't move
it into the loop.  It stores the lowest word of the block, which the
last step (or the LEN == 1 preload) left in A1. */
do0:
((op_t *) dstp)[0] = a1;
}
/* _wordcopy_bwd_dest_aligned -- Copy block finishing right
before SRCP to block finishing right before DSTP with LEN `op_t'
words (not LEN bytes!). DSTP should be aligned for memory
operations on `op_t', but SRCP must *not* be aligned.

The copy proceeds in descending address order.  The source is read as
aligned words; each destination word is produced by DBLALIGN from two
neighbouring aligned source words (lower address first), using the
original SRCP to select the shift.  Every loaded word is kept twice,
as aN and bN: aN is overwritten with the merged result when it serves
as the lower word, while bN still holds the raw value for the step
that uses it as the upper word. */
void
_wordcopy_bwd_dest_aligned (dstp, srcp, len)
long int dstp;
long int srcp;
size_t len;
{
void * srci;
op_t a0, a1, a2, a3;
op_t b0, b1, b2, b3;
/* Save the initial source pointer so we know the number of bytes to
shift for merging two unaligned results. */
srci = (void *) srcp;
/* Make SRCP aligned by rounding it down to the beginning of the op_t
it points in the middle of, then step to the word after it.  Every
case below backs up three words from there, so index 2 addresses the
aligned word SRCP originally pointed into, index 1 the word below
it, and index 0 the first word the loop will load. */
srcp &= -OPSIZ;
srcp += OPSIZ;
/* Preload the top two aligned words (upper into bN, lower into both aN
and bN), point DSTP at the last destination word, and bias LEN so
the `len -= 4' at the bottom of the loop lands exactly on zero. */
switch (len % 4)
{
case 2:
srcp -= 3 * OPSIZ;
dstp -= 1 * OPSIZ;
b2 = ((op_t *) srcp)[2];
b1 = a1 = ((op_t *) srcp)[1];
len += 2;
goto do1;
case 3:
srcp -= 3 * OPSIZ;
dstp -= 1 * OPSIZ;
b3 = ((op_t *) srcp)[2];
b2 = a2 = ((op_t *) srcp)[1];
len += 1;
goto do2;
case 0:
/* The zero-length test is only active when OP_T_THRES <= 3 * OPSIZ;
presumably callers otherwise guarantee a larger LEN -- TODO confirm. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
return;
srcp -= 3 * OPSIZ;
dstp -= 1 * OPSIZ;
b0 = ((op_t *) srcp)[2];
b3 = a3 = ((op_t *) srcp)[1];
goto do3;
case 1:
srcp -= 3 * OPSIZ;
dstp -= 1 * OPSIZ;
b1 = ((op_t *) srcp)[2];
b0 = a0 = ((op_t *) srcp)[1];
len -= 1;
/* A single word: both halves are already in A0/B1, so skip the loop
and let do0 merge and store them. */
if (OP_T_THRES <= 3 * OPSIZ && len == 0)
goto do0;
goto do4; /* No-op. */
}
/* Each step: load the next lower aligned source word into an aN/bN
pair, merge the two words loaded before it, store the result, and
move both pointers down. */
do
{
do4:
b3 = a3 = ((op_t *) srcp)[0];
a0 = DBLALIGN (a0, b1, srci);
((op_t *) dstp)[0] = a0;
srcp -= OPSIZ;
dstp -= OPSIZ;
do3:
b2 = a2 = ((op_t *) srcp)[0];
a3 = DBLALIGN (a3, b0, srci);
((op_t *) dstp)[0] = a3;
srcp -= OPSIZ;
dstp -= OPSIZ;
do2:
b1 = a1 = ((op_t *) srcp)[0];
a2 = DBLALIGN (a2, b3, srci);
((op_t *) dstp)[0] = a2;
srcp -= OPSIZ;
dstp -= OPSIZ;
do1:
b0 = a0 = ((op_t *) srcp)[0];
a1 = DBLALIGN (a1, b2, srci);
((op_t *) dstp)[0] = a1;
srcp -= OPSIZ;
dstp -= OPSIZ;
len -= 4;
}
while (len != 0);
/* This is the right position for do0. Please don't move
it into the loop.  It merges and stores the lowest destination word
from the pair that the last step (or the LEN == 1 preload) left in
A0 and B1. */
do0:
a0 = DBLALIGN (a0, b1, srci);
((op_t *) dstp)[0] = a0;
}