target.c (struct gomp_coalesce_buf): New type.

* target.c (struct gomp_coalesce_buf): New type.
	(MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
	(gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
	(gomp_copy_host2dev): Add CBUF argument, if copying into
	the cached ranges, memcpy into buffer instead of copying
	into device.
	(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
	Add CBUF argument, pass it through to other calls.
	(gomp_map_vars): Aggregate copies from host to device if small enough
	and with small enough gaps in between into memcpy into a buffer and
	fewer host to device copies from the buffer.
	(gomp_update): Adjust gomp_copy_host2dev caller.

From-SVN: r254194
Commit metadata: authored by Jakub Jelinek on 2017-10-28 09:02:39 +02:00; committed by Jakub Jelinek.
parent fdfcd5ecc5
commit 7324369a12
2 changed files with 207 additions and 34 deletions

View File

@ -1,3 +1,18 @@
2017-10-28 Jakub Jelinek <jakub@redhat.com>
* target.c (struct gomp_coalesce_buf): New type.
(MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
(gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
(gomp_copy_host2dev): Add CBUF argument, if copying into
the cached ranges, memcpy into buffer instead of copying
into device.
(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
Add CBUF argument, pass it through to other calls.
(gomp_map_vars): Aggregate copies from host to device if small enough
and with small enough gaps in between into memcpy into a buffer and
fewer host to device copies from the buffer.
(gomp_update): Adjust gomp_copy_host2dev caller.
2017-10-17 Thomas Schwinge <thomas@codesourcery.com>
* testsuite/libgomp.oacc-fortran/declare-1.f90: Restore "dg-do

View File

@ -177,10 +177,122 @@ gomp_device_copy (struct gomp_device_descr *devicep,
}
}
/* Infrastructure for coalescing adjacent or nearly adjacent (in device
   addresses) host to device memory transfers.  */

struct gomp_coalesce_buf
{
  /* Host-side staging buffer: gomp_copy_host2dev memcpys data here and the
     whole buffer is later transferred to the device in few large copies.  */
  void *buf;
  struct target_mem_desc *tgt;
  /* Offset pairs relative to tgt->tgt_start: chunks[2 * i] is the start and
     chunks[2 * i + 1] the end of the i-th region to be staged in BUF and
     then copied to the device.  */
  size_t *chunks;
  /* Number of entries in CHUNKS, or -1 once coalescing has been ruled out
     (e.g. after an out-of-order region was seen).  */
  long chunk_cnt;
  /* While CHUNKS is being built, the number of memory regions merged into
     the last chunk.  A chunk holding a single region is copied directly to
     the device instead of being staged through BUF.  */
  long use_cnt;
};

/* Maximum size of memory region considered for coalescing.  Larger copies
   are performed directly.  */
#define MAX_COALESCE_BUF_SIZE	(32 * 1024)

/* Maximum size of a gap in between regions to consider them being copied
   within the same chunk.  All the device offsets considered are within
   newly allocated device memory, so it isn't fatal if we copy some padding
   in between from host to device.  The gaps come either from alignment
   padding or from memory regions which are not supposed to be copied from
   host to device (e.g. map(alloc:), map(from:) etc.).  */
#define MAX_COALESCE_BUF_GAP	(4 * 1024)

/* Record a region of LEN bytes at device offset START (relative to
   tgt->tgt_start) in CBUF, extending the last chunk when the region is
   close enough to it.  */

static inline void
gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
{
  /* Empty regions and regions too large to be worth staging are
     transferred directly.  */
  if (len == 0 || len > MAX_COALESCE_BUF_SIZE)
    return;

  if (cbuf->chunk_cnt)
    {
      /* Coalescing has already been abandoned.  */
      if (cbuf->chunk_cnt < 0)
	return;

      size_t prev_end = cbuf->chunks[2 * cbuf->chunk_cnt - 1];
      if (start < prev_end)
	{
	  /* Regions arrived out of order: give up on coalescing.  */
	  cbuf->chunk_cnt = -1;
	  return;
	}
      if (start < prev_end + MAX_COALESCE_BUF_GAP)
	{
	  /* Close enough to the previous region: grow the last chunk.  */
	  cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
	  cbuf->use_cnt++;
	  return;
	}
      /* If the last chunk is only used by one mapping, discard it, as it
	 will be one host to device copy anyway and memcpying it around
	 will only waste cycles.  */
      if (cbuf->use_cnt == 1)
	cbuf->chunk_cnt--;
    }

  /* Open a fresh chunk covering [START, START + LEN).  */
  cbuf->chunks[2 * cbuf->chunk_cnt] = start;
  cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
  cbuf->chunk_cnt++;
  cbuf->use_cnt = 1;
}
/* Return true for mapping kinds which need to copy data from the
   host to device for regions that weren't previously mapped.  */

static inline bool
gomp_to_device_kind_p (int kind)
{
  /* Allocation-only and device-to-host-only kinds need no initial
     host-to-device transfer; every other kind does.  */
  bool no_transfer = (kind == GOMP_MAP_ALLOC
		      || kind == GOMP_MAP_FROM
		      || kind == GOMP_MAP_FORCE_ALLOC
		      || kind == GOMP_MAP_ALWAYS_FROM);
  return !no_transfer;
}
static void
gomp_copy_host2dev (struct gomp_device_descr *devicep,
void *d, const void *h, size_t sz)
void *d, const void *h, size_t sz,
struct gomp_coalesce_buf *cbuf)
{
if (cbuf)
{
uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
{
long first = 0;
long last = cbuf->chunk_cnt - 1;
while (first <= last)
{
long middle = (first + last) >> 1;
if (cbuf->chunks[2 * middle + 1] <= doff)
first = middle + 1;
else if (cbuf->chunks[2 * middle] <= doff)
{
if (doff + sz > cbuf->chunks[2 * middle + 1])
gomp_fatal ("internal libgomp cbuf error");
memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
h, sz);
return;
}
else
last = middle - 1;
}
}
}
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
}
@ -208,7 +320,7 @@ gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
static inline void
gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
splay_tree_key newn, struct target_var_desc *tgt_var,
unsigned char kind)
unsigned char kind, struct gomp_coalesce_buf *cbuf)
{
tgt_var->key = oldn;
tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
@ -232,7 +344,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+ newn->host_start - oldn->host_start),
(void *) newn->host_start,
newn->host_end - newn->host_start);
newn->host_end - newn->host_start, cbuf);
if (oldn->refcount != REFCOUNT_INFINITY)
oldn->refcount++;
@ -247,7 +359,8 @@ get_kind (bool short_mapkind, void *kinds, int idx)
static void
gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
uintptr_t target_offset, uintptr_t bias)
uintptr_t target_offset, uintptr_t bias,
struct gomp_coalesce_buf *cbuf)
{
struct gomp_device_descr *devicep = tgt->device_descr;
struct splay_tree_s *mem_map = &devicep->mem_map;
@ -257,11 +370,10 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
if (cur_node.host_start == (uintptr_t) NULL)
{
cur_node.tgt_offset = (uintptr_t) NULL;
/* FIXME: see comment about coalescing host/dev transfers below. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset,
sizeof (void *));
sizeof (void *), cbuf);
return;
}
/* Add bias to the pointer value. */
@ -280,15 +392,15 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
array section. Now subtract bias to get what we want
to initialize the pointer with. */
cur_node.tgt_offset -= bias;
/* FIXME: see comment about coalescing host/dev transfers below. */
gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset, sizeof (void *));
(void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
}
static void
gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
size_t first, size_t i, void **hostaddrs,
size_t *sizes, void *kinds)
size_t *sizes, void *kinds,
struct gomp_coalesce_buf *cbuf)
{
struct gomp_device_descr *devicep = tgt->device_descr;
struct splay_tree_s *mem_map = &devicep->mem_map;
@ -306,7 +418,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
gomp_map_vars_existing (devicep, n2, &cur_node,
&tgt->list[i], kind & typemask);
&tgt->list[i], kind & typemask, cbuf);
return;
}
if (sizes[i] == 0)
@ -322,7 +434,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
== n2->tgt_offset - n->tgt_offset)
{
gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
kind & typemask);
kind & typemask, cbuf);
return;
}
}
@ -334,7 +446,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
kind & typemask);
kind & typemask, cbuf);
return;
}
}
@ -381,6 +493,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt->list_count = mapnum;
tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
tgt->device_descr = devicep;
struct gomp_coalesce_buf cbuf, *cbufp = NULL;
if (mapnum == 0)
{
@ -391,11 +504,25 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt_align = sizeof (void *);
tgt_size = 0;
cbuf.chunks = NULL;
cbuf.chunk_cnt = -1;
cbuf.use_cnt = 0;
cbuf.buf = NULL;
if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
{
cbuf.chunks
= (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
cbuf.chunk_cnt = 0;
}
if (pragma_kind == GOMP_MAP_VARS_TARGET)
{
size_t align = 4 * sizeof (void *);
tgt_align = align;
tgt_size = mapnum * sizeof (void *);
cbuf.chunk_cnt = 1;
cbuf.use_cnt = 1 + (mapnum > 1);
cbuf.chunks[0] = 0;
cbuf.chunks[1] = tgt_size;
}
gomp_mutex_lock (&devicep->lock);
@ -449,19 +576,26 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
size_t align = (size_t) 1 << (kind >> rshift);
if (tgt_align < align)
tgt_align = align;
tgt_size -= (uintptr_t) hostaddrs[first]
- (uintptr_t) hostaddrs[i];
tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
tgt_size += cur_node.host_end - cur_node.host_start;
not_found_cnt += last - i;
for (i = first; i <= last; i++)
tgt->list[i].key = NULL;
{
tgt->list[i].key = NULL;
if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
& typemask))
gomp_coalesce_buf_add (&cbuf,
tgt_size - cur_node.host_end
+ (uintptr_t) hostaddrs[i],
sizes[i]);
}
i--;
continue;
}
for (i = first; i <= last; i++)
gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
sizes, kinds);
sizes, kinds, NULL);
i--;
continue;
}
@ -485,6 +619,8 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
gomp_coalesce_buf_add (&cbuf, tgt_size,
cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
has_firstprivate = true;
continue;
@ -504,7 +640,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
n = splay_tree_lookup (mem_map, &cur_node);
if (n && n->refcount != REFCOUNT_LINK)
gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
kind & typemask);
kind & typemask, NULL);
else
{
tgt->list[i].key = NULL;
@ -514,6 +650,9 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
if (gomp_to_device_kind_p (kind & typemask))
gomp_coalesce_buf_add (&cbuf, tgt_size,
cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
if ((kind & typemask) == GOMP_MAP_TO_PSET)
{
@ -562,6 +701,19 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt->tgt_start = (uintptr_t) tgt->to_free;
tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
tgt->tgt_end = tgt->tgt_start + tgt_size;
if (cbuf.use_cnt == 1)
cbuf.chunk_cnt--;
if (cbuf.chunk_cnt > 0)
{
cbuf.buf
= malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
if (cbuf.buf)
{
cbuf.tgt = tgt;
cbufp = &cbuf;
}
}
}
else
{
@ -600,7 +752,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
len = sizes[i];
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start + tgt_size),
(void *) hostaddrs[i], len);
(void *) hostaddrs[i], len, cbufp);
tgt_size += len;
continue;
case GOMP_MAP_FIRSTPRIVATE_INT:
@ -633,7 +785,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
}
for (i = first; i <= last; i++)
gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
sizes, kinds);
sizes, kinds, cbufp);
i--;
continue;
case GOMP_MAP_ALWAYS_POINTER:
@ -658,7 +810,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
+ cur_node.host_start
- n->host_start),
(void *) &cur_node.tgt_offset,
sizeof (void *));
sizeof (void *), cbufp);
cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
+ cur_node.host_start - n->host_start;
continue;
@ -674,7 +826,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
splay_tree_key n = splay_tree_lookup (mem_map, k);
if (n && n->refcount != REFCOUNT_LINK)
gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
kind & typemask);
kind & typemask, cbufp);
else
{
k->link_key = NULL;
@ -725,26 +877,22 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
case GOMP_MAP_FORCE_TOFROM:
case GOMP_MAP_ALWAYS_TO:
case GOMP_MAP_ALWAYS_TOFROM:
/* FIXME: Perhaps add some smarts, like if copying
several adjacent fields from host to target, use some
host buffer to avoid sending each var individually. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
k->host_end - k->host_start);
k->host_end - k->host_start, cbufp);
break;
case GOMP_MAP_POINTER:
gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
k->tgt_offset, sizes[i]);
k->tgt_offset, sizes[i], cbufp);
break;
case GOMP_MAP_TO_PSET:
/* FIXME: see above FIXME comment. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
k->host_end - k->host_start);
k->host_end - k->host_start, cbufp);
for (j = i + 1; j < mapnum; j++)
if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@ -767,7 +915,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
k->tgt_offset
+ ((uintptr_t) hostaddrs[j]
- k->host_start),
sizes[j]);
sizes[j], cbufp);
i++;
}
break;
@ -795,7 +943,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
sizeof (void *));
sizeof (void *), cbufp);
break;
default:
gomp_mutex_unlock (&devicep->lock);
@ -822,13 +970,23 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
for (i = 0; i < mapnum; i++)
{
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
/* FIXME: see above FIXME comment. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start + i * sizeof (void *)),
(void *) &cur_node.tgt_offset, sizeof (void *));
(void *) &cur_node.tgt_offset, sizeof (void *),
cbufp);
}
}
if (cbufp)
{
long c = 0;
for (c = 0; c < cbuf.chunk_cnt; ++c)
gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
(char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
free (cbuf.buf);
}
/* If the variable from "omp target enter data" map-list was already mapped,
tgt is not needed. Otherwise tgt will be freed by gomp_unmap_vars or
gomp_exit_data. */
@ -970,7 +1128,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
size_t size = cur_node.host_end - cur_node.host_start;
if (GOMP_MAP_COPY_TO_P (kind & typemask))
gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
if (GOMP_MAP_COPY_FROM_P (kind & typemask))
gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
}