oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.
2016-05-26 Chung-Lin Tang <cltang@codesourcery.com> libgomp/ * oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter. * oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async' parameter, use to set async stream around call to gomp_unmap_vars, call gomp_unmap_vars() with 'do_copyfrom' set to true. * plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field. (event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP events and call GOMP_PLUGIN_async_unmap_vars() for each of them. (event_add): Add int parameter, initialize 'val' field when adding new ptx_event struct. (nvptx_exec): Adjust event_add() call arguments. (nvptx_host2dev): Likewise. (nvptx_dev2host): Likewise. (nvptx_wait_async): Likewise. (nvptx_wait_all_async): Likewise. (GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter, pass to event_add() call. * oacc-host.c (host_openacc_register_async_cleanup): Add 'int async' parameter. * oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to call openacc.register_async_cleanup_func() hook. * oacc-parallel.c (GOACC_parallel_keyed): Likewise. * target.c (gomp_copy_from_async): Delete function. (gomp_map_vars): Remove async_refcount. (gomp_unmap_vars): Likewise. (gomp_load_image_to_device): Likewise. (omp_target_associate_ptr): Likewise. * libgomp.h (struct splay_tree_key_s): Remove async_refcount. (acc_dispatch_t.register_async_cleanup_func): Add int parameter. (gomp_copy_from_async): Remove. From-SVN: r236772
This commit is contained in:
parent
e79136e41a
commit
b4557008c4
@ -1,3 +1,35 @@
|
||||
2016-05-26 Chung-Lin Tang <cltang@codesourcery.com>
|
||||
|
||||
* oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.
|
||||
* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async'
|
||||
parameter, use to set async stream around call to gomp_unmap_vars,
|
||||
call gomp_unmap_vars() with 'do_copyfrom' set to true.
|
||||
* plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field.
|
||||
(event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP
|
||||
events and call GOMP_PLUGIN_async_unmap_vars() for each of them.
|
||||
(event_add): Add int parameter, initialize 'val' field when
|
||||
adding new ptx_event struct.
|
||||
(nvptx_exec): Adjust event_add() call arguments.
|
||||
(nvptx_host2dev): Likewise.
|
||||
(nvptx_dev2host): Likewise.
|
||||
(nvptx_wait_async): Likewise.
|
||||
(nvptx_wait_all_async): Likewise.
|
||||
(GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter,
|
||||
pass to event_add() call.
|
||||
* oacc-host.c (host_openacc_register_async_cleanup): Add 'int async'
|
||||
parameter.
|
||||
* oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to
|
||||
call openacc.register_async_cleanup_func() hook.
|
||||
* oacc-parallel.c (GOACC_parallel_keyed): Likewise.
|
||||
* target.c (gomp_copy_from_async): Delete function.
|
||||
(gomp_map_vars): Remove async_refcount.
|
||||
(gomp_unmap_vars): Likewise.
|
||||
(gomp_load_image_to_device): Likewise.
|
||||
(omp_target_associate_ptr): Likewise.
|
||||
* libgomp.h (struct splay_tree_key_s): Remove async_refcount.
|
||||
(acc_dispatch_t.register_async_cleanup_func): Add int parameter.
|
||||
(gomp_copy_from_async): Remove.
|
||||
|
||||
2016-05-26 Chung-Lin Tang <cltang@codesourcery.com>
|
||||
|
||||
* target.c (gomp_device_copy): New function.
|
||||
|
@ -835,8 +835,6 @@ struct splay_tree_key_s {
|
||||
uintptr_t tgt_offset;
|
||||
/* Reference count. */
|
||||
uintptr_t refcount;
|
||||
/* Asynchronous reference count. */
|
||||
uintptr_t async_refcount;
|
||||
/* Pointer to the original mapping of "omp declare target link" object. */
|
||||
splay_tree_key link_key;
|
||||
};
|
||||
@ -872,7 +870,7 @@ typedef struct acc_dispatch_t
|
||||
unsigned *, void *);
|
||||
|
||||
/* Async cleanup callback registration. */
|
||||
void (*register_async_cleanup_func) (void *);
|
||||
void (*register_async_cleanup_func) (void *, int);
|
||||
|
||||
/* Asynchronous routines. */
|
||||
int (*async_test_func) (int);
|
||||
@ -977,7 +975,6 @@ extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
|
||||
size_t, void **, void **,
|
||||
size_t *, void *, bool,
|
||||
enum gomp_map_vars_kind);
|
||||
extern void gomp_copy_from_async (struct target_mem_desc *);
|
||||
extern void gomp_unmap_vars (struct target_mem_desc *, bool);
|
||||
extern void gomp_init_device (struct gomp_device_descr *);
|
||||
extern void gomp_free_memmap (struct splay_tree_s *);
|
||||
|
@ -148,7 +148,8 @@ host_openacc_exec (void (*fn) (void *),
|
||||
}
|
||||
|
||||
static void
|
||||
host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)))
|
||||
host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
|
||||
int async __attribute__ ((unused)))
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -704,10 +704,7 @@ gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
|
||||
if (async < acc_async_noval)
|
||||
gomp_unmap_vars (t, true);
|
||||
else
|
||||
{
|
||||
gomp_copy_from_async (t);
|
||||
acc_dev->openacc.register_async_cleanup_func (t);
|
||||
}
|
||||
t->device_descr->openacc.register_async_cleanup_func (t, async);
|
||||
|
||||
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
|
||||
}
|
||||
|
@ -186,10 +186,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
|
||||
if (async < acc_async_noval)
|
||||
gomp_unmap_vars (tgt, true);
|
||||
else
|
||||
{
|
||||
gomp_copy_from_async (tgt);
|
||||
acc_dev->openacc.register_async_cleanup_func (tgt);
|
||||
}
|
||||
tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
|
||||
|
||||
acc_dev->openacc.async_set_async_func (acc_async_sync);
|
||||
}
|
||||
|
@ -31,11 +31,14 @@
|
||||
#include "oacc-int.h"
|
||||
|
||||
void
|
||||
GOMP_PLUGIN_async_unmap_vars (void *ptr)
|
||||
GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
|
||||
{
|
||||
struct target_mem_desc *tgt = ptr;
|
||||
struct gomp_device_descr *devicep = tgt->device_descr;
|
||||
|
||||
gomp_unmap_vars (tgt, false);
|
||||
devicep->openacc.async_set_async_func (async);
|
||||
gomp_unmap_vars (tgt, true);
|
||||
devicep->openacc.async_set_async_func (acc_async_sync);
|
||||
}
|
||||
|
||||
/* Return the target-specific part of the TLS data for the current thread. */
|
||||
|
@ -27,7 +27,7 @@
|
||||
#ifndef OACC_PLUGIN_H
|
||||
#define OACC_PLUGIN_H 1
|
||||
|
||||
extern void GOMP_PLUGIN_async_unmap_vars (void *);
|
||||
extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
|
||||
extern void *GOMP_PLUGIN_acc_thread (void);
|
||||
|
||||
#endif
|
||||
|
@ -329,6 +329,7 @@ struct ptx_event
|
||||
int type;
|
||||
void *addr;
|
||||
int ord;
|
||||
int val;
|
||||
|
||||
struct ptx_event *next;
|
||||
};
|
||||
@ -789,6 +790,7 @@ static void
|
||||
event_gc (bool memmap_lockable)
|
||||
{
|
||||
struct ptx_event *ptx_event = ptx_events;
|
||||
struct ptx_event *async_cleanups = NULL;
|
||||
struct nvptx_thread *nvthd = nvptx_thread ();
|
||||
|
||||
pthread_mutex_lock (&ptx_event_lock);
|
||||
@ -806,6 +808,7 @@ event_gc (bool memmap_lockable)
|
||||
r = cuEventQuery (*e->evt);
|
||||
if (r == CUDA_SUCCESS)
|
||||
{
|
||||
bool append_async = false;
|
||||
CUevent *te;
|
||||
|
||||
te = e->evt;
|
||||
@ -830,7 +833,7 @@ event_gc (bool memmap_lockable)
|
||||
if (!memmap_lockable)
|
||||
continue;
|
||||
|
||||
GOMP_PLUGIN_async_unmap_vars (e->addr);
|
||||
append_async = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -838,6 +841,7 @@ event_gc (bool memmap_lockable)
|
||||
cuEventDestroy (*te);
|
||||
free ((void *)te);
|
||||
|
||||
/* Unlink 'e' from ptx_events list. */
|
||||
if (ptx_events == e)
|
||||
ptx_events = ptx_events->next;
|
||||
else
|
||||
@ -848,15 +852,31 @@ event_gc (bool memmap_lockable)
|
||||
e_->next = e_->next->next;
|
||||
}
|
||||
|
||||
free (e);
|
||||
if (append_async)
|
||||
{
|
||||
e->next = async_cleanups;
|
||||
async_cleanups = e;
|
||||
}
|
||||
else
|
||||
free (e);
|
||||
}
|
||||
}
|
||||
|
||||
pthread_mutex_unlock (&ptx_event_lock);
|
||||
|
||||
/* We have to do these here, after ptx_event_lock is released. */
|
||||
while (async_cleanups)
|
||||
{
|
||||
struct ptx_event *e = async_cleanups;
|
||||
async_cleanups = async_cleanups->next;
|
||||
|
||||
GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
|
||||
free (e);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
event_add (enum ptx_event_type type, CUevent *e, void *h)
|
||||
event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
|
||||
{
|
||||
struct ptx_event *ptx_event;
|
||||
struct nvptx_thread *nvthd = nvptx_thread ();
|
||||
@ -869,6 +889,7 @@ event_add (enum ptx_event_type type, CUevent *e, void *h)
|
||||
ptx_event->evt = e;
|
||||
ptx_event->addr = h;
|
||||
ptx_event->ord = nvthd->ptx_dev->ord;
|
||||
ptx_event->val = val;
|
||||
|
||||
pthread_mutex_lock (&ptx_event_lock);
|
||||
|
||||
@ -975,7 +996,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||
|
||||
CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
|
||||
|
||||
event_add (PTX_EVT_KNL, e, (void *)dev_str);
|
||||
event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
|
||||
}
|
||||
#else
|
||||
r = cuCtxSynchronize ();
|
||||
@ -1071,7 +1092,7 @@ nvptx_host2dev (void *d, const void *h, size_t s)
|
||||
CUDA_CALL (cuMemcpyHtoDAsync,
|
||||
(CUdeviceptr) d, h, s, nvthd->current_stream->stream);
|
||||
CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
|
||||
event_add (PTX_EVT_MEM, e, (void *)h);
|
||||
event_add (PTX_EVT_MEM, e, (void *)h, 0);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@ -1127,7 +1148,7 @@ nvptx_dev2host (void *h, const void *d, size_t s)
|
||||
CUDA_CALL (cuMemcpyDtoHAsync,
|
||||
h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
|
||||
CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
|
||||
event_add (PTX_EVT_MEM, e, (void *)h);
|
||||
event_add (PTX_EVT_MEM, e, (void *)h, 0);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@ -1240,7 +1261,7 @@ nvptx_wait_async (int async1, int async2)
|
||||
|
||||
CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
|
||||
|
||||
event_add (PTX_EVT_SYNC, e, NULL);
|
||||
event_add (PTX_EVT_SYNC, e, NULL, 0);
|
||||
|
||||
CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
|
||||
}
|
||||
@ -1313,7 +1334,7 @@ nvptx_wait_all_async (int async)
|
||||
/* Record an event on the waited-for stream. */
|
||||
CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
|
||||
|
||||
event_add (PTX_EVT_SYNC, e, NULL);
|
||||
event_add (PTX_EVT_SYNC, e, NULL, 0);
|
||||
|
||||
CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
|
||||
}
|
||||
@ -1646,14 +1667,14 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
|
||||
}
|
||||
|
||||
void
|
||||
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
|
||||
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
|
||||
{
|
||||
struct nvptx_thread *nvthd = nvptx_thread ();
|
||||
CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
|
||||
|
||||
CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
|
||||
CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
|
||||
event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
|
||||
event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -707,7 +707,6 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
|
||||
tgt->list[i].offset = 0;
|
||||
tgt->list[i].length = k->host_end - k->host_start;
|
||||
k->refcount = 1;
|
||||
k->async_refcount = 0;
|
||||
tgt->refcount++;
|
||||
array->left = NULL;
|
||||
array->right = NULL;
|
||||
@ -854,43 +853,9 @@ gomp_unmap_tgt (struct target_mem_desc *tgt)
|
||||
free (tgt);
|
||||
}
|
||||
|
||||
/* Decrease the refcount for a set of mapped variables, and queue asynchronous
|
||||
copies from the device back to the host after any work that has been issued.
|
||||
Because the regions are still "live", increment an asynchronous reference
|
||||
count to indicate that they should not be unmapped from host-side data
|
||||
structures until the asynchronous copy has completed. */
|
||||
|
||||
attribute_hidden void
|
||||
gomp_copy_from_async (struct target_mem_desc *tgt)
|
||||
{
|
||||
struct gomp_device_descr *devicep = tgt->device_descr;
|
||||
size_t i;
|
||||
|
||||
gomp_mutex_lock (&devicep->lock);
|
||||
|
||||
for (i = 0; i < tgt->list_count; i++)
|
||||
if (tgt->list[i].key == NULL)
|
||||
;
|
||||
else if (tgt->list[i].key->refcount > 1)
|
||||
{
|
||||
tgt->list[i].key->refcount--;
|
||||
tgt->list[i].key->async_refcount++;
|
||||
}
|
||||
else
|
||||
{
|
||||
splay_tree_key k = tgt->list[i].key;
|
||||
if (tgt->list[i].copy_from)
|
||||
gomp_copy_dev2host (devicep, (void *) k->host_start,
|
||||
(void *) (k->tgt->tgt_start + k->tgt_offset),
|
||||
k->host_end - k->host_start);
|
||||
}
|
||||
|
||||
gomp_mutex_unlock (&devicep->lock);
|
||||
}
|
||||
|
||||
/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
|
||||
variables back from device to host: if it is false, it is assumed that this
|
||||
has been done already, i.e. by gomp_copy_from_async above. */
|
||||
has been done already. */
|
||||
|
||||
attribute_hidden void
|
||||
gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
|
||||
@ -924,13 +889,8 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
|
||||
k->refcount--;
|
||||
else if (k->refcount == 1)
|
||||
{
|
||||
if (k->async_refcount > 0)
|
||||
k->async_refcount--;
|
||||
else
|
||||
{
|
||||
k->refcount--;
|
||||
do_unmap = true;
|
||||
}
|
||||
k->refcount--;
|
||||
do_unmap = true;
|
||||
}
|
||||
|
||||
if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
|
||||
@ -1076,7 +1036,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
|
||||
k->tgt = tgt;
|
||||
k->tgt_offset = target_table[i].start;
|
||||
k->refcount = REFCOUNT_INFINITY;
|
||||
k->async_refcount = 0;
|
||||
k->link_key = NULL;
|
||||
array->left = NULL;
|
||||
array->right = NULL;
|
||||
@ -1109,7 +1068,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
|
||||
k->tgt = tgt;
|
||||
k->tgt_offset = target_var->start;
|
||||
k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY;
|
||||
k->async_refcount = 0;
|
||||
k->link_key = NULL;
|
||||
array->left = NULL;
|
||||
array->right = NULL;
|
||||
@ -2332,7 +2290,6 @@ omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
|
||||
k->tgt = tgt;
|
||||
k->tgt_offset = (uintptr_t) device_ptr + device_offset;
|
||||
k->refcount = REFCOUNT_INFINITY;
|
||||
k->async_refcount = 0;
|
||||
array->left = NULL;
|
||||
array->right = NULL;
|
||||
splay_tree_insert (&devicep->mem_map, array);
|
||||
|
Loading…
Reference in New Issue
Block a user