oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.

2016-05-26  Chung-Lin Tang  <cltang@codesourcery.com>

	libgomp/
	* oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.
	* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async'
	parameter, use to set async stream around call to gomp_unmap_vars,
	call gomp_unmap_vars() with 'do_copyfrom' set to true.
	* plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field.
	(event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP
	events and call GOMP_PLUGIN_async_unmap_vars() for each of them.
	(event_add): Add int parameter, initialize 'val' field when
	adding new ptx_event struct.
	(nvptx_evec): Adjust event_add() call arguments.
	(nvptx_host2dev): Likewise.
	(nvptx_dev2host): Likewise.
	(nvptx_wait_async): Likewise.
	(nvptx_wait_all_async): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter,
	pass to event_add() call.
	* oacc-host.c (host_openacc_register_async_cleanup): Add 'int async'
	parameter.
	* oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to
	call openacc.register_async_cleanup_func() hook.
	* oacc-parallel.c (GOACC_parallel_keyed): Likewise.
	* target.c (gomp_copy_from_async): Delete function.
	(gomp_map_vars): Remove async_refcount.
	(gomp_unmap_vars): Likewise.
	(gomp_load_image_to_device): Likewise.
	(omp_target_associate_ptr): Likewise.
	* libgomp.h (struct splay_tree_key_s): Remove async_refcount.
	(acc_dispatch_t.register_async_cleanup_func): Add int parameter.
	(gomp_copy_from_async): Remove.

From-SVN: r236772
This commit is contained in:
Chung-Lin Tang 2016-05-26 13:28:25 +00:00 committed by Chung-Lin Tang
parent e79136e41a
commit b4557008c4
9 changed files with 77 additions and 72 deletions

View File

@ -1,3 +1,35 @@
2016-05-26 Chung-Lin Tang <cltang@codesourcery.com>
* oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.
* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async'
parameter, use to set async stream around call to gomp_unmap_vars,
call gomp_unmap_vars() with 'do_copyfrom' set to true.
* plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field.
(event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP
events and call GOMP_PLUGIN_async_unmap_vars() for each of them.
(event_add): Add int parameter, initialize 'val' field when
adding new ptx_event struct.
(nvptx_evec): Adjust event_add() call arguments.
(nvptx_host2dev): Likewise.
(nvptx_dev2host): Likewise.
(nvptx_wait_async): Likewise.
(nvptx_wait_all_async): Likewise.
(GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter,
pass to event_add() call.
* oacc-host.c (host_openacc_register_async_cleanup): Add 'int async'
parameter.
* oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to
call openacc.register_async_cleanup_func() hook.
* oacc-parallel.c (GOACC_parallel_keyed): Likewise.
* target.c (gomp_copy_from_async): Delete function.
(gomp_map_vars): Remove async_refcount.
(gomp_unmap_vars): Likewise.
(gomp_load_image_to_device): Likewise.
(omp_target_associate_ptr): Likewise.
* libgomp.h (struct splay_tree_key_s): Remove async_refcount.
(acc_dispatch_t.register_async_cleanup_func): Add int parameter.
(gomp_copy_from_async): Remove.
2016-05-26 Chung-Lin Tang <cltang@codesourcery.com>
* target.c (gomp_device_copy): New function.

View File

@ -835,8 +835,6 @@ struct splay_tree_key_s {
uintptr_t tgt_offset;
/* Reference count. */
uintptr_t refcount;
/* Asynchronous reference count. */
uintptr_t async_refcount;
/* Pointer to the original mapping of "omp declare target link" object. */
splay_tree_key link_key;
};
@ -872,7 +870,7 @@ typedef struct acc_dispatch_t
unsigned *, void *);
/* Async cleanup callback registration. */
void (*register_async_cleanup_func) (void *);
void (*register_async_cleanup_func) (void *, int);
/* Asynchronous routines. */
int (*async_test_func) (int);
@ -977,7 +975,6 @@ extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
size_t, void **, void **,
size_t *, void *, bool,
enum gomp_map_vars_kind);
extern void gomp_copy_from_async (struct target_mem_desc *);
extern void gomp_unmap_vars (struct target_mem_desc *, bool);
extern void gomp_init_device (struct gomp_device_descr *);
extern void gomp_free_memmap (struct splay_tree_s *);

View File

@ -148,7 +148,8 @@ host_openacc_exec (void (*fn) (void *),
}
static void
host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)))
host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
int async __attribute__ ((unused)))
{
}

View File

@ -704,10 +704,7 @@ gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
if (async < acc_async_noval)
gomp_unmap_vars (t, true);
else
{
gomp_copy_from_async (t);
acc_dev->openacc.register_async_cleanup_func (t);
}
t->device_descr->openacc.register_async_cleanup_func (t, async);
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
}

View File

@ -186,10 +186,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
if (async < acc_async_noval)
gomp_unmap_vars (tgt, true);
else
{
gomp_copy_from_async (tgt);
acc_dev->openacc.register_async_cleanup_func (tgt);
}
tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
acc_dev->openacc.async_set_async_func (acc_async_sync);
}

View File

@ -31,11 +31,14 @@
#include "oacc-int.h"
void
GOMP_PLUGIN_async_unmap_vars (void *ptr)
GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
{
struct target_mem_desc *tgt = ptr;
struct gomp_device_descr *devicep = tgt->device_descr;
gomp_unmap_vars (tgt, false);
devicep->openacc.async_set_async_func (async);
gomp_unmap_vars (tgt, true);
devicep->openacc.async_set_async_func (acc_async_sync);
}
/* Return the target-specific part of the TLS data for the current thread. */

View File

@ -27,7 +27,7 @@
#ifndef OACC_PLUGIN_H
#define OACC_PLUGIN_H 1
extern void GOMP_PLUGIN_async_unmap_vars (void *);
extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
extern void *GOMP_PLUGIN_acc_thread (void);
#endif

View File

@ -329,6 +329,7 @@ struct ptx_event
int type;
void *addr;
int ord;
int val;
struct ptx_event *next;
};
@ -789,6 +790,7 @@ static void
event_gc (bool memmap_lockable)
{
struct ptx_event *ptx_event = ptx_events;
struct ptx_event *async_cleanups = NULL;
struct nvptx_thread *nvthd = nvptx_thread ();
pthread_mutex_lock (&ptx_event_lock);
@ -806,6 +808,7 @@ event_gc (bool memmap_lockable)
r = cuEventQuery (*e->evt);
if (r == CUDA_SUCCESS)
{
bool append_async = false;
CUevent *te;
te = e->evt;
@ -830,7 +833,7 @@ event_gc (bool memmap_lockable)
if (!memmap_lockable)
continue;
GOMP_PLUGIN_async_unmap_vars (e->addr);
append_async = true;
}
break;
}
@ -838,6 +841,7 @@ event_gc (bool memmap_lockable)
cuEventDestroy (*te);
free ((void *)te);
/* Unlink 'e' from ptx_events list. */
if (ptx_events == e)
ptx_events = ptx_events->next;
else
@ -848,15 +852,31 @@ event_gc (bool memmap_lockable)
e_->next = e_->next->next;
}
free (e);
if (append_async)
{
e->next = async_cleanups;
async_cleanups = e;
}
else
free (e);
}
}
pthread_mutex_unlock (&ptx_event_lock);
/* We have to do these here, after ptx_event_lock is released. */
while (async_cleanups)
{
struct ptx_event *e = async_cleanups;
async_cleanups = async_cleanups->next;
GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
free (e);
}
}
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
{
struct ptx_event *ptx_event;
struct nvptx_thread *nvthd = nvptx_thread ();
@ -869,6 +889,7 @@ event_add (enum ptx_event_type type, CUevent *e, void *h)
ptx_event->evt = e;
ptx_event->addr = h;
ptx_event->ord = nvthd->ptx_dev->ord;
ptx_event->val = val;
pthread_mutex_lock (&ptx_event_lock);
@ -975,7 +996,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
event_add (PTX_EVT_KNL, e, (void *)dev_str);
event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
}
#else
r = cuCtxSynchronize ();
@ -1071,7 +1092,7 @@ nvptx_host2dev (void *d, const void *h, size_t s)
CUDA_CALL (cuMemcpyHtoDAsync,
(CUdeviceptr) d, h, s, nvthd->current_stream->stream);
CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
event_add (PTX_EVT_MEM, e, (void *)h);
event_add (PTX_EVT_MEM, e, (void *)h, 0);
}
else
#endif
@ -1127,7 +1148,7 @@ nvptx_dev2host (void *h, const void *d, size_t s)
CUDA_CALL (cuMemcpyDtoHAsync,
h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
event_add (PTX_EVT_MEM, e, (void *)h);
event_add (PTX_EVT_MEM, e, (void *)h, 0);
}
else
#endif
@ -1240,7 +1261,7 @@ nvptx_wait_async (int async1, int async2)
CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
event_add (PTX_EVT_SYNC, e, NULL);
event_add (PTX_EVT_SYNC, e, NULL, 0);
CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
}
@ -1313,7 +1334,7 @@ nvptx_wait_all_async (int async)
/* Record an event on the waited-for stream. */
CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
event_add (PTX_EVT_SYNC, e, NULL);
event_add (PTX_EVT_SYNC, e, NULL, 0);
CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
}
@ -1646,14 +1667,14 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
}
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
struct nvptx_thread *nvthd = nvptx_thread ();
CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}
int

View File

@ -707,7 +707,6 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt->list[i].offset = 0;
tgt->list[i].length = k->host_end - k->host_start;
k->refcount = 1;
k->async_refcount = 0;
tgt->refcount++;
array->left = NULL;
array->right = NULL;
@ -854,43 +853,9 @@ gomp_unmap_tgt (struct target_mem_desc *tgt)
free (tgt);
}
/* Decrease the refcount for a set of mapped variables, and queue asychronous
copies from the device back to the host after any work that has been issued.
Because the regions are still "live", increment an asynchronous reference
count to indicate that they should not be unmapped from host-side data
structures until the asynchronous copy has completed. */
attribute_hidden void
gomp_copy_from_async (struct target_mem_desc *tgt)
{
struct gomp_device_descr *devicep = tgt->device_descr;
size_t i;
gomp_mutex_lock (&devicep->lock);
for (i = 0; i < tgt->list_count; i++)
if (tgt->list[i].key == NULL)
;
else if (tgt->list[i].key->refcount > 1)
{
tgt->list[i].key->refcount--;
tgt->list[i].key->async_refcount++;
}
else
{
splay_tree_key k = tgt->list[i].key;
if (tgt->list[i].copy_from)
gomp_copy_dev2host (devicep, (void *) k->host_start,
(void *) (k->tgt->tgt_start + k->tgt_offset),
k->host_end - k->host_start);
}
gomp_mutex_unlock (&devicep->lock);
}
/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
variables back from device to host: if it is false, it is assumed that this
has been done already, i.e. by gomp_copy_from_async above. */
has been done already. */
attribute_hidden void
gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
@ -924,13 +889,8 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
k->refcount--;
else if (k->refcount == 1)
{
if (k->async_refcount > 0)
k->async_refcount--;
else
{
k->refcount--;
do_unmap = true;
}
k->refcount--;
do_unmap = true;
}
if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
@ -1076,7 +1036,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
k->tgt = tgt;
k->tgt_offset = target_table[i].start;
k->refcount = REFCOUNT_INFINITY;
k->async_refcount = 0;
k->link_key = NULL;
array->left = NULL;
array->right = NULL;
@ -1109,7 +1068,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
k->tgt = tgt;
k->tgt_offset = target_var->start;
k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY;
k->async_refcount = 0;
k->link_key = NULL;
array->left = NULL;
array->right = NULL;
@ -2332,7 +2290,6 @@ omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
k->tgt = tgt;
k->tgt_offset = (uintptr_t) device_ptr + device_offset;
k->refcount = REFCOUNT_INFINITY;
k->async_refcount = 0;
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);