2019-05-13 Chung-Lin Tang <cltang@codesourcery.com>

Reviewed-by: Thomas Schwinge <thomas@codesourcery.com>

	libgomp/
	* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
	(struct goacc_asyncqueue_list): Likewise.
	(goacc_aq): Likewise.
	(goacc_aq_list): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
	(GOMP_OFFLOAD_openacc_async_test): Remove.
	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
	(GOMP_OFFLOAD_openacc_exec): Adjust declaration.
	(GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise.
	(GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise.
	(GOMP_OFFLOAD_openacc_async_exec): Declare.
	(GOMP_OFFLOAD_openacc_async_construct): Declare.
	(GOMP_OFFLOAD_openacc_async_destruct): Declare.
	(GOMP_OFFLOAD_openacc_async_test): Declare.
	(GOMP_OFFLOAD_openacc_async_synchronize): Declare.
	(GOMP_OFFLOAD_openacc_async_serialize): Declare.
	(GOMP_OFFLOAD_openacc_async_queue_callback): Declare.
	(GOMP_OFFLOAD_openacc_async_host2dev): Declare.
	(GOMP_OFFLOAD_openacc_async_dev2host): Declare.

	* libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct.
	(gomp_acc_insert_pointer): Adjust declaration.
	(gomp_copy_host2dev): New declaration.
	(gomp_copy_dev2host): Likewise.
	(gomp_map_vars_async): Likewise.
	(gomp_unmap_tgt): Likewise.
	(gomp_unmap_vars_async): Likewise.
	(gomp_fini_device): Likewise.

	* oacc-async.c (get_goacc_thread): New function.
	(get_goacc_thread_device): New function.
	(lookup_goacc_asyncqueue): New function.
	(get_goacc_asyncqueue): New function.
	(acc_async_test): Adjust code to use new async design.
	(acc_async_test_all): Likewise.
	(acc_wait): Likewise.
	(acc_wait_async): Likewise.
	(acc_wait_all): Likewise.
	(acc_wait_all_async): Likewise.
	(goacc_async_free): New function.
	(goacc_init_asyncqueues): Likewise.
	(goacc_fini_asyncqueues): Likewise.
	* oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async
	design.
	(acc_set_cuda_stream): Likewise.
	* oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'.
	(host_openacc_register_async_cleanup): Remove.
	(host_openacc_async_exec): New function.
	(host_openacc_async_test): Adjust parameters.
	(host_openacc_async_test_all): Remove.
	(host_openacc_async_wait): Remove.
	(host_openacc_async_wait_async): Remove.
	(host_openacc_async_wait_all): Remove.
	(host_openacc_async_wait_all_async): Remove.
	(host_openacc_async_set_async): Remove.
	(host_openacc_async_synchronize): New function.
	(host_openacc_async_serialize): New function.
	(host_openacc_async_host2dev): New function.
	(host_openacc_async_dev2host): New function.
	(host_openacc_async_queue_callback): New function.
	(host_openacc_async_construct): New function.
	(host_openacc_async_destruct): New function.
	(struct gomp_device_descr host_dispatch): Remove initialization of old
	interface, add initialization of new async sub-struct.
	* oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device.
	(goacc_attach_host_thread_to_device): Remove old async code usage.
	* oacc-int.h (goacc_init_asyncqueues): New declaration.
	(goacc_fini_asyncqueues): Likewise.
	(goacc_async_copyout_unmap_vars): Likewise.
	(goacc_async_free): Likewise.
	(get_goacc_asyncqueue): Likewise.
	(lookup_goacc_asyncqueue): Likewise.

	* oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async
	design.
	(present_create_copy): Adjust code to use new async design.
	(delete_copyout): Likewise.
	(update_dev_host): Likewise.
	(gomp_acc_insert_pointer): Add async parameter, adjust code to use new
	async design.
	(gomp_acc_remove_pointer): Adjust code to use new async design.
	* oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async
	design.
	(GOACC_enter_exit_data): Likewise.
	(goacc_wait): Likewise.
	(GOACC_update): Likewise.
	* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to assert fail
	when called, warn as obsolete in comment.

	* target.c (goacc_device_copy_async): New function.
	(gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter,
	add goacc_device_copy_async case.
	(gomp_copy_dev2host): Likewise.
	(gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code.
	(gomp_map_pointer): Likewise.
	(gomp_map_fields_existing): Likewise.
	(gomp_map_vars_internal): New always_inline function, renamed from
	gomp_map_vars.
	(gomp_map_vars): Implement by calling gomp_map_vars_internal.
	(gomp_map_vars_async): Implement by calling gomp_map_vars_internal,
	passing goacc_asyncqueue argument.
	(gomp_unmap_tgt): Remove static, add attribute_hidden.
	(gomp_unref_tgt): New function.
	(gomp_unmap_vars_internal): New always_inline function, renamed from
	gomp_unmap_vars.
	(gomp_unmap_vars): Implement by calling gomp_unmap_vars_internal.
	(gomp_unmap_vars_async): Implement by calling
	gomp_unmap_vars_internal, passing goacc_asyncqueue argument.
	(gomp_fini_device): New function.
	(gomp_exit_data): Adjust gomp_copy_dev2host call.
	(gomp_load_plugin_for_device): Remove old interface, adjust to load
	new async interface.
	(gomp_target_fini): Adjust code to call gomp_fini_device.

	* plugin/plugin-nvptx.c (struct cuda_map): Remove.
	(struct ptx_stream): Remove.
	(struct nvptx_thread): Remove current_stream field.
	(cuda_map_create): Remove.
	(cuda_map_destroy): Remove.
	(map_init): Remove.
	(map_fini): Remove.
	(map_pop): Remove.
	(map_push): Remove.
	(struct goacc_asyncqueue): Define.
	(struct nvptx_callback): Define.
	(struct ptx_free_block): Define.
	(struct ptx_device): Remove null_stream, active_streams, async_streams,
	stream_lock, and next fields.
	(enum ptx_event_type): Remove.
	(struct ptx_event): Remove.
	(ptx_event_lock): Remove.
	(ptx_events): Remove.
	(init_streams_for_device): Remove.
	(fini_streams_for_device): Remove.
	(select_stream_for_async): Remove.
	(nvptx_init): Remove ptx_events and ptx_event_lock references.
	(nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED
	case.
	(nvptx_open_device): Add free_blocks initialization, remove
	init_streams_for_device call.
	(nvptx_close_device): Remove fini_streams_for_device call, add
	free_blocks destruct code.
	(event_gc): Remove.
	(event_add): Remove.
	(nvptx_exec): Adjust parameters and code.
	(nvptx_free): Likewise.
	(nvptx_host2dev): Remove.
	(nvptx_dev2host): Remove.
	(nvptx_set_async): Remove.
	(nvptx_async_test): Remove.
	(nvptx_async_test_all): Remove.
	(nvptx_wait): Remove.
	(nvptx_wait_async): Remove.
	(nvptx_wait_all): Remove.
	(nvptx_wait_all_async): Remove.
	(nvptx_get_cuda_stream): Remove.
	(nvptx_set_cuda_stream): Remove.
	(GOMP_OFFLOAD_alloc): Adjust code.
	(GOMP_OFFLOAD_free): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
	(GOMP_OFFLOAD_openacc_exec): Adjust parameters and code.
	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
	(cuda_free_argmem): New function.
	(GOMP_OFFLOAD_openacc_async_exec): New plugin hook function.
	(GOMP_OFFLOAD_openacc_create_thread_data): Adjust code.
	(GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code.
	(GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code.
	(GOMP_OFFLOAD_openacc_async_construct): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_test): Remove and re-implement.
	(GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function.
	(cuda_callback_wrapper): New function.
	(cuda_memcpy_sanity_check): New function.
	(GOMP_OFFLOAD_host2dev): Remove and re-implement.
	(GOMP_OFFLOAD_dev2host): Remove and re-implement.
	(GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function.

From-SVN: r271128
This commit is contained in:
Chung-Lin Tang 2019-05-13 13:32:00 +00:00 committed by Chung-Lin Tang
parent f78f5d2392
commit 1f4c5b9bb2
15 changed files with 1117 additions and 1280 deletions

View File

@ -1,3 +1,193 @@
2019-05-13 Chung-Lin Tang <cltang@codesourcery.com>
* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
(struct goacc_asyncqueue_list): Likewise.
(goacc_aq): Likewise.
(goacc_aq_list): Likewise.
(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
(GOMP_OFFLOAD_openacc_async_test): Remove.
(GOMP_OFFLOAD_openacc_async_test_all): Remove.
(GOMP_OFFLOAD_openacc_async_wait): Remove.
(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
(GOMP_OFFLOAD_openacc_async_set_async): Remove.
(GOMP_OFFLOAD_openacc_exec): Adjust declaration.
(GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise.
(GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise.
(GOMP_OFFLOAD_openacc_async_exec): Declare.
(GOMP_OFFLOAD_openacc_async_construct): Declare.
(GOMP_OFFLOAD_openacc_async_destruct): Declare.
(GOMP_OFFLOAD_openacc_async_test): Declare.
(GOMP_OFFLOAD_openacc_async_synchronize): Declare.
(GOMP_OFFLOAD_openacc_async_serialize): Declare.
(GOMP_OFFLOAD_openacc_async_queue_callback): Declare.
(GOMP_OFFLOAD_openacc_async_host2dev): Declare.
(GOMP_OFFLOAD_openacc_async_dev2host): Declare.
* libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct.
(gomp_acc_insert_pointer): Adjust declaration.
(gomp_copy_host2dev): New declaration.
(gomp_copy_dev2host): Likewise.
(gomp_map_vars_async): Likewise.
(gomp_unmap_tgt): Likewise.
(gomp_unmap_vars_async): Likewise.
(gomp_fini_device): Likewise.
* oacc-async.c (get_goacc_thread): New function.
(get_goacc_thread_device): New function.
(lookup_goacc_asyncqueue): New function.
(get_goacc_asyncqueue): New function.
(acc_async_test): Adjust code to use new async design.
(acc_async_test_all): Likewise.
(acc_wait): Likewise.
(acc_wait_async): Likewise.
(acc_wait_all): Likewise.
(acc_wait_all_async): Likewise.
(goacc_async_free): New function.
(goacc_init_asyncqueues): Likewise.
(goacc_fini_asyncqueues): Likewise.
* oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async
design.
(acc_set_cuda_stream): Likewise.
* oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'.
(host_openacc_register_async_cleanup): Remove.
(host_openacc_async_exec): New function.
(host_openacc_async_test): Adjust parameters.
(host_openacc_async_test_all): Remove.
(host_openacc_async_wait): Remove.
(host_openacc_async_wait_async): Remove.
(host_openacc_async_wait_all): Remove.
(host_openacc_async_wait_all_async): Remove.
(host_openacc_async_set_async): Remove.
(host_openacc_async_synchronize): New function.
(host_openacc_async_serialize): New function.
(host_openacc_async_host2dev): New function.
(host_openacc_async_dev2host): New function.
(host_openacc_async_queue_callback): New function.
(host_openacc_async_construct): New function.
(host_openacc_async_destruct): New function.
(struct gomp_device_descr host_dispatch): Remove initialization of old
interface, add initialization of new async sub-struct.
* oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device.
(goacc_attach_host_thread_to_device): Remove old async code usage.
* oacc-int.h (goacc_init_asyncqueues): New declaration.
(goacc_fini_asyncqueues): Likewise.
(goacc_async_copyout_unmap_vars): Likewise.
(goacc_async_free): Likewise.
(get_goacc_asyncqueue): Likewise.
(lookup_goacc_asyncqueue): Likewise.
* oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async
design.
(present_create_copy): Adjust code to use new async design.
(delete_copyout): Likewise.
(update_dev_host): Likewise.
(gomp_acc_insert_pointer): Add async parameter, adjust code to use new
async design.
(gomp_acc_remove_pointer): Adjust code to use new async design.
* oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async
design.
(GOACC_enter_exit_data): Likewise.
(goacc_wait): Likewise.
(GOACC_update): Likewise.
* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to assert fail
when called, warn as obsolete in comment.
* target.c (goacc_device_copy_async): New function.
(gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter,
add goacc_device_copy_async case.
(gomp_copy_dev2host): Likewise.
(gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code.
(gomp_map_pointer): Likewise.
(gomp_map_fields_existing): Likewise.
(gomp_map_vars_internal): New always_inline function, renamed from
gomp_map_vars.
(gomp_map_vars): Implement by calling gomp_map_vars_internal.
(gomp_map_vars_async): Implement by calling gomp_map_vars_internal,
passing goacc_asyncqueue argument.
(gomp_unmap_tgt): Remove static, add attribute_hidden.
(gomp_unref_tgt): New function.
(gomp_unmap_vars_internal): New always_inline function, renamed from
gomp_unmap_vars.
(gomp_unmap_vars): Implement by calling gomp_unmap_vars_internal.
(gomp_unmap_vars_async): Implement by calling
gomp_unmap_vars_internal, passing goacc_asyncqueue argument.
(gomp_fini_device): New function.
(gomp_exit_data): Adjust gomp_copy_dev2host call.
(gomp_load_plugin_for_device): Remove old interface, adjust to load
new async interface.
(gomp_target_fini): Adjust code to call gomp_fini_device.
* plugin/plugin-nvptx.c (struct cuda_map): Remove.
(struct ptx_stream): Remove.
(struct nvptx_thread): Remove current_stream field.
(cuda_map_create): Remove.
(cuda_map_destroy): Remove.
(map_init): Remove.
(map_fini): Remove.
(map_pop): Remove.
(map_push): Remove.
(struct goacc_asyncqueue): Define.
(struct nvptx_callback): Define.
(struct ptx_free_block): Define.
(struct ptx_device): Remove null_stream, active_streams, async_streams,
stream_lock, and next fields.
(enum ptx_event_type): Remove.
(struct ptx_event): Remove.
(ptx_event_lock): Remove.
(ptx_events): Remove.
(init_streams_for_device): Remove.
(fini_streams_for_device): Remove.
(select_stream_for_async): Remove.
(nvptx_init): Remove ptx_events and ptx_event_lock references.
(nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED
case.
(nvptx_open_device): Add free_blocks initialization, remove
init_streams_for_device call.
(nvptx_close_device): Remove fini_streams_for_device call, add
free_blocks destruct code.
(event_gc): Remove.
(event_add): Remove.
(nvptx_exec): Adjust parameters and code.
(nvptx_free): Likewise.
(nvptx_host2dev): Remove.
(nvptx_dev2host): Remove.
(nvptx_set_async): Remove.
(nvptx_async_test): Remove.
(nvptx_async_test_all): Remove.
(nvptx_wait): Remove.
(nvptx_wait_async): Remove.
(nvptx_wait_all): Remove.
(nvptx_wait_all_async): Remove.
(nvptx_get_cuda_stream): Remove.
(nvptx_set_cuda_stream): Remove.
(GOMP_OFFLOAD_alloc): Adjust code.
(GOMP_OFFLOAD_free): Likewise.
(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
(GOMP_OFFLOAD_openacc_exec): Adjust parameters and code.
(GOMP_OFFLOAD_openacc_async_test_all): Remove.
(GOMP_OFFLOAD_openacc_async_wait): Remove.
(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
(GOMP_OFFLOAD_openacc_async_set_async): Remove.
(cuda_free_argmem): New function.
(GOMP_OFFLOAD_openacc_async_exec): New plugin hook function.
(GOMP_OFFLOAD_openacc_create_thread_data): Adjust code.
(GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code.
(GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code.
(GOMP_OFFLOAD_openacc_async_construct): New plugin hook function.
(GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function.
(GOMP_OFFLOAD_openacc_async_test): Remove and re-implement.
(GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function.
(GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function.
(GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function.
(cuda_callback_wrapper): New function.
(cuda_memcpy_sanity_check): New function.
(GOMP_OFFLOAD_host2dev): Remove and re-implement.
(GOMP_OFFLOAD_dev2host): Remove and re-implement.
(GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function.
(GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function.
2019-05-07 Thomas Schwinge <thomas@codesourcery.com> 2019-05-07 Thomas Schwinge <thomas@codesourcery.com>
PR target/87835 PR target/87835

View File

@ -53,6 +53,20 @@ enum offload_target_type
OFFLOAD_TARGET_TYPE_HSA = 7 OFFLOAD_TARGET_TYPE_HSA = 7
}; };
/* Opaque type to represent plugin-dependent implementation of an
OpenACC asynchronous queue. */
struct goacc_asyncqueue;
/* Used to keep a list of active asynchronous queues.  Each node is one
   element of a singly-linked list.  */
struct goacc_asyncqueue_list
{
/* The asynchronous queue this list node refers to.  */
struct goacc_asyncqueue *aq;
/* The following list node.  */
struct goacc_asyncqueue_list *next;
};
typedef struct goacc_asyncqueue *goacc_aq;
typedef struct goacc_asyncqueue_list *goacc_aq_list;
/* Auxiliary struct, used for transferring pairs of addresses from plugin /* Auxiliary struct, used for transferring pairs of addresses from plugin
to libgomp. */ to libgomp. */
struct addr_pair struct addr_pair
@ -93,22 +107,31 @@ extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_can_run (void *); extern bool GOMP_OFFLOAD_can_run (void *);
extern void GOMP_OFFLOAD_run (int, void *, void *, void **); extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *); extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **, extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
void **, int, unsigned *, void *); void **, unsigned *, void *);
extern void GOMP_OFFLOAD_openacc_register_async_cleanup (void *, int);
extern int GOMP_OFFLOAD_openacc_async_test (int);
extern int GOMP_OFFLOAD_openacc_async_test_all (void);
extern void GOMP_OFFLOAD_openacc_async_wait (int);
extern void GOMP_OFFLOAD_openacc_async_wait_async (int, int);
extern void GOMP_OFFLOAD_openacc_async_wait_all (void);
extern void GOMP_OFFLOAD_openacc_async_wait_all_async (int);
extern void GOMP_OFFLOAD_openacc_async_set_async (int);
extern void *GOMP_OFFLOAD_openacc_create_thread_data (int); extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *); extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
struct goacc_asyncqueue *);
extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *,
void (*)(void *), void *);
extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **,
void **, unsigned *, void *,
struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
struct goacc_asyncqueue *);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (int); extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
extern int GOMP_OFFLOAD_openacc_cuda_set_stream (int, void *); extern int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *,
void *);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -949,24 +949,31 @@ typedef struct acc_dispatch_t
/* Execute. */ /* Execute. */
__typeof (GOMP_OFFLOAD_openacc_exec) *exec_func; __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
/* Async cleanup callback registration. */
__typeof (GOMP_OFFLOAD_openacc_register_async_cleanup)
*register_async_cleanup_func;
/* Asynchronous routines. */
__typeof (GOMP_OFFLOAD_openacc_async_test) *async_test_func;
__typeof (GOMP_OFFLOAD_openacc_async_test_all) *async_test_all_func;
__typeof (GOMP_OFFLOAD_openacc_async_wait) *async_wait_func;
__typeof (GOMP_OFFLOAD_openacc_async_wait_async) *async_wait_async_func;
__typeof (GOMP_OFFLOAD_openacc_async_wait_all) *async_wait_all_func;
__typeof (GOMP_OFFLOAD_openacc_async_wait_all_async)
*async_wait_all_async_func;
__typeof (GOMP_OFFLOAD_openacc_async_set_async) *async_set_async_func;
/* Create/destroy TLS data. */ /* Create/destroy TLS data. */
__typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func; __typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func;
__typeof (GOMP_OFFLOAD_openacc_destroy_thread_data) __typeof (GOMP_OFFLOAD_openacc_destroy_thread_data)
*destroy_thread_data_func; *destroy_thread_data_func;
struct {
/* Once created and put into the "active" list, asyncqueues are then never
destructed and removed from the "active" list, other than if the TODO
device is shut down. */
gomp_mutex_t lock;
int nasyncqueue;
struct goacc_asyncqueue **asyncqueue;
struct goacc_asyncqueue_list *active;
__typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
__typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
__typeof (GOMP_OFFLOAD_openacc_async_test) *test_func;
__typeof (GOMP_OFFLOAD_openacc_async_synchronize) *synchronize_func;
__typeof (GOMP_OFFLOAD_openacc_async_serialize) *serialize_func;
__typeof (GOMP_OFFLOAD_openacc_async_queue_callback) *queue_callback_func;
__typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
__typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
} async;
/* NVIDIA target specific routines. */ /* NVIDIA target specific routines. */
struct { struct {
@ -1053,17 +1060,33 @@ enum gomp_map_vars_kind
GOMP_MAP_VARS_ENTER_DATA GOMP_MAP_VARS_ENTER_DATA
}; };
extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *); extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int);
extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int); extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *, extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
unsigned short *); unsigned short *);
struct gomp_coalesce_buf;
extern void gomp_copy_host2dev (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
size_t, struct gomp_coalesce_buf *);
extern void gomp_copy_dev2host (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
size_t);
extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
size_t, void **, void **, size_t, void **, void **,
size_t *, void *, bool, size_t *, void *, bool,
enum gomp_map_vars_kind); enum gomp_map_vars_kind);
extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *,
struct goacc_asyncqueue *,
size_t, void **, void **,
size_t *, void *, bool,
enum gomp_map_vars_kind);
extern void gomp_unmap_tgt (struct target_mem_desc *);
extern void gomp_unmap_vars (struct target_mem_desc *, bool); extern void gomp_unmap_vars (struct target_mem_desc *, bool);
extern void gomp_unmap_vars_async (struct target_mem_desc *, bool,
struct goacc_asyncqueue *);
extern void gomp_init_device (struct gomp_device_descr *); extern void gomp_init_device (struct gomp_device_descr *);
extern bool gomp_fini_device (struct gomp_device_descr *);
extern void gomp_free_memmap (struct splay_tree_s *); extern void gomp_free_memmap (struct splay_tree_s *);
extern void gomp_unload_device (struct gomp_device_descr *); extern void gomp_unload_device (struct gomp_device_descr *);
extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key); extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);

View File

@ -27,47 +27,160 @@
<http://www.gnu.org/licenses/>. */ <http://www.gnu.org/licenses/>. */
#include <assert.h> #include <assert.h>
#include <string.h>
#include "openacc.h" #include "openacc.h"
#include "libgomp.h" #include "libgomp.h"
#include "oacc-int.h" #include "oacc-int.h"
int static struct goacc_thread *
acc_async_test (int async) get_goacc_thread (void)
{ {
if (!async_valid_p (async))
gomp_fatal ("invalid async argument: %d", async);
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev) if (!thr || !thr->dev)
gomp_fatal ("no device active"); gomp_fatal ("no device active");
return thr->dev->openacc.async_test_func (async); return thr;
}
/* Return the device descriptor attached to the calling thread's OpenACC
   thread data.  Reports "no device active" through gomp_fatal when there is
   no thread data or no device attached to it.  */

static struct gomp_device_descr *
get_goacc_thread_device (void)
{
  struct goacc_thread *self = goacc_thread ();

  if (self == NULL || self->dev == NULL)
    gomp_fatal ("no device active");

  return self->dev;
}
/* Translate the OpenACC async-argument ASYNC into an internal queue index:
   -1 for acc_async_sync, 0 for acc_async_noval, and ASYNC + 1 for any
   non-negative ASYNC.  An argument rejected by async_valid_p is reported
   through gomp_fatal.  */

static int
validate_async_val (int async)
{
  if (!async_valid_p (async))
    gomp_fatal ("invalid async-argument: %d", async);

  if (async == acc_async_sync)
    return -1;
  else if (async == acc_async_noval)
    return 0;
  else if (async < 0)
    /* Any other negative value is expected to have been rejected above.  */
    __builtin_unreachable ();

  /* TODO: we reserve 0 for acc_async_noval before we can clarify the
     semantics of "default_async".
     NOTE(review): ASYNC + 1 is plain int arithmetic, so ASYNC == INT_MAX
     would overflow here -- confirm whether callers can pass that.  */
  return async + 1;
}
/* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This
might return NULL if no asyncqueue is to be used. Otherwise, if CREATE,
create the asyncqueue if it doesn't exist yet. */
attribute_hidden struct goacc_asyncqueue *
lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
{
async = validate_async_val (async);
if (async < 0)
return NULL;
struct goacc_asyncqueue *ret_aq = NULL;
struct gomp_device_descr *dev = thr->dev;
gomp_mutex_lock (&dev->openacc.async.lock);
if (!create
&& (async >= dev->openacc.async.nasyncqueue
|| !dev->openacc.async.asyncqueue[async]))
goto end;
if (async >= dev->openacc.async.nasyncqueue)
{
int diff = async + 1 - dev->openacc.async.nasyncqueue;
dev->openacc.async.asyncqueue
= gomp_realloc (dev->openacc.async.asyncqueue,
sizeof (goacc_aq) * (async + 1));
memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
0, sizeof (goacc_aq) * diff);
dev->openacc.async.nasyncqueue = async + 1;
}
if (!dev->openacc.async.asyncqueue[async])
{
dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
if (!dev->openacc.async.asyncqueue[async])
{
gomp_mutex_unlock (&dev->openacc.async.lock);
gomp_fatal ("async %d creation failed", async);
}
/* Link new async queue into active list. */
goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
n->aq = dev->openacc.async.asyncqueue[async];
n->next = dev->openacc.async.active;
dev->openacc.async.active = n;
}
ret_aq = dev->openacc.async.asyncqueue[async];
end:
gomp_mutex_unlock (&dev->openacc.async.lock);
return ret_aq;
}
/* Convenience wrapper around lookup_goacc_asyncqueue for the calling thread:
   fetch the thread data with get_goacc_thread and look up the asyncqueue for
   async-argument ASYNC with creation enabled.  The result may still be NULL
   if no asyncqueue is to be used for ASYNC.  */

attribute_hidden struct goacc_asyncqueue *
get_goacc_asyncqueue (int async)
{
  return lookup_goacc_asyncqueue (get_goacc_thread (), true, async);
}
int
acc_async_test (int async)
{
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
gomp_fatal ("no device active");
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (!aq)
return 1;
else
return thr->dev->openacc.async.test_func (aq);
} }
int int
acc_async_test_all (void) acc_async_test_all (void)
{ {
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = get_goacc_thread ();
if (!thr || !thr->dev) int ret = 1;
gomp_fatal ("no device active"); gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
return thr->dev->openacc.async_test_all_func (); if (!thr->dev->openacc.async.test_func (l->aq))
{
ret = 0;
break;
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
return ret;
} }
void void
acc_wait (int async) acc_wait (int async)
{ {
if (!async_valid_p (async)) struct goacc_thread *thr = get_goacc_thread ();
gomp_fatal ("invalid async argument: %d", async);
struct goacc_thread *thr = goacc_thread (); goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq && !thr->dev->openacc.async.synchronize_func (aq))
if (!thr || !thr->dev) gomp_fatal ("wait on %d failed", async);
gomp_fatal ("no device active");
thr->dev->openacc.async_wait_func (async);
} }
/* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */ /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */
@ -84,23 +197,46 @@ acc_async_wait (int async)
void void
acc_wait_async (int async1, int async2) acc_wait_async (int async1, int async2)
{ {
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = get_goacc_thread ();
if (!thr || !thr->dev) goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1);
gomp_fatal ("no device active"); /* TODO: Is this also correct for acc_async_sync, assuming that in this case,
we'll always be synchronous anyways? */
if (!aq1)
return;
thr->dev->openacc.async_wait_async_func (async1, async2); goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
/* An async queue is always synchronized with itself. */
if (aq1 == aq2)
return;
if (aq2)
{
if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);
}
else
{
/* TODO: Local thread synchronization.
Necessary for the "async2 == acc_async_sync" case, or can just skip? */
if (!thr->dev->openacc.async.synchronize_func (aq1))
gomp_fatal ("wait on %d failed", async1);
}
} }
void void
acc_wait_all (void) acc_wait_all (void)
{ {
struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *dev = get_goacc_thread_device ();
if (!thr || !thr->dev) bool ret = true;
gomp_fatal ("no device active"); gomp_mutex_lock (&dev->openacc.async.lock);
for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
ret &= dev->openacc.async.synchronize_func (l->aq);
gomp_mutex_unlock (&dev->openacc.async.lock);
thr->dev->openacc.async_wait_all_func (); if (!ret)
gomp_fatal ("wait all failed");
} }
/* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. */ /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. */
@ -117,13 +253,73 @@ acc_async_wait_all (void)
void void
acc_wait_all_async (int async) acc_wait_all_async (int async)
{ {
if (!async_valid_p (async)) struct goacc_thread *thr = get_goacc_thread ();
gomp_fatal ("invalid async argument: %d", async);
struct goacc_thread *thr = goacc_thread (); goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
if (!thr || !thr->dev) bool ret = true;
gomp_fatal ("no device active"); gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
{
if (waiting_queue)
ret &= thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
else
/* TODO: Local thread synchronization.
Necessary for the "async2 == acc_async_sync" case, or can just skip? */
ret &= thr->dev->openacc.async.synchronize_func (l->aq);
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
thr->dev->openacc.async_wait_all_async_func (async); if (!ret)
gomp_fatal ("wait all async(%d) failed", async);
}
/* Release PTR with free.  When AQ is NULL the call to free happens right
   here; otherwise free and PTR are handed to DEVICEP's queue_callback_func
   for AQ instead of being called directly.  */

attribute_hidden void
goacc_async_free (struct gomp_device_descr *devicep,
                  struct goacc_asyncqueue *aq, void *ptr)
{
  if (aq)
    {
      devicep->openacc.async.queue_callback_func (aq, free, ptr);
      return;
    }

  free (ptr);
}
/* Set up the asyncqueue bookkeeping of DEVICEP: initialize the async lock
   and mark both the queue table and the list of active queues as empty.
   TODO: DEVICEP must be locked on entry, and remains locked on return.  */

attribute_hidden void
goacc_init_asyncqueues (struct gomp_device_descr *devicep)
{
  gomp_mutex_init (&devicep->openacc.async.lock);

  /* Start out with no queue table and no active queues.  */
  devicep->openacc.async.active = NULL;
  devicep->openacc.async.asyncqueue = NULL;
  devicep->openacc.async.nasyncqueue = 0;
}
/* This function finalizes the asyncqueues for the device specified by DEVICEP.
While holding the async lock, and only if the queue count is non-zero, it
calls the destruct_func hook on every entry of the 'active' list, frees the
list nodes and the 'asyncqueue' table, and resets the bookkeeping fields.
The async lock is then released and destroyed.
Returns true only if every destruct_func call returned true.
TODO DEVICEP must be locked on entry, and remains locked on return. */
attribute_hidden bool
goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
{
bool ret = true;
gomp_mutex_lock (&devicep->openacc.async.lock);
if (devicep->openacc.async.nasyncqueue > 0)
{
goacc_aq_list next;
for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
{
/* Accumulate failures instead of stopping, so every queue is destructed. */
ret &= devicep->openacc.async.destruct_func (l->aq);
/* Fetch the successor before the node itself is freed. */
next = l->next;
free (l);
}
free (devicep->openacc.async.asyncqueue);
/* Leave the fields in the same empty state goacc_init_asyncqueues sets. */
devicep->openacc.async.nasyncqueue = 0;
devicep->openacc.async.asyncqueue = NULL;
devicep->openacc.async.active = NULL;
}
gomp_mutex_unlock (&devicep->openacc.async.lock);
/* The lock is destroyed here; it is created in goacc_init_asyncqueues. */
gomp_mutex_destroy (&devicep->openacc.async.lock);
return ret;
} }

View File

@ -30,6 +30,7 @@
#include "config.h" #include "config.h"
#include "libgomp.h" #include "libgomp.h"
#include "oacc-int.h" #include "oacc-int.h"
#include <assert.h>
void * void *
acc_get_current_cuda_device (void) acc_get_current_cuda_device (void)
@ -62,7 +63,11 @@ acc_get_cuda_stream (int async)
return NULL; return NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
return thr->dev->openacc.cuda.get_stream_func (async); {
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq)
return thr->dev->openacc.cuda.get_stream_func (aq);
}
return NULL; return NULL;
} }
@ -79,8 +84,23 @@ acc_set_cuda_stream (int async, void *stream)
thr = goacc_thread (); thr = goacc_thread ();
int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func) if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
return thr->dev->openacc.cuda.set_stream_func (async, stream); {
goacc_aq aq = get_goacc_asyncqueue (async);
/* Due to not using an asyncqueue for "acc_async_sync", this cannot be
used to change the CUDA stream associated with "acc_async_sync". */
if (!aq)
{
assert (async == acc_async_sync);
gomp_debug (0, "Refusing request to set CUDA stream associated"
" with \"acc_async_sync\"\n");
return 0;
}
gomp_mutex_lock (&thr->dev->openacc.async.lock);
ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
}
return -1; return ret;
} }

View File

@ -140,55 +140,89 @@ host_openacc_exec (void (*fn) (void *),
size_t mapnum __attribute__ ((unused)), size_t mapnum __attribute__ ((unused)),
void **hostaddrs, void **hostaddrs,
void **devaddrs __attribute__ ((unused)), void **devaddrs __attribute__ ((unused)),
int async __attribute__ ((unused)), unsigned *dims __attribute__ ((unused)),
unsigned *dims __attribute ((unused)),
void *targ_mem_desc __attribute__ ((unused))) void *targ_mem_desc __attribute__ ((unused)))
{ {
fn (hostaddrs); fn (hostaddrs);
} }
static void static void
host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)), host_openacc_async_exec (void (*fn) (void *),
int async __attribute__ ((unused))) size_t mapnum __attribute__ ((unused)),
void **hostaddrs,
void **devaddrs __attribute__ ((unused)),
unsigned *dims __attribute__ ((unused)),
void *targ_mem_desc __attribute__ ((unused)),
struct goacc_asyncqueue *aq __attribute__ ((unused)))
{ {
fn (hostaddrs);
} }
static int static int
host_openacc_async_test (int async __attribute__ ((unused))) host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused)))
{ {
return 1; return 1;
} }
static int static bool
host_openacc_async_test_all (void) host_openacc_async_synchronize (struct goacc_asyncqueue *aq
__attribute__ ((unused)))
{ {
return 1; return true;
}
/* Host-fallback implementation of the asyncqueue 'serialize' hook (installed
   as '.serialize_func' in host_dispatch's 'async' group).  Both queue
   arguments are ignored; the function only reports success.  */

static bool
host_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                              struct goacc_asyncqueue *aq2)
{
  /* Neither queue is inspected.  */
  (void) aq1;
  (void) aq2;

  return true;
}
/* Host-fallback implementation of the asynchronous host-to-device copy hook
   (installed as '.host2dev_func' in host_dispatch's 'async' group).  Every
   argument is ignored and no data is moved; the function only reports
   success.  */

static bool
host_openacc_async_host2dev (int ord, void *dst, const void *src, size_t n,
                             struct goacc_asyncqueue *aq)
{
  /* Nothing is copied, so none of the arguments are used.  */
  (void) ord;
  (void) dst;
  (void) src;
  (void) n;
  (void) aq;

  return true;
}
/* Host-fallback implementation of the asynchronous device-to-host copy hook
(installed as '.dev2host_func' in host_dispatch's 'async' group). All
arguments are unused and no data is copied; it always returns true. */
static bool
host_openacc_async_dev2host (int ord __attribute__ ((unused)),
void *dst __attribute__ ((unused)),
const void *src __attribute__ ((unused)),
size_t n __attribute__ ((unused)),
struct goacc_asyncqueue *aq
__attribute__ ((unused)))
{
return true;
} }
static void static void
host_openacc_async_wait (int async __attribute__ ((unused))) host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
__attribute__ ((unused)),
void (*callback_fn)(void *)
__attribute__ ((unused)),
void *userptr __attribute__ ((unused)))
{ {
} }
static void static struct goacc_asyncqueue *
host_openacc_async_wait_async (int async1 __attribute__ ((unused)), host_openacc_async_construct (void)
int async2 __attribute__ ((unused)))
{ {
/* Non-NULL 0xffff... value as opaque dummy. */
return (struct goacc_asyncqueue *) -1;
} }
static void static bool
host_openacc_async_wait_all (void) host_openacc_async_destruct (struct goacc_asyncqueue *aq
{ __attribute__ ((unused)))
}
static void
host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
{
}
static void
host_openacc_async_set_async (int async __attribute__ ((unused)))
{ {
return true;
} }
static void * static void *
@ -235,19 +269,21 @@ static struct gomp_device_descr host_dispatch =
.exec_func = host_openacc_exec, .exec_func = host_openacc_exec,
.register_async_cleanup_func = host_openacc_register_async_cleanup,
.async_test_func = host_openacc_async_test,
.async_test_all_func = host_openacc_async_test_all,
.async_wait_func = host_openacc_async_wait,
.async_wait_async_func = host_openacc_async_wait_async,
.async_wait_all_func = host_openacc_async_wait_all,
.async_wait_all_async_func = host_openacc_async_wait_all_async,
.async_set_async_func = host_openacc_async_set_async,
.create_thread_data_func = host_openacc_create_thread_data, .create_thread_data_func = host_openacc_create_thread_data,
.destroy_thread_data_func = host_openacc_destroy_thread_data, .destroy_thread_data_func = host_openacc_destroy_thread_data,
.async = {
.construct_func = host_openacc_async_construct,
.destruct_func = host_openacc_async_destruct,
.test_func = host_openacc_async_test,
.synchronize_func = host_openacc_async_synchronize,
.serialize_func = host_openacc_async_serialize,
.queue_callback_func = host_openacc_async_queue_callback,
.exec_func = host_openacc_async_exec,
.dev2host_func = host_openacc_async_dev2host,
.host2dev_func = host_openacc_async_host2dev,
},
.cuda = { .cuda = {
.get_current_device_func = NULL, .get_current_device_func = NULL,
.get_current_context_func = NULL, .get_current_context_func = NULL,

View File

@ -309,7 +309,7 @@ acc_shutdown_1 (acc_device_t d)
if (acc_dev->state == GOMP_DEVICE_INITIALIZED) if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
{ {
devices_active = true; devices_active = true;
ret &= acc_dev->fini_device_func (acc_dev->target_id); ret &= gomp_fini_device (acc_dev);
acc_dev->state = GOMP_DEVICE_UNINITIALIZED; acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
} }
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
@ -426,8 +426,6 @@ goacc_attach_host_thread_to_device (int ord)
thr->target_tls thr->target_tls
= acc_dev->openacc.create_thread_data_func (ord); = acc_dev->openacc.create_thread_data_func (ord);
acc_dev->openacc.async_set_async_func (acc_async_sync);
} }
/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of

View File

@ -99,6 +99,13 @@ void goacc_restore_bind (void);
void goacc_lazy_initialize (void); void goacc_lazy_initialize (void);
void goacc_host_init (void); void goacc_host_init (void);
void goacc_init_asyncqueues (struct gomp_device_descr *);
bool goacc_fini_asyncqueues (struct gomp_device_descr *);
void goacc_async_free (struct gomp_device_descr *, struct goacc_asyncqueue *,
void *);
struct goacc_asyncqueue *get_goacc_asyncqueue (int);
struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool,
int);
static inline bool static inline bool
async_valid_stream_id_p (int async) async_valid_stream_id_p (int async)
{ {

View File

@ -172,18 +172,11 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
return; return;
} }
if (async > acc_async_sync) goacc_aq aq = get_goacc_asyncqueue (async);
thr->dev->openacc.async_set_async_func (async); if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s);
bool ret = (from else
? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
: thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
if (async > acc_async_sync)
thr->dev->openacc.async_set_async_func (acc_async_sync);
if (!ret)
gomp_fatal ("error in %s", libfnname);
} }
void void
@ -509,17 +502,13 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
if (async > acc_async_sync) goacc_aq aq = get_goacc_asyncqueue (async);
acc_dev->openacc.async_set_async_func (async);
tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s,
GOMP_MAP_VARS_OPENACC); &kinds, true, GOMP_MAP_VARS_OPENACC);
/* Initialize dynamic refcount. */ /* Initialize dynamic refcount. */
tgt->list[0].key->dynamic_refcount = 1; tgt->list[0].key->dynamic_refcount = 1;
if (async > acc_async_sync)
acc_dev->openacc.async_set_async_func (acc_async_sync);
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
d = tgt->to_free; d = tgt->to_free;
@ -676,13 +665,9 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (f & FLAG_COPYOUT) if (f & FLAG_COPYOUT)
{ {
if (async > acc_async_sync) goacc_aq aq = get_goacc_asyncqueue (async);
acc_dev->openacc.async_set_async_func (async); gomp_copy_dev2host (acc_dev, aq, h, d, s);
acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
if (async > acc_async_sync)
acc_dev->openacc.async_set_async_func (acc_async_sync);
} }
gomp_remove_var (acc_dev, n); gomp_remove_var (acc_dev, n);
} }
@ -765,16 +750,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
d = (void *) (n->tgt->tgt_start + n->tgt_offset d = (void *) (n->tgt->tgt_start + n->tgt_offset
+ (uintptr_t) h - n->host_start); + (uintptr_t) h - n->host_start);
if (async > acc_async_sync) goacc_aq aq = get_goacc_asyncqueue (async);
acc_dev->openacc.async_set_async_func (async);
if (is_dev) if (is_dev)
acc_dev->host2dev_func (acc_dev->target_id, d, h, s); gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
else else
acc_dev->dev2host_func (acc_dev->target_id, h, d, s); gomp_copy_dev2host (acc_dev, aq, h, d, s);
if (async > acc_async_sync)
acc_dev->openacc.async_set_async_func (acc_async_sync);
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
} }
@ -805,7 +786,7 @@ acc_update_self_async (void *h, size_t s, int async)
void void
gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
void *kinds) void *kinds, int async)
{ {
struct target_mem_desc *tgt; struct target_mem_desc *tgt;
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = goacc_thread ();
@ -835,8 +816,9 @@ gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
} }
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__); gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, goacc_aq aq = get_goacc_asyncqueue (async);
NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs,
NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__); gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
/* Initialize dynamic refcount. */ /* Initialize dynamic refcount. */
@ -930,7 +912,10 @@ gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async,
if (async < acc_async_noval) if (async < acc_async_noval)
gomp_unmap_vars (t, true); gomp_unmap_vars (t, true);
else else
t->device_descr->openacc.register_async_cleanup_func (t, async); {
goacc_aq aq = get_goacc_asyncqueue (async);
gomp_unmap_vars_async (t, true, aq);
}
} }
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);

View File

@ -217,8 +217,6 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
} }
va_end (ap); va_end (ap);
acc_dev->openacc.async_set_async_func (async);
if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC)) if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
{ {
k.host_start = (uintptr_t) fn; k.host_start = (uintptr_t) fn;
@ -235,44 +233,29 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
else else
tgt_fn = (void (*)) fn; tgt_fn = (void (*)) fn;
tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true, goacc_aq aq = get_goacc_asyncqueue (async);
GOMP_MAP_VARS_OPENACC);
tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
true, GOMP_MAP_VARS_OPENACC);
devaddrs = gomp_alloca (sizeof (void *) * mapnum); devaddrs = gomp_alloca (sizeof (void *) * mapnum);
for (i = 0; i < mapnum; i++) for (i = 0; i < mapnum; i++)
devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
+ tgt->list[i].key->tgt_offset + tgt->list[i].key->tgt_offset
+ tgt->list[i].offset); + tgt->list[i].offset);
if (aq == NULL)
acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, {
async, dims, tgt); acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
dims, tgt);
/* If running synchronously, unmap immediately. */ /* If running synchronously, unmap immediately. */
bool copyfrom = true; gomp_unmap_vars (tgt, true);
if (async_synchronous_p (async)) }
gomp_unmap_vars (tgt, true);
else else
{ {
bool async_unmap = false; acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
for (size_t i = 0; i < tgt->list_count; i++) dims, tgt, aq);
{ gomp_unmap_vars_async (tgt, true, aq);
splay_tree_key k = tgt->list[i].key;
if (k && k->refcount == 1)
{
async_unmap = true;
break;
}
}
if (async_unmap)
tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
else
{
copyfrom = false;
gomp_unmap_vars (tgt, copyfrom);
}
} }
acc_dev->openacc.async_set_async_func (acc_async_sync);
} }
/* Legacy entry point (GCC 5). Only provide host fallback execution. */ /* Legacy entry point (GCC 5). Only provide host fallback execution. */
@ -383,8 +366,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
finalize = true; finalize = true;
} }
acc_dev->openacc.async_set_async_func (async);
/* Determine if this is an "acc enter data". */ /* Determine if this is an "acc enter data". */
for (i = 0; i < mapnum; ++i) for (i = 0; i < mapnum; ++i)
{ {
@ -437,11 +418,11 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
{ {
case GOMP_MAP_ALLOC: case GOMP_MAP_ALLOC:
case GOMP_MAP_FORCE_ALLOC: case GOMP_MAP_FORCE_ALLOC:
acc_create (hostaddrs[i], sizes[i]); acc_create_async (hostaddrs[i], sizes[i], async);
break; break;
case GOMP_MAP_TO: case GOMP_MAP_TO:
case GOMP_MAP_FORCE_TO: case GOMP_MAP_FORCE_TO:
acc_copyin (hostaddrs[i], sizes[i]); acc_copyin_async (hostaddrs[i], sizes[i], async);
break; break;
default: default:
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@ -452,7 +433,7 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
else else
{ {
gomp_acc_insert_pointer (pointer, &hostaddrs[i], gomp_acc_insert_pointer (pointer, &hostaddrs[i],
&sizes[i], &kinds[i]); &sizes[i], &kinds[i], async);
/* Increment 'i' by two because OpenACC requires fortran /* Increment 'i' by two because OpenACC requires fortran
arrays to be contiguous, so each PSET is associated with arrays to be contiguous, so each PSET is associated with
one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@ -477,17 +458,17 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
if (acc_is_present (hostaddrs[i], sizes[i])) if (acc_is_present (hostaddrs[i], sizes[i]))
{ {
if (finalize) if (finalize)
acc_delete_finalize (hostaddrs[i], sizes[i]); acc_delete_finalize_async (hostaddrs[i], sizes[i], async);
else else
acc_delete (hostaddrs[i], sizes[i]); acc_delete_async (hostaddrs[i], sizes[i], async);
} }
break; break;
case GOMP_MAP_FROM: case GOMP_MAP_FROM:
case GOMP_MAP_FORCE_FROM: case GOMP_MAP_FORCE_FROM:
if (finalize) if (finalize)
acc_copyout_finalize (hostaddrs[i], sizes[i]); acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
else else
acc_copyout (hostaddrs[i], sizes[i]); acc_copyout_async (hostaddrs[i], sizes[i], async);
break; break;
default: default:
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@ -505,8 +486,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
i += pointer - 1; i += pointer - 1;
} }
} }
acc_dev->openacc.async_set_async_func (acc_async_sync);
} }
static void static void
@ -532,9 +511,10 @@ goacc_wait (int async, int num_waits, va_list *ap)
if (async == acc_async_sync) if (async == acc_async_sync)
acc_wait (qid); acc_wait (qid);
else if (qid == async) else if (qid == async)
;/* If we're waiting on the same asynchronous queue as we're /* If we're waiting on the same asynchronous queue as we're
launching on, the queue itself will order work as launching on, the queue itself will order work as
required, so there's no need to wait explicitly. */ required, so there's no need to wait explicitly. */
;
else else
acc_wait_async (qid, async); acc_wait_async (qid, async);
} }
@ -567,8 +547,6 @@ GOACC_update (int flags_m, size_t mapnum,
va_end (ap); va_end (ap);
} }
acc_dev->openacc.async_set_async_func (async);
bool update_device = false; bool update_device = false;
for (i = 0; i < mapnum; ++i) for (i = 0; i < mapnum; ++i)
{ {
@ -591,6 +569,8 @@ GOACC_update (int flags_m, size_t mapnum,
the value of the allocated device memory in the the value of the allocated device memory in the
previous pointer. */ previous pointer. */
*(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr; *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
/* TODO: verify that we really cannot use acc_update_device_async
here. */
acc_update_device (hostaddrs[i], sizeof (uintptr_t)); acc_update_device (hostaddrs[i], sizeof (uintptr_t));
/* Restore the host pointer. */ /* Restore the host pointer. */
@ -608,7 +588,7 @@ GOACC_update (int flags_m, size_t mapnum,
/* Fallthru */ /* Fallthru */
case GOMP_MAP_FORCE_TO: case GOMP_MAP_FORCE_TO:
update_device = true; update_device = true;
acc_update_device (hostaddrs[i], sizes[i]); acc_update_device_async (hostaddrs[i], sizes[i], async);
break; break;
case GOMP_MAP_FROM: case GOMP_MAP_FROM:
@ -620,7 +600,7 @@ GOACC_update (int flags_m, size_t mapnum,
/* Fallthru */ /* Fallthru */
case GOMP_MAP_FORCE_FROM: case GOMP_MAP_FORCE_FROM:
update_device = false; update_device = false;
acc_update_self (hostaddrs[i], sizes[i]); acc_update_self_async (hostaddrs[i], sizes[i], async);
break; break;
default: default:
@ -628,8 +608,6 @@ GOACC_update (int flags_m, size_t mapnum,
break; break;
} }
} }
acc_dev->openacc.async_set_async_func (acc_async_sync);
} }
void void

View File

@ -30,15 +30,12 @@
#include "oacc-plugin.h" #include "oacc-plugin.h"
#include "oacc-int.h" #include "oacc-int.h"
/* This plugin function is now obsolete. */
void void
GOMP_PLUGIN_async_unmap_vars (void *ptr, int async) GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
int async __attribute__((unused)))
{ {
struct target_mem_desc *tgt = ptr; gomp_fatal ("invalid plugin function");
struct gomp_device_descr *devicep = tgt->device_descr;
devicep->openacc.async_set_async_func (async);
gomp_unmap_vars (tgt, true);
devicep->openacc.async_set_async_func (acc_async_sync);
} }
/* Return the target-specific part of the TLS data for the current thread. */ /* Return the target-specific part of the TLS data for the current thread. */

View File

@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData) CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload) CUDA_ONE_CALL (cuModuleUnload)
CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
CUDA_ONE_CALL (cuStreamAddCallback)
CUDA_ONE_CALL (cuStreamCreate) CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy) CUDA_ONE_CALL (cuStreamDestroy)
CUDA_ONE_CALL (cuStreamQuery) CUDA_ONE_CALL (cuStreamQuery)

View File

@ -54,7 +54,11 @@ typedef enum {
CUDA_ERROR_INVALID_CONTEXT = 201, CUDA_ERROR_INVALID_CONTEXT = 201,
CUDA_ERROR_NOT_FOUND = 500, CUDA_ERROR_NOT_FOUND = 500,
CUDA_ERROR_NOT_READY = 600, CUDA_ERROR_NOT_READY = 600,
CUDA_ERROR_LAUNCH_FAILED = 719 CUDA_ERROR_LAUNCH_FAILED = 719,
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
CUDA_ERROR_NOT_PERMITTED = 800,
CUDA_ERROR_NOT_SUPPORTED = 801,
CUDA_ERROR_UNKNOWN = 999
} CUresult; } CUresult;
typedef enum { typedef enum {
@ -173,6 +177,8 @@ CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule); CUresult cuModuleUnload (CUmodule);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int); CUoccupancyB2DSize, size_t, int);
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
CUresult cuStreamCreate (CUstream *, unsigned); CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2 #define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream); CUresult cuStreamDestroy (CUstream);

File diff suppressed because it is too large Load Diff

View File

@ -177,6 +177,22 @@ gomp_device_copy (struct gomp_device_descr *devicep,
} }
} }
/* Perform one asynchronous copy of SIZE bytes from SRCADDR to DSTADDR by
   calling COPY_FUNC with DEVICEP's target id and the asyncqueue AQ.  DST and
   SRC are only used as labels in the diagnostic.  If COPY_FUNC returns
   false, DEVICEP's lock is released and gomp_fatal is called with the
   address ranges involved.  */

static inline void
goacc_device_copy_async (struct gomp_device_descr *devicep,
                         bool (*copy_func) (int, void *, const void *, size_t,
                                            struct goacc_asyncqueue *),
                         const char *dst, void *dstaddr,
                         const char *src, const void *srcaddr,
                         size_t size, struct goacc_asyncqueue *aq)
{
  if (copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
    return;

  /* The copy failed.  The device lock is dropped before reporting the
     error, the same way gomp_device_copy's callers in this file expect
     (NOTE(review): assumes the caller holds DEVICEP->lock -- confirm).  */
  gomp_mutex_unlock (&devicep->lock);

  /* The end addresses rely on GNU void-pointer arithmetic.  */
  gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
              src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
}
/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses) /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
host to device memory transfers. */ host to device memory transfers. */
@ -269,8 +285,9 @@ gomp_to_device_kind_p (int kind)
} }
} }
static void attribute_hidden void
gomp_copy_host2dev (struct gomp_device_descr *devicep, gomp_copy_host2dev (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
void *d, const void *h, size_t sz, void *d, const void *h, size_t sz,
struct gomp_coalesce_buf *cbuf) struct gomp_coalesce_buf *cbuf)
{ {
@ -299,14 +316,23 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
} }
} }
} }
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz); if (__builtin_expect (aq != NULL, 0))
goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
"dev", d, "host", h, sz, aq);
else
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
} }
static void attribute_hidden void
gomp_copy_dev2host (struct gomp_device_descr *devicep, gomp_copy_dev2host (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
void *h, const void *d, size_t sz) void *h, const void *d, size_t sz)
{ {
gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz); if (__builtin_expect (aq != NULL, 0))
goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
"host", h, "dev", d, sz, aq);
else
gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
} }
static void static void
@ -324,7 +350,8 @@ gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
Helper function of gomp_map_vars. */ Helper function of gomp_map_vars. */
static inline void static inline void
gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn, gomp_map_vars_existing (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq, splay_tree_key oldn,
splay_tree_key newn, struct target_var_desc *tgt_var, splay_tree_key newn, struct target_var_desc *tgt_var,
unsigned char kind, struct gomp_coalesce_buf *cbuf) unsigned char kind, struct gomp_coalesce_buf *cbuf)
{ {
@ -346,7 +373,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
} }
if (GOMP_MAP_ALWAYS_TO_P (kind)) if (GOMP_MAP_ALWAYS_TO_P (kind))
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset (void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+ newn->host_start - oldn->host_start), + newn->host_start - oldn->host_start),
(void *) newn->host_start, (void *) newn->host_start,
@ -364,8 +391,8 @@ get_kind (bool short_mapkind, void *kinds, int idx)
} }
static void static void
gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr, gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
uintptr_t target_offset, uintptr_t bias, uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias,
struct gomp_coalesce_buf *cbuf) struct gomp_coalesce_buf *cbuf)
{ {
struct gomp_device_descr *devicep = tgt->device_descr; struct gomp_device_descr *devicep = tgt->device_descr;
@ -376,7 +403,7 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
if (cur_node.host_start == (uintptr_t) NULL) if (cur_node.host_start == (uintptr_t) NULL)
{ {
cur_node.tgt_offset = (uintptr_t) NULL; cur_node.tgt_offset = (uintptr_t) NULL;
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + target_offset), (void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset, (void *) &cur_node.tgt_offset,
sizeof (void *), cbuf); sizeof (void *), cbuf);
@ -398,12 +425,13 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
array section. Now subtract bias to get what we want array section. Now subtract bias to get what we want
to initialize the pointer with. */ to initialize the pointer with. */
cur_node.tgt_offset -= bias; cur_node.tgt_offset -= bias;
gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset), gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset, sizeof (void *), cbuf); (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
} }
static void static void
gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n, gomp_map_fields_existing (struct target_mem_desc *tgt,
struct goacc_asyncqueue *aq, splay_tree_key n,
size_t first, size_t i, void **hostaddrs, size_t first, size_t i, void **hostaddrs,
size_t *sizes, void *kinds, size_t *sizes, void *kinds,
struct gomp_coalesce_buf *cbuf) struct gomp_coalesce_buf *cbuf)
@ -423,7 +451,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->tgt == n->tgt && n2->tgt == n->tgt
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset) && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{ {
gomp_map_vars_existing (devicep, n2, &cur_node, gomp_map_vars_existing (devicep, aq, n2, &cur_node,
&tgt->list[i], kind & typemask, cbuf); &tgt->list[i], kind & typemask, cbuf);
return; return;
} }
@ -439,8 +467,8 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->host_start - n->host_start && n2->host_start - n->host_start
== n2->tgt_offset - n->tgt_offset) == n2->tgt_offset - n->tgt_offset)
{ {
gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i], gomp_map_vars_existing (devicep, aq, n2, &cur_node,
kind & typemask, cbuf); &tgt->list[i], kind & typemask, cbuf);
return; return;
} }
} }
@ -451,7 +479,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->tgt == n->tgt && n2->tgt == n->tgt
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset) && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{ {
gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i], gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
kind & typemask, cbuf); kind & typemask, cbuf);
return; return;
} }
@ -483,10 +511,12 @@ gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i)
return tgt->tgt_start + tgt->list[i].offset; return tgt->tgt_start + tgt->list[i].offset;
} }
attribute_hidden struct target_mem_desc * static inline __attribute__((always_inline)) struct target_mem_desc *
gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, gomp_map_vars_internal (struct gomp_device_descr *devicep,
void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds, struct goacc_asyncqueue *aq, size_t mapnum,
bool short_mapkind, enum gomp_map_vars_kind pragma_kind) void **hostaddrs, void **devaddrs, size_t *sizes,
void *kinds, bool short_mapkind,
enum gomp_map_vars_kind pragma_kind)
{ {
size_t i, tgt_align, tgt_size, not_found_cnt = 0; size_t i, tgt_align, tgt_size, not_found_cnt = 0;
bool has_firstprivate = false; bool has_firstprivate = false;
@ -600,7 +630,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
continue; continue;
} }
for (i = first; i <= last; i++) for (i = first; i <= last; i++)
gomp_map_fields_existing (tgt, n, first, i, hostaddrs, gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
sizes, kinds, NULL); sizes, kinds, NULL);
i--; i--;
continue; continue;
@ -645,7 +675,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
else else
n = splay_tree_lookup (mem_map, &cur_node); n = splay_tree_lookup (mem_map, &cur_node);
if (n && n->refcount != REFCOUNT_LINK) if (n && n->refcount != REFCOUNT_LINK)
gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i], gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i],
kind & typemask, NULL); kind & typemask, NULL);
else else
{ {
@ -756,7 +786,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt_size = (tgt_size + align - 1) & ~(align - 1); tgt_size = (tgt_size + align - 1) & ~(align - 1);
tgt->list[i].offset = tgt_size; tgt->list[i].offset = tgt_size;
len = sizes[i]; len = sizes[i];
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + tgt_size), (void *) (tgt->tgt_start + tgt_size),
(void *) hostaddrs[i], len, cbufp); (void *) hostaddrs[i], len, cbufp);
tgt_size += len; tgt_size += len;
@ -790,7 +820,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
continue; continue;
} }
for (i = first; i <= last; i++) for (i = first; i <= last; i++)
gomp_map_fields_existing (tgt, n, first, i, hostaddrs, gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
sizes, kinds, cbufp); sizes, kinds, cbufp);
i--; i--;
continue; continue;
@ -810,7 +840,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1); cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
if (cur_node.tgt_offset) if (cur_node.tgt_offset)
cur_node.tgt_offset -= sizes[i]; cur_node.tgt_offset -= sizes[i];
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (n->tgt->tgt_start (void *) (n->tgt->tgt_start
+ n->tgt_offset + n->tgt_offset
+ cur_node.host_start + cur_node.host_start
@ -831,7 +861,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
k->host_end = k->host_start + sizeof (void *); k->host_end = k->host_start + sizeof (void *);
splay_tree_key n = splay_tree_lookup (mem_map, k); splay_tree_key n = splay_tree_lookup (mem_map, k);
if (n && n->refcount != REFCOUNT_LINK) if (n && n->refcount != REFCOUNT_LINK)
gomp_map_vars_existing (devicep, n, k, &tgt->list[i], gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i],
kind & typemask, cbufp); kind & typemask, cbufp);
else else
{ {
@ -884,18 +914,19 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
case GOMP_MAP_FORCE_TOFROM: case GOMP_MAP_FORCE_TOFROM:
case GOMP_MAP_ALWAYS_TO: case GOMP_MAP_ALWAYS_TO:
case GOMP_MAP_ALWAYS_TOFROM: case GOMP_MAP_ALWAYS_TOFROM:
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start (void *) (tgt->tgt_start
+ k->tgt_offset), + k->tgt_offset),
(void *) k->host_start, (void *) k->host_start,
k->host_end - k->host_start, cbufp); k->host_end - k->host_start, cbufp);
break; break;
case GOMP_MAP_POINTER: case GOMP_MAP_POINTER:
gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start, gomp_map_pointer (tgt, aq,
(uintptr_t) *(void **) k->host_start,
k->tgt_offset, sizes[i], cbufp); k->tgt_offset, sizes[i], cbufp);
break; break;
case GOMP_MAP_TO_PSET: case GOMP_MAP_TO_PSET:
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start (void *) (tgt->tgt_start
+ k->tgt_offset), + k->tgt_offset),
(void *) k->host_start, (void *) k->host_start,
@ -917,7 +948,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt->list[j].always_copy_from = false; tgt->list[j].always_copy_from = false;
if (k->refcount != REFCOUNT_INFINITY) if (k->refcount != REFCOUNT_INFINITY)
k->refcount++; k->refcount++;
gomp_map_pointer (tgt, gomp_map_pointer (tgt, aq,
(uintptr_t) *(void **) hostaddrs[j], (uintptr_t) *(void **) hostaddrs[j],
k->tgt_offset k->tgt_offset
+ ((uintptr_t) hostaddrs[j] + ((uintptr_t) hostaddrs[j]
@ -946,7 +977,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
break; break;
case GOMP_MAP_FORCE_DEVICEPTR: case GOMP_MAP_FORCE_DEVICEPTR:
assert (k->host_end - k->host_start == sizeof (void *)); assert (k->host_end - k->host_start == sizeof (void *));
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start (void *) (tgt->tgt_start
+ k->tgt_offset), + k->tgt_offset),
(void *) k->host_start, (void *) k->host_start,
@ -965,7 +996,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset); void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
/* We intentionally do not use coalescing here, as it's not /* We intentionally do not use coalescing here, as it's not
data allocated by the current call to this function. */ data allocated by the current call to this function. */
gomp_copy_host2dev (devicep, (void *) n->tgt_offset, gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
&tgt_addr, sizeof (void *), NULL); &tgt_addr, sizeof (void *), NULL);
} }
array++; array++;
@ -978,7 +1009,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
for (i = 0; i < mapnum; i++) for (i = 0; i < mapnum; i++)
{ {
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i); cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + i * sizeof (void *)), (void *) (tgt->tgt_start + i * sizeof (void *)),
(void *) &cur_node.tgt_offset, sizeof (void *), (void *) &cur_node.tgt_offset, sizeof (void *),
cbufp); cbufp);
@ -989,7 +1020,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
{ {
long c = 0; long c = 0;
for (c = 0; c < cbuf.chunk_cnt; ++c) for (c = 0; c < cbuf.chunk_cnt; ++c)
gomp_copy_host2dev (devicep, gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + cbuf.chunks[c].start), (void *) (tgt->tgt_start + cbuf.chunks[c].start),
(char *) cbuf.buf + (cbuf.chunks[c].start (char *) cbuf.buf + (cbuf.chunks[c].start
- cbuf.chunks[0].start), - cbuf.chunks[0].start),
@ -1012,7 +1043,27 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
return tgt; return tgt;
} }
static void attribute_hidden struct target_mem_desc *
gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
{
/* Forward every argument unchanged to gomp_map_vars_internal and return
   its result.  NULL is passed in the second (queue) argument slot, the
   one gomp_map_vars_async fills with its AQ parameter.  */
return gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs,
sizes, kinds, short_mapkind, pragma_kind);
}
/* Variant of gomp_map_vars that takes an explicit queue AQ.  All arguments,
   AQ included, are handed straight to gomp_map_vars_internal, and whatever
   that function returns is returned to the caller.  */

attribute_hidden struct target_mem_desc *
gomp_map_vars_async (struct gomp_device_descr *devicep,
                     struct goacc_asyncqueue *aq,
                     size_t mapnum,
                     void **hostaddrs,
                     void **devaddrs,
                     size_t *sizes,
                     void *kinds,
                     bool short_mapkind,
                     enum gomp_map_vars_kind pragma_kind)
{
  return gomp_map_vars_internal (devicep, aq, mapnum,
                                 hostaddrs, devaddrs, sizes, kinds,
                                 short_mapkind, pragma_kind);
}
attribute_hidden void
gomp_unmap_tgt (struct target_mem_desc *tgt) gomp_unmap_tgt (struct target_mem_desc *tgt)
{ {
/* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region. */ /* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region. */
@ -1040,12 +1091,24 @@ gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
return is_tgt_unmapped; return is_tgt_unmapped;
} }
/* Release one reference to the target memory descriptor passed as PTR.
   PTR is an untyped pointer so that this function can also be registered
   as a queue callback (see the queue_callback_func call in
   gomp_unmap_vars_internal).  If the reference count is greater than one
   it is decremented; otherwise the descriptor is handed to
   gomp_unmap_tgt.  */

static void
gomp_unref_tgt (void *ptr)
{
  struct target_mem_desc *desc = ptr;

  /* Last reference (or none left): unmap instead of decrementing.  */
  if (desc->refcount <= 1)
    {
      gomp_unmap_tgt (desc);
      return;
    }

  desc->refcount--;
}
/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant /* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
variables back from device to host: if it is false, it is assumed that this variables back from device to host: if it is false, it is assumed that this
has been done already. */ has been done already. */
attribute_hidden void static inline __attribute__((always_inline)) void
gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
struct goacc_asyncqueue *aq)
{ {
struct gomp_device_descr *devicep = tgt->device_descr; struct gomp_device_descr *devicep = tgt->device_descr;
@ -1082,7 +1145,7 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
if ((do_unmap && do_copyfrom && tgt->list[i].copy_from) if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
|| tgt->list[i].always_copy_from) || tgt->list[i].always_copy_from)
gomp_copy_dev2host (devicep, gomp_copy_dev2host (devicep, aq,
(void *) (k->host_start + tgt->list[i].offset), (void *) (k->host_start + tgt->list[i].offset),
(void *) (k->tgt->tgt_start + k->tgt_offset (void *) (k->tgt->tgt_start + k->tgt_offset
+ tgt->list[i].offset), + tgt->list[i].offset),
@ -1091,14 +1154,28 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
gomp_remove_var (devicep, k); gomp_remove_var (devicep, k);
} }
if (tgt->refcount > 1) if (aq)
tgt->refcount--; devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt,
(void *) tgt);
else else
gomp_unmap_tgt (tgt); gomp_unref_tgt ((void *) tgt);
gomp_mutex_unlock (&devicep->lock); gomp_mutex_unlock (&devicep->lock);
} }
/* Unmap the variables described by TGT without an asynchronous queue.
   This simply calls gomp_unmap_vars_internal with TGT and DO_COPYFROM
   unchanged and NULL as the queue argument.  */

attribute_hidden void
gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
{
  gomp_unmap_vars_internal (tgt,
                            do_copyfrom,
                            /* aq = */ NULL);
}
/* Unmap the variables described by TGT using the queue AQ.  TGT,
   DO_COPYFROM and AQ are passed on unchanged to
   gomp_unmap_vars_internal.  */

attribute_hidden void
gomp_unmap_vars_async (struct target_mem_desc *tgt,
                       bool do_copyfrom,
                       struct goacc_asyncqueue *aq)
{
  gomp_unmap_vars_internal (tgt,
                            do_copyfrom,
                            aq);
}
static void static void
gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs, gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
size_t *sizes, void *kinds, bool short_mapkind) size_t *sizes, void *kinds, bool short_mapkind)
@ -1148,9 +1225,10 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
size_t size = cur_node.host_end - cur_node.host_start; size_t size = cur_node.host_end - cur_node.host_start;
if (GOMP_MAP_COPY_TO_P (kind & typemask)) if (GOMP_MAP_COPY_TO_P (kind & typemask))
gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL); gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
NULL);
if (GOMP_MAP_COPY_FROM_P (kind & typemask)) if (GOMP_MAP_COPY_FROM_P (kind & typemask))
gomp_copy_dev2host (devicep, hostaddr, devaddr, size); gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
} }
} }
gomp_mutex_unlock (&devicep->lock); gomp_mutex_unlock (&devicep->lock);
@ -1443,9 +1521,24 @@ gomp_init_device (struct gomp_device_descr *devicep)
false); false);
} }
/* Initialize OpenACC asynchronous queues. */
goacc_init_asyncqueues (devicep);
devicep->state = GOMP_DEVICE_INITIALIZED; devicep->state = GOMP_DEVICE_INITIALIZED;
} }
/* Finalize the target device described by DEVICEP.  The caller must hold
   DEVICEP's lock on entry; it is still held on return.

   goacc_fini_asyncqueues is called first, then the device's
   fini_device_func hook, and the device state is set to
   GOMP_DEVICE_FINALIZED regardless of either result.  Returns the
   combination of both results, i.e. false if either step reported
   failure.  */

attribute_hidden bool
gomp_fini_device (struct gomp_device_descr *devicep)
{
  bool ok = goacc_fini_asyncqueues (devicep);

  /* Deliberately non-short-circuit: the device hook must run even when
     tearing down the queues has already failed.  */
  ok = ok & devicep->fini_device_func (devicep->target_id);

  devicep->state = GOMP_DEVICE_FINALIZED;
  return ok;
}
attribute_hidden void attribute_hidden void
gomp_unload_device (struct gomp_device_descr *devicep) gomp_unload_device (struct gomp_device_descr *devicep)
{ {
@ -1954,7 +2047,7 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
if ((kind == GOMP_MAP_FROM && k->refcount == 0) if ((kind == GOMP_MAP_FROM && k->refcount == 0)
|| kind == GOMP_MAP_ALWAYS_FROM) || kind == GOMP_MAP_ALWAYS_FROM)
gomp_copy_dev2host (devicep, (void *) cur_node.host_start, gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
(void *) (k->tgt->tgt_start + k->tgt_offset (void *) (k->tgt->tgt_start + k->tgt_offset
+ cur_node.host_start + cur_node.host_start
- k->host_start), - k->host_start),
@ -2636,20 +2729,20 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200) if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
{ {
if (!DLSYM_OPT (openacc.exec, openacc_exec) if (!DLSYM_OPT (openacc.exec, openacc_exec)
|| !DLSYM_OPT (openacc.register_async_cleanup,
openacc_register_async_cleanup)
|| !DLSYM_OPT (openacc.async_test, openacc_async_test)
|| !DLSYM_OPT (openacc.async_test_all, openacc_async_test_all)
|| !DLSYM_OPT (openacc.async_wait, openacc_async_wait)
|| !DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async)
|| !DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all)
|| !DLSYM_OPT (openacc.async_wait_all_async,
openacc_async_wait_all_async)
|| !DLSYM_OPT (openacc.async_set_async, openacc_async_set_async)
|| !DLSYM_OPT (openacc.create_thread_data, || !DLSYM_OPT (openacc.create_thread_data,
openacc_create_thread_data) openacc_create_thread_data)
|| !DLSYM_OPT (openacc.destroy_thread_data, || !DLSYM_OPT (openacc.destroy_thread_data,
openacc_destroy_thread_data)) openacc_destroy_thread_data)
|| !DLSYM_OPT (openacc.async.construct, openacc_async_construct)
|| !DLSYM_OPT (openacc.async.destruct, openacc_async_destruct)
|| !DLSYM_OPT (openacc.async.test, openacc_async_test)
|| !DLSYM_OPT (openacc.async.synchronize, openacc_async_synchronize)
|| !DLSYM_OPT (openacc.async.serialize, openacc_async_serialize)
|| !DLSYM_OPT (openacc.async.queue_callback,
openacc_async_queue_callback)
|| !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
|| !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
|| !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev))
{ {
/* Require all the OpenACC handlers if we have /* Require all the OpenACC handlers if we have
GOMP_OFFLOAD_CAP_OPENACC_200. */ GOMP_OFFLOAD_CAP_OPENACC_200. */
@ -2700,10 +2793,7 @@ gomp_target_fini (void)
struct gomp_device_descr *devicep = &devices[i]; struct gomp_device_descr *devicep = &devices[i];
gomp_mutex_lock (&devicep->lock); gomp_mutex_lock (&devicep->lock);
if (devicep->state == GOMP_DEVICE_INITIALIZED) if (devicep->state == GOMP_DEVICE_INITIALIZED)
{ ret = gomp_fini_device (devicep);
ret = devicep->fini_device_func (devicep->target_id);
devicep->state = GOMP_DEVICE_FINALIZED;
}
gomp_mutex_unlock (&devicep->lock); gomp_mutex_unlock (&devicep->lock);
if (!ret) if (!ret)
gomp_fatal ("device finalization failed"); gomp_fatal ("device finalization failed");