Chung-Lin Tang 1f4c5b9bb2 2019-05-13 Chung-Lin Tang <cltang@codesourcery.com>
Reviewed-by: Thomas Schwinge <thomas@codesourcery.com>

	libgomp/
	* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
	(struct goacc_asyncqueue_list): Likewise.
	(goacc_aq): Likewise.
	(goacc_aq_list): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
	(GOMP_OFFLOAD_openacc_async_test): Remove.
	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
	(GOMP_OFFLOAD_openacc_exec): Adjust declaration.
	(GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise.
	(GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise.
	(GOMP_OFFLOAD_openacc_async_exec): Declare.
	(GOMP_OFFLOAD_openacc_async_construct): Declare.
	(GOMP_OFFLOAD_openacc_async_destruct): Declare.
	(GOMP_OFFLOAD_openacc_async_test): Declare.
	(GOMP_OFFLOAD_openacc_async_synchronize): Declare.
	(GOMP_OFFLOAD_openacc_async_serialize): Declare.
	(GOMP_OFFLOAD_openacc_async_queue_callback): Declare.
	(GOMP_OFFLOAD_openacc_async_host2dev): Declare.
	(GOMP_OFFLOAD_openacc_async_dev2host): Declare.

	* libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct.
	(gomp_acc_insert_pointer): Adjust declaration.
	(gomp_copy_host2dev): New declaration.
	(gomp_copy_dev2host): Likewise.
	(gomp_map_vars_async): Likewise.
	(gomp_unmap_tgt): Likewise.
	(gomp_unmap_vars_async): Likewise.
	(gomp_fini_device): Likewise.

	* oacc-async.c (get_goacc_thread): New function.
	(get_goacc_thread_device): New function.
	(lookup_goacc_asyncqueue): New function.
	(get_goacc_asyncqueue): New function.
	(acc_async_test): Adjust code to use new async design.
	(acc_async_test_all): Likewise.
	(acc_wait): Likewise.
	(acc_wait_async): Likewise.
	(acc_wait_all): Likewise.
	(acc_wait_all_async): Likewise.
	(goacc_async_free): New function.
	(goacc_init_asyncqueues): Likewise.
	(goacc_fini_asyncqueues): Likewise.
	* oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async
	design.
	(acc_set_cuda_stream): Likewise.
	* oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'.
	(host_openacc_register_async_cleanup): Remove.
	(host_openacc_async_exec): New function.
	(host_openacc_async_test): Adjust parameters.
	(host_openacc_async_test_all): Remove.
	(host_openacc_async_wait): Remove.
	(host_openacc_async_wait_async): Remove.
	(host_openacc_async_wait_all): Remove.
	(host_openacc_async_wait_all_async): Remove.
	(host_openacc_async_set_async): Remove.
	(host_openacc_async_synchronize): New function.
	(host_openacc_async_serialize): New function.
	(host_openacc_async_host2dev): New function.
	(host_openacc_async_dev2host): New function.
	(host_openacc_async_queue_callback): New function.
	(host_openacc_async_construct): New function.
	(host_openacc_async_destruct): New function.
	(struct gomp_device_descr host_dispatch): Remove initialization of old
	interface, add initialization of new async sub-struct.
	* oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device.
	(goacc_attach_host_thread_to_device): Remove old async code usage.
	* oacc-int.h (goacc_init_asyncqueues): New declaration.
	(goacc_fini_asyncqueues): Likewise.
	(goacc_async_copyout_unmap_vars): Likewise.
	(goacc_async_free): Likewise.
	(get_goacc_asyncqueue): Likewise.
	(lookup_goacc_asyncqueue): Likewise.

	* oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async
	design.
	(present_create_copy): Adjust code to use new async design.
	(delete_copyout): Likewise.
	(update_dev_host): Likewise.
	(gomp_acc_insert_pointer): Add async parameter, adjust code to use new
	async design.
	(gomp_acc_remove_pointer): Adjust code to use new async design.
	* oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async
	design.
	(GOACC_enter_exit_data): Likewise.
	(goacc_wait): Likewise.
	(GOACC_update): Likewise.
	* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to assert fail
	when called, warn as obsolete in comment.

	* target.c (goacc_device_copy_async): New function.
	(gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter,
	add goacc_device_copy_async case.
	(gomp_copy_dev2host): Likewise.
	(gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code.
	(gomp_map_pointer): Likewise.
	(gomp_map_fields_existing): Likewise.
	(gomp_map_vars_internal): New always_inline function, renamed from
	gomp_map_vars.
	(gomp_map_vars): Implement by calling gomp_map_vars_internal.
	(gomp_map_vars_async): Implement by calling gomp_map_vars_internal,
	passing goacc_asyncqueue argument.
	(gomp_unmap_tgt): Remove static, add attribute_hidden.
	(gomp_unref_tgt): New function.
	(gomp_unmap_vars_internal): New always_inline function, renamed from
	gomp_unmap_vars.
	(gomp_unmap_vars): Implement by calling gomp_unmap_vars_internal.
	(gomp_unmap_vars_async): Implement by calling
	gomp_unmap_vars_internal, passing goacc_asyncqueue argument.
	(gomp_fini_device): New function.
	(gomp_exit_data): Adjust gomp_copy_dev2host call.
	(gomp_load_plugin_for_device): Remove old interface, adjust to load
	new async interface.
	(gomp_target_fini): Adjust code to call gomp_fini_device.

	* plugin/plugin-nvptx.c (struct cuda_map): Remove.
	(struct ptx_stream): Remove.
	(struct nvptx_thread): Remove current_stream field.
	(cuda_map_create): Remove.
	(cuda_map_destroy): Remove.
	(map_init): Remove.
	(map_fini): Remove.
	(map_pop): Remove.
	(map_push): Remove.
	(struct goacc_asyncqueue): Define.
	(struct nvptx_callback): Define.
	(struct ptx_free_block): Define.
	(struct ptx_device): Remove null_stream, active_streams, async_streams,
	stream_lock, and next fields.
	(enum ptx_event_type): Remove.
	(struct ptx_event): Remove.
	(ptx_event_lock): Remove.
	(ptx_events): Remove.
	(init_streams_for_device): Remove.
	(fini_streams_for_device): Remove.
	(select_stream_for_async): Remove.
	(nvptx_init): Remove ptx_events and ptx_event_lock references.
	(nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED
	case.
	(nvptx_open_device): Add free_blocks initialization, remove
	init_streams_for_device call.
	(nvptx_close_device): Remove fini_streams_for_device call, add
	free_blocks destruct code.
	(event_gc): Remove.
	(event_add): Remove.
	(nvptx_exec): Adjust parameters and code.
	(nvptx_free): Likewise.
	(nvptx_host2dev): Remove.
	(nvptx_dev2host): Remove.
	(nvptx_set_async): Remove.
	(nvptx_async_test): Remove.
	(nvptx_async_test_all): Remove.
	(nvptx_wait): Remove.
	(nvptx_wait_async): Remove.
	(nvptx_wait_all): Remove.
	(nvptx_wait_all_async): Remove.
	(nvptx_get_cuda_stream): Remove.
	(nvptx_set_cuda_stream): Remove.
	(GOMP_OFFLOAD_alloc): Adjust code.
	(GOMP_OFFLOAD_free): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
	(GOMP_OFFLOAD_openacc_exec): Adjust parameters and code.
	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
	(cuda_free_argmem): New function.
	(GOMP_OFFLOAD_openacc_async_exec): New plugin hook function.
	(GOMP_OFFLOAD_openacc_create_thread_data): Adjust code.
	(GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code.
	(GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code.
	(GOMP_OFFLOAD_openacc_async_construct): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_test): Remove and re-implement.
	(GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function.
	(cuda_callback_wrapper): New function.
	(cuda_memcpy_sanity_check): New function.
	(GOMP_OFFLOAD_host2dev): Remove and re-implement.
	(GOMP_OFFLOAD_dev2host): Remove and re-implement.
	(GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function.

From-SVN: r271128
2019-05-13 13:32:00 +00:00

190 lines
6.7 KiB
C

/* CUDA API description.
Copyright (C) 2017-2019 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>.
This header provides the minimum amount of typedefs, enums and function
declarations to be able to compile plugin-nvptx.c if cuda.h and
libcuda.so.1 are not available. */
#ifndef GCC_CUDA_H
#define GCC_CUDA_H
#include <stdlib.h>
/* API version number advertised by this stand-in header.  */
#define CUDA_VERSION 8000

/* Handle types; this header represents every one of them as a plain
   'void *'.  */
typedef void *CUcontext;
typedef void *CUevent;
typedef void *CUfunction;
typedef void *CUlinkState;
typedef void *CUmodule;
typedef void *CUstream;

/* Device identifier, represented as an int.  */
typedef int CUdevice;

/* Device address type: 'unsigned long long' when __LP64__ or _WIN64 is
   defined, plain 'unsigned int' on every other target.  */
#if !defined (__LP64__) && !defined (_WIN64)
typedef unsigned int CUdeviceptr;
#else
typedef unsigned long long CUdeviceptr;
#endif

/* Pointer to a function taking an int and returning a size_t; this is
   the callback type accepted by cuOccupancyMaxPotentialBlockSize.  */
typedef size_t (*CUoccupancyB2DSize) (int block_size);
/* Status codes returned by the cu* entry points declared in this
   header.  Only a subset of codes is listed, in keeping with the
   header's aim of providing the minimum needed by plugin-nvptx.c.
   Every enumerator has an explicit value, so adding an entry never
   renumbers the others.  */
typedef enum
{
  CUDA_SUCCESS                            = 0,
  CUDA_ERROR_INVALID_VALUE                = 1,
  CUDA_ERROR_OUT_OF_MEMORY                = 2,
  CUDA_ERROR_INVALID_CONTEXT              = 201,
  CUDA_ERROR_NOT_FOUND                    = 500,
  CUDA_ERROR_NOT_READY                    = 600,
  CUDA_ERROR_LAUNCH_FAILED                = 719,
  CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
  CUDA_ERROR_NOT_PERMITTED                = 800,
  CUDA_ERROR_NOT_SUPPORTED                = 801,
  CUDA_ERROR_UNKNOWN                      = 999
} CUresult;
/* Attribute selectors accepted by cuDeviceGetAttribute.  The list is a
   subset with gaps in the numbering; every enumerator is given its
   value explicitly.  */
typedef enum
{
  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK            = 1,
  CU_DEVICE_ATTRIBUTE_WARP_SIZE                        = 10,
  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK          = 12,
  CU_DEVICE_ATTRIBUTE_CLOCK_RATE                       = 13,
  CU_DEVICE_ATTRIBUTE_GPU_OVERLAP                      = 15,
  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT             = 16,
  CU_DEVICE_ATTRIBUTE_INTEGRATED                       = 18,
  CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY              = 19,
  CU_DEVICE_ATTRIBUTE_COMPUTE_MODE                     = 20,
  CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS               = 31,
  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR   = 39,
  CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT               = 40,
  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
} CUdevice_attribute;
/* Event flag constants, declared through an anonymous enum.  */
enum
{
  CU_EVENT_DEFAULT        = 0x0,
  CU_EVENT_DISABLE_TIMING = 0x2
};
/* Attribute selectors accepted by cuFuncGetAttribute.  */
typedef enum
{
  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
  CU_FUNC_ATTRIBUTE_NUM_REGS              = 4
} CUfunction_attribute;
/* Option keys whose arrays are passed to cuLinkCreate and
   cuLinkAddData.  Each key has an explicit value; the list is not
   contiguous.  */
typedef enum
{
  CU_JIT_WALL_TIME                   = 2,
  CU_JIT_INFO_LOG_BUFFER             = 3,
  CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES  = 4,
  CU_JIT_ERROR_LOG_BUFFER            = 5,
  CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
  CU_JIT_OPTIMIZATION_LEVEL          = 7,
  CU_JIT_LOG_VERBOSE                 = 12
} CUjit_option;
/* Input kind argument of cuLinkAddData; PTX is the only kind this
   header declares.  */
typedef enum
{
  CU_JIT_INPUT_PTX = 1
} CUjitInputType;
/* Context flag constant, declared through an anonymous enum.  */
enum
{
  CU_CTX_SCHED_AUTO = 0x0
};
/* Sentinel values of type 'void *' (0, 1 and 2 converted to pointers).
   NOTE(review): presumably these are the markers placed in the last
   'void **' argument of cuLaunchKernel -- confirm against the CUDA
   driver API documentation.  */
#define CU_LAUNCH_PARAM_END            ((void *) 0x00)
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 0x01)
#define CU_LAUNCH_PARAM_BUFFER_SIZE    ((void *) 0x02)
/* Stream flag constants, declared through an anonymous enum.  */
enum
{
  CU_STREAM_DEFAULT      = 0x0,
  CU_STREAM_NON_BLOCKING = 0x1
};
/* cuCtx* entry points.  Where a #define precedes a prototype, the plain
   name expands to the _v2 symbol, so the prototype declares that _v2
   function.  */
#define cuCtxCreate cuCtxCreate_v2
CUresult cuCtxCreate (CUcontext *pctx, unsigned int flags, CUdevice dev);

#define cuCtxDestroy cuCtxDestroy_v2
CUresult cuCtxDestroy (CUcontext ctx);

CUresult cuCtxGetCurrent (CUcontext *pctx);
CUresult cuCtxGetDevice (CUdevice *device);

#define cuCtxPopCurrent cuCtxPopCurrent_v2
CUresult cuCtxPopCurrent (CUcontext *pctx);

#define cuCtxPushCurrent cuCtxPushCurrent_v2
CUresult cuCtxPushCurrent (CUcontext ctx);

CUresult cuCtxSynchronize (void);
/* cuDevice* entry points; each reports its result through the leading
   pointer argument and returns a CUresult status.  */
CUresult cuDeviceGet (CUdevice *device, int ordinal);
CUresult cuDeviceGetAttribute (int *value, CUdevice_attribute attrib,
                               CUdevice dev);
CUresult cuDeviceGetCount (int *count);
CUresult cuEventCreate (CUevent *, unsigned);
#define cuEventDestroy cuEventDestroy_v2
CUresult cuEventDestroy (CUevent);
CUresult cuEventElapsedTime (float *, CUevent, CUevent);
CUresult cuEventQuery (CUevent);
CUresult cuEventRecord (CUevent, CUstream);
CUresult cuEventSynchronize (CUevent);
CUresult cuFuncGetAttribute (int *, CUfunction_attribute, CUfunction);
CUresult cuGetErrorString (CUresult, const char **);
CUresult cuInit (unsigned);
CUresult cuLaunchKernel (CUfunction, unsigned, unsigned, unsigned, unsigned,
unsigned, unsigned, unsigned, CUstream, void **, void **);
#define cuLinkAddData cuLinkAddData_v2
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, const char *,
unsigned, CUjit_option *, void **);
CUresult cuLinkComplete (CUlinkState, void **, size_t *);
#define cuLinkCreate cuLinkCreate_v2
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuLinkDestroy (CUlinkState);
/* cuMem* entry points.  A #define directly before a prototype maps the
   plain name onto the _v2 symbol, so that prototype declares the _v2
   function; cuMemcpy and cuMemFreeHost have no such redirect.  */
#define cuMemAlloc cuMemAlloc_v2
CUresult cuMemAlloc (CUdeviceptr *dptr, size_t byte_size);

#define cuMemAllocHost cuMemAllocHost_v2
CUresult cuMemAllocHost (void **pp, size_t byte_size);

CUresult cuMemcpy (CUdeviceptr dst, CUdeviceptr src, size_t byte_count);

#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
CUresult cuMemcpyDtoDAsync (CUdeviceptr dst_device, CUdeviceptr src_device,
                            size_t byte_count, CUstream stream);

#define cuMemcpyDtoH cuMemcpyDtoH_v2
CUresult cuMemcpyDtoH (void *dst_host, CUdeviceptr src_device,
                       size_t byte_count);

#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
CUresult cuMemcpyDtoHAsync (void *dst_host, CUdeviceptr src_device,
                            size_t byte_count, CUstream stream);

#define cuMemcpyHtoD cuMemcpyHtoD_v2
CUresult cuMemcpyHtoD (CUdeviceptr dst_device, const void *src_host,
                       size_t byte_count);

#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
CUresult cuMemcpyHtoDAsync (CUdeviceptr dst_device, const void *src_host,
                            size_t byte_count, CUstream stream);

#define cuMemFree cuMemFree_v2
CUresult cuMemFree (CUdeviceptr dptr);

CUresult cuMemFreeHost (void *p);

#define cuMemGetAddressRange cuMemGetAddressRange_v2
CUresult cuMemGetAddressRange (CUdeviceptr *pbase, size_t *psize,
                               CUdeviceptr dptr);

#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
CUresult cuMemHostGetDevicePointer (CUdeviceptr *pdptr, void *p,
                                    unsigned int flags);
/* cuModule* entry points.  Only cuModuleGetGlobal is redirected to a
   _v2 symbol.  */
CUresult cuModuleGetFunction (CUfunction *func, CUmodule module,
                              const char *name);

#define cuModuleGetGlobal cuModuleGetGlobal_v2
CUresult cuModuleGetGlobal (CUdeviceptr *dptr, size_t *bytes,
                            CUmodule module, const char *name);

CUresult cuModuleLoad (CUmodule *module, const char *fname);
CUresult cuModuleLoadData (CUmodule *module, const void *image);
CUresult cuModuleUnload (CUmodule module);
/* Occupancy query prototype: two 'int *' arguments, the function
   handle, a CUoccupancyB2DSize callback, a size_t and an int.
   NOTE(review): parameter names follow the CUDA driver API
   documentation -- confirm.  */
CUresult cuOccupancyMaxPotentialBlockSize (int *min_grid_size,
                                           int *block_size,
                                           CUfunction func,
                                           CUoccupancyB2DSize block_size_to_dyn_smem_size,
                                           size_t dyn_smem_size,
                                           int block_size_limit);
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream);
CUresult cuStreamQuery (CUstream);
CUresult cuStreamSynchronize (CUstream);
CUresult cuStreamWaitEvent (CUstream, CUevent, unsigned);
#endif /* GCC_CUDA_H */