From 2393d337e7c5ff258b1ad167025b9e4d5f518533 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek
Date: Tue, 17 Jan 2017 10:44:17 +0100
Subject: [PATCH] configfrag.ac: For --without-cuda-driver don't initialize
 CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.

	* plugin/configfrag.ac: For --without-cuda-driver don't initialize
	CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.  If both
	CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking small
	cuda program fails, define PLUGIN_NVPTX_DYNAMIC to 1 and use
	plugin/include/cuda as include dir and -ldl instead of -lcuda as
	library to link ptx plugin against.
	* plugin/plugin-nvptx.c: Include dlfcn.h if PLUGIN_NVPTX_DYNAMIC.
	(CUDA_CALLS): Define.
	(cuda_lib, cuda_lib_inited): New variables.
	(init_cuda_lib): New function.
	(CUDA_CALL_PREFIX): Define.
	(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use CUDA_CALL_PREFIX.
	(CUDA_CALL): Use FN instead of (FN).
	(CUDA_CALL_NOCHECK): Define.
	(cuda_error, fini_streams_for_device, select_stream_for_async,
	nvptx_attach_host_thread_to_device, nvptx_open_device, link_ptx,
	event_gc, nvptx_exec, nvptx_async_test, nvptx_async_test_all,
	nvptx_wait_all, nvptx_set_clocktick, GOMP_OFFLOAD_unload_image,
	nvptx_stacks_alloc, nvptx_stacks_free, GOMP_OFFLOAD_run): Use
	CUDA_CALL_NOCHECK.
	(nvptx_init): Call init_cuda_lib, if it fails, return false.  Use
	CUDA_CALL_NOCHECK.
	(nvptx_get_num_devices): Call init_cuda_lib, if it fails, return 0.
	Use CUDA_CALL_NOCHECK.
	* plugin/cuda/cuda.h: New file.
	* config.h.in: Regenerated.
	* configure: Regenerated.

From-SVN: r244522
---
 libgomp/ChangeLog             |  28 ++++
 libgomp/config.h.in           |   4 +
 libgomp/configure             |  30 ++++-
 libgomp/plugin/configfrag.ac  |  27 +++-
 libgomp/plugin/cuda/cuda.h    | 179 +++++++++++++++++++++++++
 libgomp/plugin/plugin-nvptx.c | 237 +++++++++++++++++++++++++---------
 6 files changed, 427 insertions(+), 78 deletions(-)
 create mode 100644 libgomp/plugin/cuda/cuda.h

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 7e40e245648..0e04c7416d1 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,5 +1,33 @@
 2017-01-17  Jakub Jelinek
 
+	* plugin/configfrag.ac: For --without-cuda-driver don't initialize
+	CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.  If both
+	CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking small
+	cuda program fails, define PLUGIN_NVPTX_DYNAMIC to 1 and use
+	plugin/include/cuda as include dir and -ldl instead of -lcuda as
+	library to link ptx plugin against.
+	* plugin/plugin-nvptx.c: Include dlfcn.h if PLUGIN_NVPTX_DYNAMIC.
+	(CUDA_CALLS): Define.
+	(cuda_lib, cuda_lib_inited): New variables.
+	(init_cuda_lib): New function.
+	(CUDA_CALL_PREFIX): Define.
+	(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use CUDA_CALL_PREFIX.
+	(CUDA_CALL): Use FN instead of (FN).
+	(CUDA_CALL_NOCHECK): Define.
+	(cuda_error, fini_streams_for_device, select_stream_for_async,
+	nvptx_attach_host_thread_to_device, nvptx_open_device, link_ptx,
+	event_gc, nvptx_exec, nvptx_async_test, nvptx_async_test_all,
+	nvptx_wait_all, nvptx_set_clocktick, GOMP_OFFLOAD_unload_image,
+	nvptx_stacks_alloc, nvptx_stacks_free, GOMP_OFFLOAD_run): Use
+	CUDA_CALL_NOCHECK.
+	(nvptx_init): Call init_cuda_lib, if it fails, return false.  Use
+	CUDA_CALL_NOCHECK.
+	(nvptx_get_num_devices): Call init_cuda_lib, if it fails, return 0.
+	Use CUDA_CALL_NOCHECK.
+	* plugin/cuda/cuda.h: New file.
+	* config.h.in: Regenerated.
+	* configure: Regenerated.
+
 	PR other/79046
 	* configure.ac: Add GCC_BASE_VER.
 	* Makefile.am (gcc_version): Use @get_gcc_base_ver@ instead of cat to
diff --git a/libgomp/config.h.in b/libgomp/config.h.in
index 583b9b48250..e7bc4d97374 100644
--- a/libgomp/config.h.in
+++ b/libgomp/config.h.in
@@ -155,6 +155,10 @@
 /* Define to 1 if the NVIDIA plugin is built, 0 if not. */
 #undef PLUGIN_NVPTX
 
+/* Define to 1 if the NVIDIA plugin should dlopen libcuda.so.1, 0 if it should
+   be linked against it. */
+#undef PLUGIN_NVPTX_DYNAMIC
+
 /* Define if all infrastructure, needed for plugins, is supported. */
 #undef PLUGIN_SUPPORT
 
diff --git a/libgomp/configure b/libgomp/configure
index 6a4b3b2a6e0..b7e9f40b852 100755
--- a/libgomp/configure
+++ b/libgomp/configure
@@ -15299,10 +15299,12 @@ if test "${with_cuda_driver_lib+set}" = set; then :
   withval=$with_cuda_driver_lib;
 fi
 
-if test "x$with_cuda_driver" != x; then
-  CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
-  CUDA_DRIVER_LIB=$with_cuda_driver/lib
-fi
+case "x$with_cuda_driver" in
+  x | xno) ;;
+  *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
+     CUDA_DRIVER_LIB=$with_cuda_driver/lib
+     ;;
+esac
 if test "x$with_cuda_driver_include" != x; then
   CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
 fi
@@ -15320,6 +15322,7 @@ PLUGIN_NVPTX=0
 PLUGIN_NVPTX_CPPFLAGS=
 PLUGIN_NVPTX_LDFLAGS=
 PLUGIN_NVPTX_LIBS=
+PLUGIN_NVPTX_DYNAMIC=0
 
 
 
@@ -15426,9 +15429,17 @@ rm -f core conftest.err conftest.$ac_objext \
         LIBS=$PLUGIN_NVPTX_save_LIBS
         case $PLUGIN_NVPTX in
           nvptx*)
-            PLUGIN_NVPTX=0
-            as_fn_error "CUDA driver package required for nvptx support" "$LINENO" 5
-            ;;
+            if test "x$CUDA_DRIVER_INCLUDE" = x \
+               && test "x$CUDA_DRIVER_LIB" = x; then
+              PLUGIN_NVPTX=1
+              PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
+              PLUGIN_NVPTX_LIBS='-ldl'
+              PLUGIN_NVPTX_DYNAMIC=1
+            else
+              PLUGIN_NVPTX=0
+              as_fn_error "CUDA driver package required for nvptx support" "$LINENO" 5
+            fi
+            ;;
         esac
         ;;
       hsa*)
@@ -15513,6 +15524,11 @@ cat >>confdefs.h <<_ACEOF
 #define PLUGIN_NVPTX $PLUGIN_NVPTX
 _ACEOF
 
+
+cat >>confdefs.h <<_ACEOF
+#define PLUGIN_NVPTX_DYNAMIC $PLUGIN_NVPTX_DYNAMIC
+_ACEOF
+
 if test $PLUGIN_HSA = 1; then
   PLUGIN_HSA_TRUE=
   PLUGIN_HSA_FALSE='#'
diff --git a/libgomp/plugin/configfrag.ac b/libgomp/plugin/configfrag.ac
index 579e2c3d000..c4a92795926 100644
--- a/libgomp/plugin/configfrag.ac
+++ b/libgomp/plugin/configfrag.ac
@@ -58,10 +58,12 @@ AC_ARG_WITH(cuda-driver-include,
 AC_ARG_WITH(cuda-driver-lib,
         [AS_HELP_STRING([--with-cuda-driver-lib=PATH],
                 [specify directory for the installed CUDA driver library])])
-if test "x$with_cuda_driver" != x; then
-  CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
-  CUDA_DRIVER_LIB=$with_cuda_driver/lib
-fi
+case "x$with_cuda_driver" in
+  x | xno) ;;
+  *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
+     CUDA_DRIVER_LIB=$with_cuda_driver/lib
+     ;;
+esac
 if test "x$with_cuda_driver_include" != x; then
   CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
 fi
@@ -79,6 +81,7 @@ PLUGIN_NVPTX=0
 PLUGIN_NVPTX_CPPFLAGS=
 PLUGIN_NVPTX_LDFLAGS=
 PLUGIN_NVPTX_LIBS=
+PLUGIN_NVPTX_DYNAMIC=0
 AC_SUBST(PLUGIN_NVPTX)
 AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
 AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
@@ -167,9 +170,17 @@ if test x"$enable_offload_targets" != x; then
         LIBS=$PLUGIN_NVPTX_save_LIBS
         case $PLUGIN_NVPTX in
           nvptx*)
-            PLUGIN_NVPTX=0
-            AC_MSG_ERROR([CUDA driver package required for nvptx support])
-            ;;
+            if test "x$CUDA_DRIVER_INCLUDE" = x \
+               && test "x$CUDA_DRIVER_LIB" = x; then
+              PLUGIN_NVPTX=1
+              PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
+              PLUGIN_NVPTX_LIBS='-ldl'
+              PLUGIN_NVPTX_DYNAMIC=1
+            else
+              PLUGIN_NVPTX=0
+              AC_MSG_ERROR([CUDA driver package required for nvptx support])
+            fi
+            ;;
         esac
         ;;
       hsa*)
@@ -241,6 +252,8 @@ AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets",
 AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
 AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
   [Define to 1 if the NVIDIA plugin is built, 0 if not.])
+AC_DEFINE_UNQUOTED([PLUGIN_NVPTX_DYNAMIC], [$PLUGIN_NVPTX_DYNAMIC],
+  [Define to 1 if the NVIDIA plugin should dlopen libcuda.so.1, 0 if it should be linked against it.])
 AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
 AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
   [Define to 1 if the HSA plugin is built, 0 if not.])
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
new file mode 100644
index 00000000000..eb92b18f745
--- /dev/null
+++ b/libgomp/plugin/cuda/cuda.h
@@ -0,0 +1,179 @@
+/* CUDA API description.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.
+
+This header provides the minimum amount of typedefs, enums and function
+declarations to be able to compile plugin-nvptx.c if cuda.h and
+libcuda.so.1 are not available.  */
+
+#ifndef GCC_CUDA_H
+#define GCC_CUDA_H
+
+#include
+
+#define CUDA_VERSION 8000
+
+typedef void *CUcontext;
+typedef int CUdevice;
+#ifdef __LP64__
+typedef unsigned long long CUdeviceptr;
+#else
+typedef unsigned CUdeviceptr;
+#endif
+typedef void *CUevent;
+typedef void *CUfunction;
+typedef void *CUlinkState;
+typedef void *CUmodule;
+typedef void *CUstream;
+
+typedef enum {
+  CUDA_SUCCESS = 0,
+  CUDA_ERROR_INVALID_VALUE = 1,
+  CUDA_ERROR_OUT_OF_MEMORY = 2,
+  CUDA_ERROR_INVALID_CONTEXT = 201,
+  CUDA_ERROR_NOT_FOUND = 500,
+  CUDA_ERROR_NOT_READY = 600,
+  CUDA_ERROR_LAUNCH_FAILED = 719
+} CUresult;
+
+typedef enum {
+  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
+  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
+  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
+  CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
+  CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
+  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
+  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
+  CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
+  CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
+  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
+  CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
+  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
+} CUdevice_attribute;
+
+enum {
+  CU_EVENT_DEFAULT = 0,
+  CU_EVENT_DISABLE_TIMING = 2
+};
+
+typedef enum {
+  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+  CU_FUNC_ATTRIBUTE_NUM_REGS = 4
+} CUfunction_attribute;
+
+typedef enum {
+  CU_JIT_WALL_TIME = 2,
+  CU_JIT_INFO_LOG_BUFFER = 3,
+  CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
+  CU_JIT_ERROR_LOG_BUFFER = 5,
+  CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
+  CU_JIT_LOG_VERBOSE = 12
+} CUjit_option;
+
+typedef enum {
+  CU_JIT_INPUT_PTX = 1
+} CUjitInputType;
+
+enum {
+  CU_CTX_SCHED_AUTO = 0
+};
+
+#define CU_LAUNCH_PARAM_END ((void *) 0)
+#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 1)
+#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *) 2)
+
+enum {
+  CU_STREAM_DEFAULT = 0,
+  CU_STREAM_NON_BLOCKING = 1
+};
+
+#define cuCtxCreate cuCtxCreate_v2
+CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
+#define cuCtxDestroy cuCtxDestroy_v2
+CUresult cuCtxDestroy (CUcontext);
+CUresult cuCtxGetCurrent (CUcontext *);
+CUresult cuCtxGetDevice (CUdevice *);
+#define cuCtxPopCurrent cuCtxPopCurrent_v2
+CUresult cuCtxPopCurrent (CUcontext *);
+#define cuCtxPushCurrent cuCtxPushCurrent_v2
+CUresult cuCtxPushCurrent (CUcontext);
+CUresult cuCtxSynchronize (void);
+CUresult cuDeviceGet (CUdevice *, int);
+CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
+CUresult cuDeviceGetCount (int *);
+CUresult cuEventCreate (CUevent *, unsigned);
+#define cuEventDestroy cuEventDestroy_v2
+CUresult cuEventDestroy (CUevent);
+CUresult cuEventElapsedTime (float *, CUevent, CUevent);
+CUresult cuEventQuery (CUevent);
+CUresult cuEventRecord (CUevent, CUstream);
+CUresult cuEventSynchronize (CUevent);
+CUresult cuFuncGetAttribute (int *, CUfunction_attribute, CUfunction);
+CUresult cuGetErrorString (CUresult, const char **);
+CUresult cuInit (unsigned);
+CUresult cuLaunchKernel (CUfunction, unsigned, unsigned, unsigned, unsigned,
+			 unsigned, unsigned, unsigned, CUstream, void **, void **);
+#define cuLinkAddData cuLinkAddData_v2
+CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, const char *,
+			unsigned, CUjit_option *, void **);
+CUresult cuLinkComplete (CUlinkState, void **, size_t *);
+#define cuLinkCreate cuLinkCreate_v2
+CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuLinkDestroy (CUlinkState);
+#define cuMemAlloc cuMemAlloc_v2
+CUresult cuMemAlloc (CUdeviceptr *, size_t);
+#define cuMemAllocHost cuMemAllocHost_v2
+CUresult cuMemAllocHost (void **, size_t);
+CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
+#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
+CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
+#define cuMemcpyDtoH cuMemcpyDtoH_v2
+CUresult cuMemcpyDtoH (void *, CUdeviceptr, size_t);
+#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
+CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
+#define cuMemcpyHtoD cuMemcpyHtoD_v2
+CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
+#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
+CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
+#define cuMemFree cuMemFree_v2
+CUresult cuMemFree (CUdeviceptr);
+CUresult cuMemFreeHost (void *);
+#define cuMemGetAddressRange cuMemGetAddressRange_v2
+CUresult cuMemGetAddressRange (CUdeviceptr *, size_t *, CUdeviceptr);
+#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
+CUresult cuMemHostGetDevicePointer (CUdeviceptr *, void *, unsigned);
+CUresult cuModuleGetFunction (CUfunction *, CUmodule, const char *);
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
+CUresult cuModuleLoad (CUmodule *, const char *);
+CUresult cuModuleLoadData (CUmodule *, const void *);
+CUresult cuModuleUnload (CUmodule);
+CUresult cuStreamCreate (CUstream *, unsigned);
+#define cuStreamDestroy cuStreamDestroy_v2
+CUresult cuStreamDestroy (CUstream);
+CUresult cuStreamQuery (CUstream);
+CUresult cuStreamSynchronize (CUstream);
+CUresult cuStreamWaitEvent (CUstream, CUevent, unsigned);
+
+#endif /* GCC_CUDA_H */
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a2e1aeb8a3e..4144218ae8f 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -48,6 +48,129 @@
 #include
 #include
 
+#if PLUGIN_NVPTX_DYNAMIC
+# include <dlfcn.h>
+
+# define CUDA_CALLS \
+CUDA_ONE_CALL (cuCtxCreate) \
+CUDA_ONE_CALL (cuCtxDestroy) \
+CUDA_ONE_CALL (cuCtxGetCurrent) \
+CUDA_ONE_CALL (cuCtxGetDevice) \
+CUDA_ONE_CALL (cuCtxPopCurrent) \
+CUDA_ONE_CALL (cuCtxPushCurrent) \
+CUDA_ONE_CALL (cuCtxSynchronize) \
+CUDA_ONE_CALL (cuDeviceGet) \
+CUDA_ONE_CALL (cuDeviceGetAttribute) \
+CUDA_ONE_CALL (cuDeviceGetCount) \
+CUDA_ONE_CALL (cuEventCreate) \
+CUDA_ONE_CALL (cuEventDestroy) \
+CUDA_ONE_CALL (cuEventElapsedTime) \
+CUDA_ONE_CALL (cuEventQuery) \
+CUDA_ONE_CALL (cuEventRecord) \
+CUDA_ONE_CALL (cuEventSynchronize) \
+CUDA_ONE_CALL (cuFuncGetAttribute) \
+CUDA_ONE_CALL (cuGetErrorString) \
+CUDA_ONE_CALL (cuInit) \
+CUDA_ONE_CALL (cuLaunchKernel) \
+CUDA_ONE_CALL (cuLinkAddData) \
+CUDA_ONE_CALL (cuLinkComplete) \
+CUDA_ONE_CALL (cuLinkCreate) \
+CUDA_ONE_CALL (cuLinkDestroy) \
+CUDA_ONE_CALL (cuMemAlloc) \
+CUDA_ONE_CALL (cuMemAllocHost) \
+CUDA_ONE_CALL (cuMemcpy) \
+CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
+CUDA_ONE_CALL (cuMemcpyDtoH) \
+CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
+CUDA_ONE_CALL (cuMemcpyHtoD) \
+CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
+CUDA_ONE_CALL (cuMemFree) \
+CUDA_ONE_CALL (cuMemFreeHost) \
+CUDA_ONE_CALL (cuMemGetAddressRange) \
+CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
+CUDA_ONE_CALL (cuModuleGetFunction) \
+CUDA_ONE_CALL (cuModuleGetGlobal) \
+CUDA_ONE_CALL (cuModuleLoad) \
+CUDA_ONE_CALL (cuModuleLoadData) \
+CUDA_ONE_CALL (cuModuleUnload) \
+CUDA_ONE_CALL (cuStreamCreate) \
+CUDA_ONE_CALL (cuStreamDestroy) \
+CUDA_ONE_CALL (cuStreamQuery) \
+CUDA_ONE_CALL (cuStreamSynchronize) \
+CUDA_ONE_CALL (cuStreamWaitEvent)
+# define CUDA_ONE_CALL(call) \
+  __typeof (call) *call;
+struct cuda_lib_s {
+  CUDA_CALLS
+} cuda_lib;
+
+/* -1 if init_cuda_lib has not been called yet, false
+   if it has been and failed, true if it has been and succeeded.  */
+static char cuda_lib_inited = -1;
+
+/* Dynamically load the CUDA runtime library and initialize function
+   pointers, return false if unsuccessful, true if successful.  */
+static bool
+init_cuda_lib (void)
+{
+  if (cuda_lib_inited != -1)
+    return cuda_lib_inited;
+  const char *cuda_runtime_lib = "libcuda.so.1";
+  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
+  cuda_lib_inited = false;
+  if (h == NULL)
+    return false;
+# undef CUDA_ONE_CALL
+# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
+# define CUDA_ONE_CALL_1(call) \
+  cuda_lib.call = dlsym (h, #call); \
+  if (cuda_lib.call == NULL) \
+    return false;
+  CUDA_CALLS
+  cuda_lib_inited = true;
+  return true;
+}
+# undef CUDA_ONE_CALL
+# undef CUDA_ONE_CALL_1
+# define CUDA_CALL_PREFIX cuda_lib.
+#else
+# define CUDA_CALL_PREFIX
+# define init_cuda_lib() true
+#endif
+
+/* Convenience macros for the frequently used CUDA library call and
+   error handling sequence as well as CUDA library calls that
+   do the error checking themselves or don't do it at all.  */
+
+#define CUDA_CALL_ERET(ERET, FN, ...) \
+  do { \
+    unsigned __r \
+      = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
+    if (__r != CUDA_SUCCESS) \
+      { \
+	GOMP_PLUGIN_error (#FN " error: %s", \
+			   cuda_error (__r)); \
+	return ERET; \
+      } \
+  } while (0)
+
+#define CUDA_CALL(FN, ...) \
+  CUDA_CALL_ERET (false, FN, __VA_ARGS__)
+
+#define CUDA_CALL_ASSERT(FN, ...) \
+  do { \
+    unsigned __r \
+      = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
+    if (__r != CUDA_SUCCESS) \
+      { \
+	GOMP_PLUGIN_fatal (#FN " error: %s", \
+			   cuda_error (__r)); \
+      } \
+  } while (0)
+
+#define CUDA_CALL_NOCHECK(FN, ...) \
+  CUDA_CALL_PREFIX FN (__VA_ARGS__)
+
 static const char *
 cuda_error (CUresult r)
 {
@@ -58,41 +181,13 @@ cuda_error (CUresult r)
 #endif
 
   const char *desc;
-  r = cuGetErrorString (r, &desc);
+  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
   if (r != CUDA_SUCCESS)
     desc = "unknown cuda error";
 
   return desc;
 }
 
-/* Convenience macros for the frequently used CUDA library call and
-   error handling sequence.  This does not capture all the cases we
-   use in this file, but is common enough.  */
-
-#define CUDA_CALL_ERET(ERET, FN, ...) \
-  do { \
-    unsigned __r = FN (__VA_ARGS__); \
-    if (__r != CUDA_SUCCESS) \
-      { \
-	GOMP_PLUGIN_error (#FN " error: %s", \
-			   cuda_error (__r)); \
-	return ERET; \
-      } \
-  } while (0)
-
-#define CUDA_CALL(FN, ...) \
-  CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
-
-#define CUDA_CALL_ASSERT(FN, ...) \
-  do { \
-    unsigned __r = FN (__VA_ARGS__); \
-    if (__r != CUDA_SUCCESS) \
-      { \
-	GOMP_PLUGIN_fatal (#FN " error: %s", \
-			   cuda_error (__r)); \
-      } \
-  } while (0)
-
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -401,7 +496,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
 
       ret &= map_fini (s);
 
-      CUresult r = cuStreamDestroy (s->stream);
+      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
       if (r != CUDA_SUCCESS)
 	{
 	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
@@ -484,7 +579,8 @@ select_stream_for_async (int async, pthread_t thread, bool create,
 	s->stream = existing;
       else
 	{
-	  r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
+	  r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
+				 CU_STREAM_DEFAULT);
 	  if (r != CUDA_SUCCESS)
 	    {
 	      pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -554,10 +650,14 @@ nvptx_init (void)
   if (instantiated_devices != 0)
     return true;
 
-  CUDA_CALL (cuInit, 0);
   ptx_events = NULL;
   pthread_mutex_init (&ptx_event_lock, NULL);
 
+  if (!init_cuda_lib ())
+    return false;
+
+  CUDA_CALL (cuInit, 0);
+
   CUDA_CALL (cuDeviceGetCount, &ndevs);
   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 					    * ndevs);
@@ -575,7 +675,7 @@ nvptx_attach_host_thread_to_device (int n)
   struct ptx_device *ptx_dev;
   CUcontext thd_ctx;
 
-  r = cuCtxGetDevice (&dev);
+  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     {
       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -623,7 +723,7 @@ nvptx_open_device (int n)
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
 
-  r = cuCtxGetDevice (&ctx_dev);
+  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     {
       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -669,7 +769,7 @@ nvptx_open_device (int n)
 		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
   ptx_dev->clock_khz = pi;
 
-  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, 
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
   ptx_dev->num_sms = pi;
 
@@ -679,7 +779,7 @@ nvptx_open_device (int n)
 
   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
      in CUDA 6.0 and newer.  */
-  r = cuDeviceGetAttribute (&pi, 82, dev);
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
   /* Fallback: use limit of registers per block, which is usually equal.  */
   if (r == CUDA_ERROR_INVALID_VALUE)
     pi = ptx_dev->regs_per_block;
@@ -698,8 +798,8 @@ nvptx_open_device (int n)
       return NULL;
     }
 
-  r = cuDeviceGetAttribute (&async_engines,
-			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
+			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
     async_engines = 1;
 
@@ -746,7 +846,9 @@ nvptx_get_num_devices (void)
      further initialization).  */
   if (instantiated_devices == 0)
     {
-      CUresult r = cuInit (0);
+      if (!init_cuda_lib ())
+	return 0;
+      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
       /* This is not an error: e.g. we may have CUDA libraries installed but
          no devices available.  */
       if (r != CUDA_SUCCESS)
@@ -797,8 +899,9 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
       /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
-      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
-			 ptx_objs->size, 0, 0, 0, 0);
+      r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
+			     (char *) ptx_objs->code, ptx_objs->size,
+			     0, 0, 0, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
@@ -809,7 +912,7 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
     }
 
   GOMP_PLUGIN_debug (0, "Linking\n");
-  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
+  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 
   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
@@ -844,7 +947,7 @@ event_gc (bool memmap_lockable)
       if (e->ord != nvthd->ptx_dev->ord)
 	continue;
 
-      r = cuEventQuery (*e->evt);
+      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
       if (r == CUDA_SUCCESS)
 	{
 	  bool append_async = false;
@@ -877,7 +980,7 @@ event_gc (bool memmap_lockable)
 	      break;
 	    }
 
-	  cuEventDestroy (*te);
+	  CUDA_CALL_NOCHECK (cuEventDestroy, *te);
 	  free ((void *)te);
 
 	  /* Unlink 'e' from ptx_events list.  */
@@ -1015,10 +1118,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
 	  cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
 
-	  if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
-	      && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
-	      && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
-	      && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
+	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
+				 dev) == CUDA_SUCCESS
+	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
+				    dev) == CUDA_SUCCESS
+	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
+				    dev) == CUDA_SUCCESS
+	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
+				    dev) == CUDA_SUCCESS)
 	    {
 	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
 				 " dev_size=%d, cpu_size=%d\n",
@@ -1090,7 +1197,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
     {
-      r = cuStreamSynchronize (dev_str->stream);
+      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
       if (r == CUDA_ERROR_LAUNCH_FAILED)
 	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
 			   maybe_abort_msg);
@@ -1103,7 +1210,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
       if (r == CUDA_ERROR_LAUNCH_FAILED)
 	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
 			   maybe_abort_msg);
@@ -1117,7 +1224,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
     }
 #else
-  r = cuCtxSynchronize ();
+  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
 		       maybe_abort_msg);
@@ -1294,7 +1401,7 @@ nvptx_async_test (int async)
   if (!s)
     GOMP_PLUGIN_fatal ("unknown async %d", async);
 
-  r = cuStreamQuery (s->stream);
+  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
   if (r == CUDA_SUCCESS)
     {
       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
@@ -1325,7 +1432,8 @@ nvptx_async_test_all (void)
   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
     {
       if ((s->multithreaded || pthread_equal (s->host_thread, self))
-	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
+	  && CUDA_CALL_NOCHECK (cuStreamQuery,
+				s->stream) == CUDA_ERROR_NOT_READY)
 	{
 	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
 	  return 0;
@@ -1400,7 +1508,7 @@ nvptx_wait_all (void)
     {
       if (s->multithreaded || pthread_equal (s->host_thread, self))
 	{
-	  r = cuStreamQuery (s->stream);
+	  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
 	  if (r == CUDA_SUCCESS)
 	    continue;
 	  else if (r != CUDA_ERROR_NOT_READY)
@@ -1632,13 +1740,15 @@ static void
 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
 {
   CUdeviceptr dptr;
-  CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
+  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
   if (r == CUDA_ERROR_NOT_FOUND)
     return;
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
-  r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
+  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
 }
@@ -1761,7 +1871,7 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
     if (image->target_data == target_data)
       {
 	*prev_p = image->next;
-	if (cuModuleUnload (image->module) != CUDA_SUCCESS)
+	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
 	  ret = false;
 	free (image->fns);
 	free (image);
@@ -1974,7 +2084,7 @@ static void *
 nvptx_stacks_alloc (size_t size, int num)
 {
   CUdeviceptr stacks;
-  CUresult r = cuMemAlloc (&stacks, size * num);
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
   return (void *) stacks;
@@ -1985,7 +2095,7 @@ nvptx_stacks_alloc (size_t size, int num)
 static void
 nvptx_stacks_free (void *p, int num)
 {
-  CUresult r = cuMemFree ((CUdeviceptr) p);
+  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
 }
@@ -2028,14 +2138,13 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 		    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
 		    CU_LAUNCH_PARAM_END
 		  };
-  r = cuLaunchKernel (function,
-		      teams, 1, 1,
-		      32, threads, 1,
-		      0, ptx_dev->null_stream->stream, NULL, config);
+  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
+			 32, threads, 1, 0, ptx_dev->null_stream->stream,
+			 NULL, config);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
-  r = cuCtxSynchronize ();
+  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
 		       maybe_abort_msg);
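
Note on the dynamic-loading scheme introduced by this patch: when neither a CUDA
driver include nor lib directory is given and linking a small cuda program
fails, the plugin is built with PLUGIN_NVPTX_DYNAMIC set to 1, compiled against
the shim plugin/cuda/cuda.h, and resolves the driver entry points at run time
from libcuda.so.1.  What follows is a minimal, standalone sketch of that
dlopen + X-macro pattern, not code from the patch: it declares the two driver
entry points it uses itself instead of including the shim header, resolves them
into a cuda_lib function-pointer table, and only prints the device count.  The
file name demo.c is hypothetical; build with "gcc demo.c -ldl".

#include <dlfcn.h>
#include <stdbool.h>
#include <stdio.h>

typedef int CUresult;		/* 0 is CUDA_SUCCESS in the real API.  */

/* Prototypes normally supplied by cuda.h or the plugin/cuda/cuda.h shim.  */
CUresult cuInit (unsigned);
CUresult cuDeviceGetCount (int *);

/* X-macro list of the entry points to resolve at run time.  */
#define CUDA_CALLS \
  CUDA_ONE_CALL (cuInit) \
  CUDA_ONE_CALL (cuDeviceGetCount)

/* First expansion: one function-pointer field per entry point.  */
#define CUDA_ONE_CALL(call) __typeof (call) *call;
static struct cuda_lib_s {
  CUDA_CALLS
} cuda_lib;
#undef CUDA_ONE_CALL

/* Second expansion: look each symbol up with dlsym, failing if any one
   of them is missing, in the spirit of init_cuda_lib in plugin-nvptx.c.  */
static bool
init_cuda_lib (void)
{
  void *h = dlopen ("libcuda.so.1", RTLD_LAZY);
  if (h == NULL)
    return false;
#define CUDA_ONE_CALL(call) \
  cuda_lib.call = (__typeof (cuda_lib.call)) dlsym (h, #call); \
  if (cuda_lib.call == NULL) \
    return false;
  CUDA_CALLS
#undef CUDA_ONE_CALL
  return true;
}

int
main (void)
{
  int n = 0;
  if (!init_cuda_lib ()
      || cuda_lib.cuInit (0) != 0
      || cuda_lib.cuDeviceGetCount (&n) != 0)
    n = 0;
  printf ("%d CUDA device(s) usable\n", n);
  return 0;
}

The real plugin additionally caches the outcome in cuda_lib_inited and hides
the indirection behind CUDA_CALL_PREFIX, so the same CUDA_CALL* macros work
whether the plugin is linked against -lcuda or dlopens libcuda.so.1.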