configfrag.ac: For --without-cuda-driver don't initialize CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.

* plugin/configfrag.ac: For --without-cuda-driver don't initialize
	CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.  If both
	CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking small
	cuda program fails, define PLUGIN_NVPTX_DYNAMIC to 1 and use
	plugin/cuda as include dir and -ldl instead of -lcuda as
	library to link ptx plugin against.
	* plugin/plugin-nvptx.c: Include dlfcn.h if PLUGIN_NVPTX_DYNAMIC.
	(CUDA_CALLS): Define.
	(cuda_lib, cuda_lib_inited): New variables.
	(init_cuda_lib): New function.
	(CUDA_CALL_PREFIX): Define.
	(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use CUDA_CALL_PREFIX.
	(CUDA_CALL): Use FN instead of (FN).
	(CUDA_CALL_NOCHECK): Define.
	(cuda_error, fini_streams_for_device, select_stream_for_async,
	nvptx_attach_host_thread_to_device, nvptx_open_device, link_ptx,
	event_gc, nvptx_exec, nvptx_async_test, nvptx_async_test_all,
	nvptx_wait_all, nvptx_set_clocktick, GOMP_OFFLOAD_unload_image,
	nvptx_stacks_alloc, nvptx_stacks_free, GOMP_OFFLOAD_run): Use
	CUDA_CALL_NOCHECK.
	(nvptx_init): Call init_cuda_lib, if it fails, return false.  Use
	CUDA_CALL_NOCHECK.
	(nvptx_get_num_devices): Call init_cuda_lib, if it fails, return 0.
	Use CUDA_CALL_NOCHECK.
	* plugin/cuda/cuda.h: New file.
	* config.h.in: Regenerated.
	* configure: Regenerated.

From-SVN: r244522
This commit is contained in:
Jakub Jelinek 2017-01-17 10:44:17 +01:00 committed by Jakub Jelinek
parent 3c36aa6ba2
commit 2393d337e7
6 changed files with 427 additions and 78 deletions

View File

@ -1,5 +1,33 @@
2017-01-17 Jakub Jelinek <jakub@redhat.com>
* plugin/configfrag.ac: For --without-cuda-driver don't initialize
CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB. If both
CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking small
cuda program fails, define PLUGIN_NVPTX_DYNAMIC to 1 and use
plugin/cuda as include dir and -ldl instead of -lcuda as
library to link ptx plugin against.
* plugin/plugin-nvptx.c: Include dlfcn.h if PLUGIN_NVPTX_DYNAMIC.
(CUDA_CALLS): Define.
(cuda_lib, cuda_lib_inited): New variables.
(init_cuda_lib): New function.
(CUDA_CALL_PREFIX): Define.
(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use CUDA_CALL_PREFIX.
(CUDA_CALL): Use FN instead of (FN).
(CUDA_CALL_NOCHECK): Define.
(cuda_error, fini_streams_for_device, select_stream_for_async,
nvptx_attach_host_thread_to_device, nvptx_open_device, link_ptx,
event_gc, nvptx_exec, nvptx_async_test, nvptx_async_test_all,
nvptx_wait_all, nvptx_set_clocktick, GOMP_OFFLOAD_unload_image,
nvptx_stacks_alloc, nvptx_stacks_free, GOMP_OFFLOAD_run): Use
CUDA_CALL_NOCHECK.
(nvptx_init): Call init_cuda_lib, if it fails, return false. Use
CUDA_CALL_NOCHECK.
(nvptx_get_num_devices): Call init_cuda_lib, if it fails, return 0.
Use CUDA_CALL_NOCHECK.
* plugin/cuda/cuda.h: New file.
* config.h.in: Regenerated.
* configure: Regenerated.
PR other/79046
* configure.ac: Add GCC_BASE_VER.
* Makefile.am (gcc_version): Use @get_gcc_base_ver@ instead of cat to

View File

@ -155,6 +155,10 @@
/* Define to 1 if the NVIDIA plugin is built, 0 if not. */
#undef PLUGIN_NVPTX
/* Define to 1 if the NVIDIA plugin should dlopen libcuda.so.1, 0 if it should
be linked against it. */
#undef PLUGIN_NVPTX_DYNAMIC
/* Define if all infrastructure, needed for plugins, is supported. */
#undef PLUGIN_SUPPORT

30
libgomp/configure vendored
View File

@ -15299,10 +15299,12 @@ if test "${with_cuda_driver_lib+set}" = set; then :
withval=$with_cuda_driver_lib;
fi
if test "x$with_cuda_driver" != x; then
CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
CUDA_DRIVER_LIB=$with_cuda_driver/lib
fi
case "x$with_cuda_driver" in
x | xno) ;;
*) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
CUDA_DRIVER_LIB=$with_cuda_driver/lib
;;
esac
if test "x$with_cuda_driver_include" != x; then
CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
fi
@ -15320,6 +15322,7 @@ PLUGIN_NVPTX=0
PLUGIN_NVPTX_CPPFLAGS=
PLUGIN_NVPTX_LDFLAGS=
PLUGIN_NVPTX_LIBS=
PLUGIN_NVPTX_DYNAMIC=0
@ -15426,9 +15429,17 @@ rm -f core conftest.err conftest.$ac_objext \
LIBS=$PLUGIN_NVPTX_save_LIBS
case $PLUGIN_NVPTX in
nvptx*)
PLUGIN_NVPTX=0
as_fn_error "CUDA driver package required for nvptx support" "$LINENO" 5
;;
if test "x$CUDA_DRIVER_INCLUDE" = x \
&& test "x$CUDA_DRIVER_LIB" = x; then
PLUGIN_NVPTX=1
PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
PLUGIN_NVPTX_LIBS='-ldl'
PLUGIN_NVPTX_DYNAMIC=1
else
PLUGIN_NVPTX=0
as_fn_error "CUDA driver package required for nvptx support" "$LINENO" 5
fi
;;
esac
;;
hsa*)
@ -15513,6 +15524,11 @@ cat >>confdefs.h <<_ACEOF
#define PLUGIN_NVPTX $PLUGIN_NVPTX
_ACEOF
cat >>confdefs.h <<_ACEOF
#define PLUGIN_NVPTX_DYNAMIC $PLUGIN_NVPTX_DYNAMIC
_ACEOF
if test $PLUGIN_HSA = 1; then
PLUGIN_HSA_TRUE=
PLUGIN_HSA_FALSE='#'

View File

@ -58,10 +58,12 @@ AC_ARG_WITH(cuda-driver-include,
AC_ARG_WITH(cuda-driver-lib,
[AS_HELP_STRING([--with-cuda-driver-lib=PATH],
[specify directory for the installed CUDA driver library])])
if test "x$with_cuda_driver" != x; then
CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
CUDA_DRIVER_LIB=$with_cuda_driver/lib
fi
case "x$with_cuda_driver" in
x | xno) ;;
*) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
CUDA_DRIVER_LIB=$with_cuda_driver/lib
;;
esac
if test "x$with_cuda_driver_include" != x; then
CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
fi
@ -79,6 +81,7 @@ PLUGIN_NVPTX=0
PLUGIN_NVPTX_CPPFLAGS=
PLUGIN_NVPTX_LDFLAGS=
PLUGIN_NVPTX_LIBS=
PLUGIN_NVPTX_DYNAMIC=0
AC_SUBST(PLUGIN_NVPTX)
AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
@ -167,9 +170,17 @@ if test x"$enable_offload_targets" != x; then
LIBS=$PLUGIN_NVPTX_save_LIBS
case $PLUGIN_NVPTX in
nvptx*)
PLUGIN_NVPTX=0
AC_MSG_ERROR([CUDA driver package required for nvptx support])
;;
if test "x$CUDA_DRIVER_INCLUDE" = x \
&& test "x$CUDA_DRIVER_LIB" = x; then
PLUGIN_NVPTX=1
PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
PLUGIN_NVPTX_LIBS='-ldl'
PLUGIN_NVPTX_DYNAMIC=1
else
PLUGIN_NVPTX=0
AC_MSG_ERROR([CUDA driver package required for nvptx support])
fi
;;
esac
;;
hsa*)
@ -241,6 +252,8 @@ AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets",
AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
[Define to 1 if the NVIDIA plugin is built, 0 if not.])
AC_DEFINE_UNQUOTED([PLUGIN_NVPTX_DYNAMIC], [$PLUGIN_NVPTX_DYNAMIC],
[Define to 1 if the NVIDIA plugin should dlopen libcuda.so.1, 0 if it should be linked against it.])
AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
[Define to 1 if the HSA plugin is built, 0 if not.])

179
libgomp/plugin/cuda/cuda.h Normal file
View File

@ -0,0 +1,179 @@
/* CUDA API description.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>.
This header provides the minimum amount of typedefs, enums and function
declarations to be able to compile plugin-nvptx.c if cuda.h and
libcuda.so.1 are not available. */
#ifndef GCC_CUDA_H
#define GCC_CUDA_H
#include <stdlib.h>
/* API version advertised by this stand-in header.
   NOTE(review): presumably 8000 stands for CUDA 8.0 -- confirm against
   NVIDIA's cuda.h before relying on version comparisons.  */
#define CUDA_VERSION 8000
/* Handle and scalar types.  In this header every handle is declared as a
   plain void *, and a device ordinal is a plain int.  */
typedef void *CUcontext;
typedef int CUdevice;
/* A device address: unsigned long long when __LP64__ is defined,
   otherwise unsigned.  */
#ifdef __LP64__
typedef unsigned long long CUdeviceptr;
#else
typedef unsigned CUdeviceptr;
#endif
typedef void *CUevent;
typedef void *CUfunction;
typedef void *CUlinkState;
typedef void *CUmodule;
typedef void *CUstream;
/* Enumerations and constants.  Only a subset of enumerators is declared;
   as the comment at the top of this header says, it is the minimum needed
   to compile plugin-nvptx.c.  All values are spelled out explicitly.
   NOTE(review): the values have to agree with what libcuda.so.1 expects;
   verify against NVIDIA's cuda.h when adding or changing an entry.  */

/* Result codes returned by the cu* functions declared in this header.  */
typedef enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_OUT_OF_MEMORY = 2,
CUDA_ERROR_INVALID_CONTEXT = 201,
CUDA_ERROR_NOT_FOUND = 500,
CUDA_ERROR_NOT_READY = 600,
CUDA_ERROR_LAUNCH_FAILED = 719
} CUresult;
/* Attribute selectors accepted by cuDeviceGetAttribute.  */
typedef enum {
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
} CUdevice_attribute;
/* Flags for the unsigned argument of cuEventCreate.  */
enum {
CU_EVENT_DEFAULT = 0,
CU_EVENT_DISABLE_TIMING = 2
};
/* Attribute selectors accepted by cuFuncGetAttribute.  */
typedef enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
CU_FUNC_ATTRIBUTE_NUM_REGS = 4
} CUfunction_attribute;
/* Option keys taken by cuLinkCreate and cuLinkAddData.  */
typedef enum {
CU_JIT_WALL_TIME = 2,
CU_JIT_INFO_LOG_BUFFER = 3,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
CU_JIT_ERROR_LOG_BUFFER = 5,
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
CU_JIT_LOG_VERBOSE = 12
} CUjit_option;
/* Kind of input handed to cuLinkAddData; only PTX is declared here.  */
typedef enum {
CU_JIT_INPUT_PTX = 1
} CUjitInputType;
/* NOTE(review): presumably a flag for the unsigned argument of
   cuCtxCreate -- confirm against the caller.  */
enum {
CU_CTX_SCHED_AUTO = 0
};
/* Marker values placed in the array passed as the last argument of
   cuLaunchKernel: the BUFFER_POINTER and BUFFER_SIZE markers are each
   followed by the corresponding pointer, and END terminates the array.  */
#define CU_LAUNCH_PARAM_END ((void *) 0)
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 1)
#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *) 2)
/* Flags for the unsigned argument of cuStreamCreate.  */
enum {
CU_STREAM_DEFAULT = 0,
CU_STREAM_NON_BLOCKING = 1
};
/* Function declarations.  Where a "#define NAME NAME_v2" precedes a
   prototype, the prototype itself and every later use of NAME are
   rewritten by the preprocessor to the _v2 identifier, so that is the
   symbol name that ends up being referenced.  */

/* Context management.  */
#define cuCtxCreate cuCtxCreate_v2
CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
#define cuCtxDestroy cuCtxDestroy_v2
CUresult cuCtxDestroy (CUcontext);
CUresult cuCtxGetCurrent (CUcontext *);
CUresult cuCtxGetDevice (CUdevice *);
#define cuCtxPopCurrent cuCtxPopCurrent_v2
CUresult cuCtxPopCurrent (CUcontext *);
#define cuCtxPushCurrent cuCtxPushCurrent_v2
CUresult cuCtxPushCurrent (CUcontext);
CUresult cuCtxSynchronize (void);
/* Device enumeration and queries.  */
CUresult cuDeviceGet (CUdevice *, int);
CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
CUresult cuDeviceGetCount (int *);
/* Events.  */
CUresult cuEventCreate (CUevent *, unsigned);
#define cuEventDestroy cuEventDestroy_v2
CUresult cuEventDestroy (CUevent);
CUresult cuEventElapsedTime (float *, CUevent, CUevent);
CUresult cuEventQuery (CUevent);
CUresult cuEventRecord (CUevent, CUstream);
CUresult cuEventSynchronize (CUevent);
/* Function attributes, error strings, initialization and kernel launch.  */
CUresult cuFuncGetAttribute (int *, CUfunction_attribute, CUfunction);
CUresult cuGetErrorString (CUresult, const char **);
CUresult cuInit (unsigned);
CUresult cuLaunchKernel (CUfunction, unsigned, unsigned, unsigned, unsigned,
unsigned, unsigned, unsigned, CUstream, void **, void **);
/* Linker.  */
#define cuLinkAddData cuLinkAddData_v2
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, const char *,
unsigned, CUjit_option *, void **);
CUresult cuLinkComplete (CUlinkState, void **, size_t *);
#define cuLinkCreate cuLinkCreate_v2
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuLinkDestroy (CUlinkState);
/* Memory allocation and copies.  */
#define cuMemAlloc cuMemAlloc_v2
CUresult cuMemAlloc (CUdeviceptr *, size_t);
#define cuMemAllocHost cuMemAllocHost_v2
CUresult cuMemAllocHost (void **, size_t);
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
#define cuMemcpyDtoH cuMemcpyDtoH_v2
CUresult cuMemcpyDtoH (void *, CUdeviceptr, size_t);
#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
#define cuMemcpyHtoD cuMemcpyHtoD_v2
CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
#define cuMemFree cuMemFree_v2
CUresult cuMemFree (CUdeviceptr);
CUresult cuMemFreeHost (void *);
#define cuMemGetAddressRange cuMemGetAddressRange_v2
CUresult cuMemGetAddressRange (CUdeviceptr *, size_t *, CUdeviceptr);
#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
CUresult cuMemHostGetDevicePointer (CUdeviceptr *, void *, unsigned);
/* Modules.  */
CUresult cuModuleGetFunction (CUfunction *, CUmodule, const char *);
#define cuModuleGetGlobal cuModuleGetGlobal_v2
CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
CUresult cuModuleLoad (CUmodule *, const char *);
CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
/* Streams.  */
CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream);
CUresult cuStreamQuery (CUstream);
CUresult cuStreamSynchronize (CUstream);
CUresult cuStreamWaitEvent (CUstream, CUevent, unsigned);
#endif /* GCC_CUDA_H */

View File

@ -48,6 +48,129 @@
#include <assert.h>
#include <errno.h>
#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>
/* List of every CUDA entry point that is resolved at run time when
   PLUGIN_NVPTX_DYNAMIC is set.  This is an X-macro: each entry is wrapped
   in CUDA_ONE_CALL, which is given a different definition at each place
   CUDA_CALLS is expanded (once below to declare the function pointer
   members, once inside init_cuda_lib to look the symbols up).  A function
   called through cuda_lib must be listed here.  */
# define CUDA_CALLS \
CUDA_ONE_CALL (cuCtxCreate) \
CUDA_ONE_CALL (cuCtxDestroy) \
CUDA_ONE_CALL (cuCtxGetCurrent) \
CUDA_ONE_CALL (cuCtxGetDevice) \
CUDA_ONE_CALL (cuCtxPopCurrent) \
CUDA_ONE_CALL (cuCtxPushCurrent) \
CUDA_ONE_CALL (cuCtxSynchronize) \
CUDA_ONE_CALL (cuDeviceGet) \
CUDA_ONE_CALL (cuDeviceGetAttribute) \
CUDA_ONE_CALL (cuDeviceGetCount) \
CUDA_ONE_CALL (cuEventCreate) \
CUDA_ONE_CALL (cuEventDestroy) \
CUDA_ONE_CALL (cuEventElapsedTime) \
CUDA_ONE_CALL (cuEventQuery) \
CUDA_ONE_CALL (cuEventRecord) \
CUDA_ONE_CALL (cuEventSynchronize) \
CUDA_ONE_CALL (cuFuncGetAttribute) \
CUDA_ONE_CALL (cuGetErrorString) \
CUDA_ONE_CALL (cuInit) \
CUDA_ONE_CALL (cuLaunchKernel) \
CUDA_ONE_CALL (cuLinkAddData) \
CUDA_ONE_CALL (cuLinkComplete) \
CUDA_ONE_CALL (cuLinkCreate) \
CUDA_ONE_CALL (cuLinkDestroy) \
CUDA_ONE_CALL (cuMemAlloc) \
CUDA_ONE_CALL (cuMemAllocHost) \
CUDA_ONE_CALL (cuMemcpy) \
CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
CUDA_ONE_CALL (cuMemcpyDtoH) \
CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
CUDA_ONE_CALL (cuMemcpyHtoD) \
CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
CUDA_ONE_CALL (cuMemFree) \
CUDA_ONE_CALL (cuMemFreeHost) \
CUDA_ONE_CALL (cuMemGetAddressRange) \
CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
CUDA_ONE_CALL (cuModuleGetFunction) \
CUDA_ONE_CALL (cuModuleGetGlobal) \
CUDA_ONE_CALL (cuModuleLoad) \
CUDA_ONE_CALL (cuModuleLoadData) \
CUDA_ONE_CALL (cuModuleUnload) \
CUDA_ONE_CALL (cuStreamCreate) \
CUDA_ONE_CALL (cuStreamDestroy) \
CUDA_ONE_CALL (cuStreamQuery) \
CUDA_ONE_CALL (cuStreamSynchronize) \
CUDA_ONE_CALL (cuStreamWaitEvent)
/* First expansion of the list: turn each entry into a struct member that
   is a pointer to a function of the same type as the declared function of
   that name (taken with __typeof), and has the same name.  */
# define CUDA_ONE_CALL(call) \
__typeof (call) *call;
/* Table with one function pointer per entry of CUDA_CALLS.  The pointers
   are filled in by init_cuda_lib.  */
struct cuda_lib_s {
CUDA_CALLS
} cuda_lib;
/* Load status of the CUDA driver library: -1 if init_cuda_lib has not
   been called yet, false if it has been and failed, true if it has been
   and succeeded.

   The type must be explicitly signed.  Plain char is unsigned on some
   targets; there -1 would be stored as 255, the "not called yet" test in
   init_cuda_lib would never match, and the function would report success
   without ever having loaded the library.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA driver library (libcuda.so.1) and initialize
   the function pointers in cuda_lib.  Return false if unsuccessful, true
   if successful.

   The outcome of the first call is remembered in cuda_lib_inited and
   returned by all later calls, so the library is looked up at most once.
   Loading fails if the library cannot be opened or if any symbol listed
   in CUDA_CALLS cannot be found in it.

   NOTE(review): the handle returned by dlopen is not closed when a symbol
   lookup fails; the failure is cached, so this can happen at most once.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;

  const char *cuda_driver_lib = "libcuda.so.1";
  void *h = dlopen (cuda_driver_lib, RTLD_LAZY);

  /* Assume failure until every symbol has been resolved, so that each
     early return below leaves the cached status at false.  */
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

  /* Second expansion of CUDA_CALLS: look up each entry with dlsym.  The
     extra CUDA_ONE_CALL_1 level makes sure the entry is macro-expanded
     before #call turns it into a string, so a name that a header renames
     with a #define (e.g. "#define cuCtxCreate cuCtxCreate_v2") is looked
     up, and stored, under the renamed identifier.  */
# undef CUDA_ONE_CALL
# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
# define CUDA_ONE_CALL_1(call) \
  cuda_lib.call = dlsym (h, #call); \
  if (cuda_lib.call == NULL) \
    return false;
  CUDA_CALLS

  cuda_lib_inited = true;
  return true;
}
/* The helper macros are only needed while expanding CUDA_CALLS above.  */
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
/* With dynamic loading every CUDA function is called through the
   corresponding pointer member of cuda_lib.  */
# define CUDA_CALL_PREFIX cuda_lib.
#else
/* When not loading dynamically the functions are called directly by name,
   so the prefix is empty and there is nothing to load:
   init_cuda_lib () is simply the constant true.  */
# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.

   In all of them FN is the bare name of the CUDA function and the
   remaining arguments are passed on to it.  FN must not be parenthesized:
   it is placed directly after CUDA_CALL_PREFIX, which may expand to
   "cuda_lib." and then needs a member name to follow.  */

/* Call FN; if the result is not CUDA_SUCCESS, report "FN error: ..." with
   GOMP_PLUGIN_error and return ERET from the enclosing function.  */
#define CUDA_CALL_ERET(ERET, FN, ...) \
do { \
unsigned __r \
= CUDA_CALL_PREFIX FN (__VA_ARGS__); \
if (__r != CUDA_SUCCESS) \
{ \
GOMP_PLUGIN_error (#FN " error: %s", \
cuda_error (__r)); \
return ERET; \
} \
} while (0)
/* Same as CUDA_CALL_ERET with false as the value returned on error.  */
#define CUDA_CALL(FN, ...) \
CUDA_CALL_ERET (false, FN, __VA_ARGS__)
/* Call FN; if the result is not CUDA_SUCCESS, report "FN error: ..." with
   GOMP_PLUGIN_fatal.  Unlike CUDA_CALL_ERET, this macro contains no
   return statement of its own.  */
#define CUDA_CALL_ASSERT(FN, ...) \
do { \
unsigned __r \
= CUDA_CALL_PREFIX FN (__VA_ARGS__); \
if (__r != CUDA_SUCCESS) \
{ \
GOMP_PLUGIN_fatal (#FN " error: %s", \
cuda_error (__r)); \
} \
} while (0)
/* Expand to just the call expression, leaving any checking of the result
   to the caller.  For a function without parameters the argument after
   the comma is left empty, e.g. CUDA_CALL_NOCHECK (cuCtxSynchronize, ).  */
#define CUDA_CALL_NOCHECK(FN, ...) \
CUDA_CALL_PREFIX FN (__VA_ARGS__)
static const char *
cuda_error (CUresult r)
{
@ -58,41 +181,13 @@ cuda_error (CUresult r)
#endif
const char *desc;
r = cuGetErrorString (r, &desc);
r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
if (r != CUDA_SUCCESS)
desc = "unknown cuda error";
return desc;
}
/* Convenience macros for the frequently used CUDA library call and
error handling sequence. This does not capture all the cases we
use in this file, but is common enough. */
#define CUDA_CALL_ERET(ERET, FN, ...) \
do { \
unsigned __r = FN (__VA_ARGS__); \
if (__r != CUDA_SUCCESS) \
{ \
GOMP_PLUGIN_error (#FN " error: %s", \
cuda_error (__r)); \
return ERET; \
} \
} while (0)
#define CUDA_CALL(FN, ...) \
CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
#define CUDA_CALL_ASSERT(FN, ...) \
do { \
unsigned __r = FN (__VA_ARGS__); \
if (__r != CUDA_SUCCESS) \
{ \
GOMP_PLUGIN_fatal (#FN " error: %s", \
cuda_error (__r)); \
} \
} while (0)
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
@ -401,7 +496,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
ret &= map_fini (s);
CUresult r = cuStreamDestroy (s->stream);
CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
@ -484,7 +579,8 @@ select_stream_for_async (int async, pthread_t thread, bool create,
s->stream = existing;
else
{
r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
{
pthread_mutex_unlock (&ptx_dev->stream_lock);
@ -554,10 +650,14 @@ nvptx_init (void)
if (instantiated_devices != 0)
return true;
CUDA_CALL (cuInit, 0);
ptx_events = NULL;
pthread_mutex_init (&ptx_event_lock, NULL);
if (!init_cuda_lib ())
return false;
CUDA_CALL (cuInit, 0);
CUDA_CALL (cuDeviceGetCount, &ndevs);
ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
* ndevs);
@ -575,7 +675,7 @@ nvptx_attach_host_thread_to_device (int n)
struct ptx_device *ptx_dev;
CUcontext thd_ctx;
r = cuCtxGetDevice (&dev);
r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
{
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@ -623,7 +723,7 @@ nvptx_open_device (int n)
ptx_dev->dev = dev;
ptx_dev->ctx_shared = false;
r = cuCtxGetDevice (&ctx_dev);
r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
{
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@ -669,7 +769,7 @@ nvptx_open_device (int n)
&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
ptx_dev->clock_khz = pi;
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
ptx_dev->num_sms = pi;
@ -679,7 +779,7 @@ nvptx_open_device (int n)
/* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
in CUDA 6.0 and newer. */
r = cuDeviceGetAttribute (&pi, 82, dev);
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
/* Fallback: use limit of registers per block, which is usually equal. */
if (r == CUDA_ERROR_INVALID_VALUE)
pi = ptx_dev->regs_per_block;
@ -698,8 +798,8 @@ nvptx_open_device (int n)
return NULL;
}
r = cuDeviceGetAttribute (&async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
if (r != CUDA_SUCCESS)
async_engines = 1;
@ -746,7 +846,9 @@ nvptx_get_num_devices (void)
further initialization). */
if (instantiated_devices == 0)
{
CUresult r = cuInit (0);
if (!init_cuda_lib ())
return 0;
CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
/* This is not an error: e.g. we may have CUDA libraries installed but
no devices available. */
if (r != CUDA_SUCCESS)
@ -797,8 +899,9 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
/* cuLinkAddData's 'data' argument erroneously omits the const
qualifier. */
GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
ptx_objs->size, 0, 0, 0, 0);
r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
(char *) ptx_objs->code, ptx_objs->size,
0, 0, 0, 0);
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
@ -809,7 +912,7 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
}
GOMP_PLUGIN_debug (0, "Linking\n");
r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
@ -844,7 +947,7 @@ event_gc (bool memmap_lockable)
if (e->ord != nvthd->ptx_dev->ord)
continue;
r = cuEventQuery (*e->evt);
r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
if (r == CUDA_SUCCESS)
{
bool append_async = false;
@ -877,7 +980,7 @@ event_gc (bool memmap_lockable)
break;
}
cuEventDestroy (*te);
CUDA_CALL_NOCHECK (cuEventDestroy, *te);
free ((void *)te);
/* Unlink 'e' from ptx_events list. */
@ -1015,10 +1118,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
&& cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
&& cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
&& cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
dev) == CUDA_SUCCESS
&& CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
dev) == CUDA_SUCCESS
&& CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
dev) == CUDA_SUCCESS
&& CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
dev) == CUDA_SUCCESS)
{
GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
" dev_size=%d, cpu_size=%d\n",
@ -1090,7 +1197,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
#ifndef DISABLE_ASYNC
if (async < acc_async_noval)
{
r = cuStreamSynchronize (dev_str->stream);
r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@ -1103,7 +1210,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@ -1117,7 +1224,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
}
#else
r = cuCtxSynchronize ();
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@ -1294,7 +1401,7 @@ nvptx_async_test (int async)
if (!s)
GOMP_PLUGIN_fatal ("unknown async %d", async);
r = cuStreamQuery (s->stream);
r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
if (r == CUDA_SUCCESS)
{
/* The oacc-parallel.c:goacc_wait function calls this hook to determine
@ -1325,7 +1432,8 @@ nvptx_async_test_all (void)
for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
{
if ((s->multithreaded || pthread_equal (s->host_thread, self))
&& cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
&& CUDA_CALL_NOCHECK (cuStreamQuery,
s->stream) == CUDA_ERROR_NOT_READY)
{
pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
return 0;
@ -1400,7 +1508,7 @@ nvptx_wait_all (void)
{
if (s->multithreaded || pthread_equal (s->host_thread, self))
{
r = cuStreamQuery (s->stream);
r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
if (r == CUDA_SUCCESS)
continue;
else if (r != CUDA_ERROR_NOT_READY)
@ -1632,13 +1740,15 @@ static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
CUdeviceptr dptr;
CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
module, "__nvptx_clocktick");
if (r == CUDA_ERROR_NOT_FOUND)
return;
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
double __nvptx_clocktick = 1e-3 / dev->clock_khz;
r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
sizeof (__nvptx_clocktick));
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
@ -1761,7 +1871,7 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
if (image->target_data == target_data)
{
*prev_p = image->next;
if (cuModuleUnload (image->module) != CUDA_SUCCESS)
if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
ret = false;
free (image->fns);
free (image);
@ -1974,7 +2084,7 @@ static void *
nvptx_stacks_alloc (size_t size, int num)
{
CUdeviceptr stacks;
CUresult r = cuMemAlloc (&stacks, size * num);
CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
return (void *) stacks;
@ -1985,7 +2095,7 @@ nvptx_stacks_alloc (size_t size, int num)
static void
nvptx_stacks_free (void *p, int num)
{
CUresult r = cuMemFree ((CUdeviceptr) p);
CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
@ -2028,14 +2138,13 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
CU_LAUNCH_PARAM_END
};
r = cuLaunchKernel (function,
teams, 1, 1,
32, threads, 1,
0, ptx_dev->null_stream->stream, NULL, config);
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
32, threads, 1, 0, ptx_dev->null_stream->stream,
NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
r = cuCtxSynchronize ();
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);