configfrag.ac: For --without-cuda-driver don't initialize CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.
* plugin/configfrag.ac: For --without-cuda-driver don't initialize CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB. If both CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking small cuda program fails, define PLUGIN_NVPTX_DYNAMIC to 1 and use plugin/cuda as include dir and -ldl instead of -lcuda as library to link ptx plugin against. * plugin/plugin-nvptx.c: Include dlfcn.h if PLUGIN_NVPTX_DYNAMIC. (CUDA_CALLS): Define. (cuda_lib, cuda_lib_inited): New variables. (init_cuda_lib): New function. (CUDA_CALL_PREFIX): Define. (CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use CUDA_CALL_PREFIX. (CUDA_CALL): Use FN instead of (FN). (CUDA_CALL_NOCHECK): Define. (cuda_error, fini_streams_for_device, select_stream_for_async, nvptx_attach_host_thread_to_device, nvptx_open_device, link_ptx, event_gc, nvptx_exec, nvptx_async_test, nvptx_async_test_all, nvptx_wait_all, nvptx_set_clocktick, GOMP_OFFLOAD_unload_image, nvptx_stacks_alloc, nvptx_stacks_free, GOMP_OFFLOAD_run): Use CUDA_CALL_NOCHECK. (nvptx_init): Call init_cuda_lib, if it fails, return false. Use CUDA_CALL_NOCHECK. (nvptx_get_num_devices): Call init_cuda_lib, if it fails, return 0. Use CUDA_CALL_NOCHECK. * plugin/cuda/cuda.h: New file. * config.h.in: Regenerated. * configure: Regenerated. From-SVN: r244522
This commit is contained in:
parent
3c36aa6ba2
commit
2393d337e7
|
@ -1,5 +1,33 @@
|
||||||
2017-01-17 Jakub Jelinek <jakub@redhat.com>
|
2017-01-17 Jakub Jelinek <jakub@redhat.com>
|
||||||
|
|
||||||
|
* plugin/configfrag.ac: For --without-cuda-driver don't initialize
|
||||||
|
CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB. If both
|
||||||
|
CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking small
|
||||||
|
cuda program fails, define PLUGIN_NVPTX_DYNAMIC to 1 and use
|
||||||
|
plugin/cuda as include dir and -ldl instead of -lcuda as
|
||||||
|
library to link ptx plugin against.
|
||||||
|
* plugin/plugin-nvptx.c: Include dlfcn.h if PLUGIN_NVPTX_DYNAMIC.
|
||||||
|
(CUDA_CALLS): Define.
|
||||||
|
(cuda_lib, cuda_lib_inited): New variables.
|
||||||
|
(init_cuda_lib): New function.
|
||||||
|
(CUDA_CALL_PREFIX): Define.
|
||||||
|
(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use CUDA_CALL_PREFIX.
|
||||||
|
(CUDA_CALL): Use FN instead of (FN).
|
||||||
|
(CUDA_CALL_NOCHECK): Define.
|
||||||
|
(cuda_error, fini_streams_for_device, select_stream_for_async,
|
||||||
|
nvptx_attach_host_thread_to_device, nvptx_open_device, link_ptx,
|
||||||
|
event_gc, nvptx_exec, nvptx_async_test, nvptx_async_test_all,
|
||||||
|
nvptx_wait_all, nvptx_set_clocktick, GOMP_OFFLOAD_unload_image,
|
||||||
|
nvptx_stacks_alloc, nvptx_stacks_free, GOMP_OFFLOAD_run): Use
|
||||||
|
CUDA_CALL_NOCHECK.
|
||||||
|
(nvptx_init): Call init_cuda_lib, if it fails, return false. Use
|
||||||
|
CUDA_CALL_NOCHECK.
|
||||||
|
(nvptx_get_num_devices): Call init_cuda_lib, if it fails, return 0.
|
||||||
|
Use CUDA_CALL_NOCHECK.
|
||||||
|
* plugin/cuda/cuda.h: New file.
|
||||||
|
* config.h.in: Regenerated.
|
||||||
|
* configure: Regenerated.
|
||||||
|
|
||||||
PR other/79046
|
PR other/79046
|
||||||
* configure.ac: Add GCC_BASE_VER.
|
* configure.ac: Add GCC_BASE_VER.
|
||||||
* Makefile.am (gcc_version): Use @get_gcc_base_ver@ instead of cat to
|
* Makefile.am (gcc_version): Use @get_gcc_base_ver@ instead of cat to
|
||||||
|
|
|
@ -155,6 +155,10 @@
|
||||||
/* Define to 1 if the NVIDIA plugin is built, 0 if not. */
|
/* Define to 1 if the NVIDIA plugin is built, 0 if not. */
|
||||||
#undef PLUGIN_NVPTX
|
#undef PLUGIN_NVPTX
|
||||||
|
|
||||||
|
/* Define to 1 if the NVIDIA plugin should dlopen libcuda.so.1, 0 if it should
|
||||||
|
be linked against it. */
|
||||||
|
#undef PLUGIN_NVPTX_DYNAMIC
|
||||||
|
|
||||||
/* Define if all infrastructure, needed for plugins, is supported. */
|
/* Define if all infrastructure, needed for plugins, is supported. */
|
||||||
#undef PLUGIN_SUPPORT
|
#undef PLUGIN_SUPPORT
|
||||||
|
|
||||||
|
|
|
@ -15299,10 +15299,12 @@ if test "${with_cuda_driver_lib+set}" = set; then :
|
||||||
withval=$with_cuda_driver_lib;
|
withval=$with_cuda_driver_lib;
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if test "x$with_cuda_driver" != x; then
|
case "x$with_cuda_driver" in
|
||||||
CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
|
x | xno) ;;
|
||||||
CUDA_DRIVER_LIB=$with_cuda_driver/lib
|
*) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
|
||||||
fi
|
CUDA_DRIVER_LIB=$with_cuda_driver/lib
|
||||||
|
;;
|
||||||
|
esac
|
||||||
if test "x$with_cuda_driver_include" != x; then
|
if test "x$with_cuda_driver_include" != x; then
|
||||||
CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
|
CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
|
||||||
fi
|
fi
|
||||||
|
@ -15320,6 +15322,7 @@ PLUGIN_NVPTX=0
|
||||||
PLUGIN_NVPTX_CPPFLAGS=
|
PLUGIN_NVPTX_CPPFLAGS=
|
||||||
PLUGIN_NVPTX_LDFLAGS=
|
PLUGIN_NVPTX_LDFLAGS=
|
||||||
PLUGIN_NVPTX_LIBS=
|
PLUGIN_NVPTX_LIBS=
|
||||||
|
PLUGIN_NVPTX_DYNAMIC=0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -15426,9 +15429,17 @@ rm -f core conftest.err conftest.$ac_objext \
|
||||||
LIBS=$PLUGIN_NVPTX_save_LIBS
|
LIBS=$PLUGIN_NVPTX_save_LIBS
|
||||||
case $PLUGIN_NVPTX in
|
case $PLUGIN_NVPTX in
|
||||||
nvptx*)
|
nvptx*)
|
||||||
PLUGIN_NVPTX=0
|
if test "x$CUDA_DRIVER_INCLUDE" = x \
|
||||||
as_fn_error "CUDA driver package required for nvptx support" "$LINENO" 5
|
&& test "x$CUDA_DRIVER_LIB" = x; then
|
||||||
;;
|
PLUGIN_NVPTX=1
|
||||||
|
PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
|
||||||
|
PLUGIN_NVPTX_LIBS='-ldl'
|
||||||
|
PLUGIN_NVPTX_DYNAMIC=1
|
||||||
|
else
|
||||||
|
PLUGIN_NVPTX=0
|
||||||
|
as_fn_error "CUDA driver package required for nvptx support" "$LINENO" 5
|
||||||
|
fi
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
;;
|
;;
|
||||||
hsa*)
|
hsa*)
|
||||||
|
@ -15513,6 +15524,11 @@ cat >>confdefs.h <<_ACEOF
|
||||||
#define PLUGIN_NVPTX $PLUGIN_NVPTX
|
#define PLUGIN_NVPTX $PLUGIN_NVPTX
|
||||||
_ACEOF
|
_ACEOF
|
||||||
|
|
||||||
|
|
||||||
|
cat >>confdefs.h <<_ACEOF
|
||||||
|
#define PLUGIN_NVPTX_DYNAMIC $PLUGIN_NVPTX_DYNAMIC
|
||||||
|
_ACEOF
|
||||||
|
|
||||||
if test $PLUGIN_HSA = 1; then
|
if test $PLUGIN_HSA = 1; then
|
||||||
PLUGIN_HSA_TRUE=
|
PLUGIN_HSA_TRUE=
|
||||||
PLUGIN_HSA_FALSE='#'
|
PLUGIN_HSA_FALSE='#'
|
||||||
|
|
|
@ -58,10 +58,12 @@ AC_ARG_WITH(cuda-driver-include,
|
||||||
AC_ARG_WITH(cuda-driver-lib,
|
AC_ARG_WITH(cuda-driver-lib,
|
||||||
[AS_HELP_STRING([--with-cuda-driver-lib=PATH],
|
[AS_HELP_STRING([--with-cuda-driver-lib=PATH],
|
||||||
[specify directory for the installed CUDA driver library])])
|
[specify directory for the installed CUDA driver library])])
|
||||||
if test "x$with_cuda_driver" != x; then
|
case "x$with_cuda_driver" in
|
||||||
CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
|
x | xno) ;;
|
||||||
CUDA_DRIVER_LIB=$with_cuda_driver/lib
|
*) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
|
||||||
fi
|
CUDA_DRIVER_LIB=$with_cuda_driver/lib
|
||||||
|
;;
|
||||||
|
esac
|
||||||
if test "x$with_cuda_driver_include" != x; then
|
if test "x$with_cuda_driver_include" != x; then
|
||||||
CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
|
CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
|
||||||
fi
|
fi
|
||||||
|
@ -79,6 +81,7 @@ PLUGIN_NVPTX=0
|
||||||
PLUGIN_NVPTX_CPPFLAGS=
|
PLUGIN_NVPTX_CPPFLAGS=
|
||||||
PLUGIN_NVPTX_LDFLAGS=
|
PLUGIN_NVPTX_LDFLAGS=
|
||||||
PLUGIN_NVPTX_LIBS=
|
PLUGIN_NVPTX_LIBS=
|
||||||
|
PLUGIN_NVPTX_DYNAMIC=0
|
||||||
AC_SUBST(PLUGIN_NVPTX)
|
AC_SUBST(PLUGIN_NVPTX)
|
||||||
AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
|
AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
|
||||||
AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
|
AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
|
||||||
|
@ -167,9 +170,17 @@ if test x"$enable_offload_targets" != x; then
|
||||||
LIBS=$PLUGIN_NVPTX_save_LIBS
|
LIBS=$PLUGIN_NVPTX_save_LIBS
|
||||||
case $PLUGIN_NVPTX in
|
case $PLUGIN_NVPTX in
|
||||||
nvptx*)
|
nvptx*)
|
||||||
PLUGIN_NVPTX=0
|
if test "x$CUDA_DRIVER_INCLUDE" = x \
|
||||||
AC_MSG_ERROR([CUDA driver package required for nvptx support])
|
&& test "x$CUDA_DRIVER_LIB" = x; then
|
||||||
;;
|
PLUGIN_NVPTX=1
|
||||||
|
PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
|
||||||
|
PLUGIN_NVPTX_LIBS='-ldl'
|
||||||
|
PLUGIN_NVPTX_DYNAMIC=1
|
||||||
|
else
|
||||||
|
PLUGIN_NVPTX=0
|
||||||
|
AC_MSG_ERROR([CUDA driver package required for nvptx support])
|
||||||
|
fi
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
;;
|
;;
|
||||||
hsa*)
|
hsa*)
|
||||||
|
@ -241,6 +252,8 @@ AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets",
|
||||||
AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
|
AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
|
||||||
AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
|
AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
|
||||||
[Define to 1 if the NVIDIA plugin is built, 0 if not.])
|
[Define to 1 if the NVIDIA plugin is built, 0 if not.])
|
||||||
|
AC_DEFINE_UNQUOTED([PLUGIN_NVPTX_DYNAMIC], [$PLUGIN_NVPTX_DYNAMIC],
|
||||||
|
[Define to 1 if the NVIDIA plugin should dlopen libcuda.so.1, 0 if it should be linked against it.])
|
||||||
AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
|
AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
|
||||||
AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
|
AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
|
||||||
[Define to 1 if the HSA plugin is built, 0 if not.])
|
[Define to 1 if the HSA plugin is built, 0 if not.])
|
||||||
|
|
|
@ -0,0 +1,179 @@
|
||||||
|
/* CUDA API description.
|
||||||
|
Copyright (C) 2017 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of GCC.
|
||||||
|
|
||||||
|
GCC is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3, or (at your option)
|
||||||
|
any later version.
|
||||||
|
|
||||||
|
GCC is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
Under Section 7 of GPL version 3, you are granted additional
|
||||||
|
permissions described in the GCC Runtime Library Exception, version
|
||||||
|
3.1, as published by the Free Software Foundation.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License and
|
||||||
|
a copy of the GCC Runtime Library Exception along with this program;
|
||||||
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||||
|
<http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
This header provides the minimum amount of typedefs, enums and function
|
||||||
|
declarations to be able to compile plugin-nvptx.c if cuda.h and
|
||||||
|
libcuda.so.1 are not available. */
|
||||||
|
|
||||||
|
#ifndef GCC_CUDA_H
|
||||||
|
#define GCC_CUDA_H
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#define CUDA_VERSION 8000
|
||||||
|
|
||||||
|
typedef void *CUcontext;
|
||||||
|
typedef int CUdevice;
|
||||||
|
#ifdef __LP64__
|
||||||
|
typedef unsigned long long CUdeviceptr;
|
||||||
|
#else
|
||||||
|
typedef unsigned CUdeviceptr;
|
||||||
|
#endif
|
||||||
|
typedef void *CUevent;
|
||||||
|
typedef void *CUfunction;
|
||||||
|
typedef void *CUlinkState;
|
||||||
|
typedef void *CUmodule;
|
||||||
|
typedef void *CUstream;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
CUDA_SUCCESS = 0,
|
||||||
|
CUDA_ERROR_INVALID_VALUE = 1,
|
||||||
|
CUDA_ERROR_OUT_OF_MEMORY = 2,
|
||||||
|
CUDA_ERROR_INVALID_CONTEXT = 201,
|
||||||
|
CUDA_ERROR_NOT_FOUND = 500,
|
||||||
|
CUDA_ERROR_NOT_READY = 600,
|
||||||
|
CUDA_ERROR_LAUNCH_FAILED = 719
|
||||||
|
} CUresult;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
|
||||||
|
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
|
||||||
|
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
|
||||||
|
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
|
||||||
|
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
|
||||||
|
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
|
||||||
|
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
|
||||||
|
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
|
||||||
|
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
|
||||||
|
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
|
||||||
|
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
|
||||||
|
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
|
||||||
|
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
|
||||||
|
} CUdevice_attribute;
|
||||||
|
|
||||||
|
enum {
|
||||||
|
CU_EVENT_DEFAULT = 0,
|
||||||
|
CU_EVENT_DISABLE_TIMING = 2
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
|
||||||
|
CU_FUNC_ATTRIBUTE_NUM_REGS = 4
|
||||||
|
} CUfunction_attribute;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
CU_JIT_WALL_TIME = 2,
|
||||||
|
CU_JIT_INFO_LOG_BUFFER = 3,
|
||||||
|
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
|
||||||
|
CU_JIT_ERROR_LOG_BUFFER = 5,
|
||||||
|
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
|
||||||
|
CU_JIT_LOG_VERBOSE = 12
|
||||||
|
} CUjit_option;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
CU_JIT_INPUT_PTX = 1
|
||||||
|
} CUjitInputType;
|
||||||
|
|
||||||
|
enum {
|
||||||
|
CU_CTX_SCHED_AUTO = 0
|
||||||
|
};
|
||||||
|
|
||||||
|
#define CU_LAUNCH_PARAM_END ((void *) 0)
|
||||||
|
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 1)
|
||||||
|
#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *) 2)
|
||||||
|
|
||||||
|
enum {
|
||||||
|
CU_STREAM_DEFAULT = 0,
|
||||||
|
CU_STREAM_NON_BLOCKING = 1
|
||||||
|
};
|
||||||
|
|
||||||
|
#define cuCtxCreate cuCtxCreate_v2
|
||||||
|
CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
|
||||||
|
#define cuCtxDestroy cuCtxDestroy_v2
|
||||||
|
CUresult cuCtxDestroy (CUcontext);
|
||||||
|
CUresult cuCtxGetCurrent (CUcontext *);
|
||||||
|
CUresult cuCtxGetDevice (CUdevice *);
|
||||||
|
#define cuCtxPopCurrent cuCtxPopCurrent_v2
|
||||||
|
CUresult cuCtxPopCurrent (CUcontext *);
|
||||||
|
#define cuCtxPushCurrent cuCtxPushCurrent_v2
|
||||||
|
CUresult cuCtxPushCurrent (CUcontext);
|
||||||
|
CUresult cuCtxSynchronize (void);
|
||||||
|
CUresult cuDeviceGet (CUdevice *, int);
|
||||||
|
CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
|
||||||
|
CUresult cuDeviceGetCount (int *);
|
||||||
|
CUresult cuEventCreate (CUevent *, unsigned);
|
||||||
|
#define cuEventDestroy cuEventDestroy_v2
|
||||||
|
CUresult cuEventDestroy (CUevent);
|
||||||
|
CUresult cuEventElapsedTime (float *, CUevent, CUevent);
|
||||||
|
CUresult cuEventQuery (CUevent);
|
||||||
|
CUresult cuEventRecord (CUevent, CUstream);
|
||||||
|
CUresult cuEventSynchronize (CUevent);
|
||||||
|
CUresult cuFuncGetAttribute (int *, CUfunction_attribute, CUfunction);
|
||||||
|
CUresult cuGetErrorString (CUresult, const char **);
|
||||||
|
CUresult cuInit (unsigned);
|
||||||
|
CUresult cuLaunchKernel (CUfunction, unsigned, unsigned, unsigned, unsigned,
|
||||||
|
unsigned, unsigned, unsigned, CUstream, void **, void **);
|
||||||
|
#define cuLinkAddData cuLinkAddData_v2
|
||||||
|
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, const char *,
|
||||||
|
unsigned, CUjit_option *, void **);
|
||||||
|
CUresult cuLinkComplete (CUlinkState, void **, size_t *);
|
||||||
|
#define cuLinkCreate cuLinkCreate_v2
|
||||||
|
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
|
||||||
|
CUresult cuLinkDestroy (CUlinkState);
|
||||||
|
#define cuMemAlloc cuMemAlloc_v2
|
||||||
|
CUresult cuMemAlloc (CUdeviceptr *, size_t);
|
||||||
|
#define cuMemAllocHost cuMemAllocHost_v2
|
||||||
|
CUresult cuMemAllocHost (void **, size_t);
|
||||||
|
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
|
||||||
|
#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
|
||||||
|
CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
|
||||||
|
#define cuMemcpyDtoH cuMemcpyDtoH_v2
|
||||||
|
CUresult cuMemcpyDtoH (void *, CUdeviceptr, size_t);
|
||||||
|
#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
|
||||||
|
CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
|
||||||
|
#define cuMemcpyHtoD cuMemcpyHtoD_v2
|
||||||
|
CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
|
||||||
|
#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
|
||||||
|
CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
|
||||||
|
#define cuMemFree cuMemFree_v2
|
||||||
|
CUresult cuMemFree (CUdeviceptr);
|
||||||
|
CUresult cuMemFreeHost (void *);
|
||||||
|
#define cuMemGetAddressRange cuMemGetAddressRange_v2
|
||||||
|
CUresult cuMemGetAddressRange (CUdeviceptr *, size_t *, CUdeviceptr);
|
||||||
|
#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
|
||||||
|
CUresult cuMemHostGetDevicePointer (CUdeviceptr *, void *, unsigned);
|
||||||
|
CUresult cuModuleGetFunction (CUfunction *, CUmodule, const char *);
|
||||||
|
#define cuModuleGetGlobal cuModuleGetGlobal_v2
|
||||||
|
CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
|
||||||
|
CUresult cuModuleLoad (CUmodule *, const char *);
|
||||||
|
CUresult cuModuleLoadData (CUmodule *, const void *);
|
||||||
|
CUresult cuModuleUnload (CUmodule);
|
||||||
|
CUresult cuStreamCreate (CUstream *, unsigned);
|
||||||
|
#define cuStreamDestroy cuStreamDestroy_v2
|
||||||
|
CUresult cuStreamDestroy (CUstream);
|
||||||
|
CUresult cuStreamQuery (CUstream);
|
||||||
|
CUresult cuStreamSynchronize (CUstream);
|
||||||
|
CUresult cuStreamWaitEvent (CUstream, CUevent, unsigned);
|
||||||
|
|
||||||
|
#endif /* GCC_CUDA_H */
|
|
@ -48,6 +48,129 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
|
||||||
|
#if PLUGIN_NVPTX_DYNAMIC
|
||||||
|
# include <dlfcn.h>
|
||||||
|
|
||||||
|
# define CUDA_CALLS \
|
||||||
|
CUDA_ONE_CALL (cuCtxCreate) \
|
||||||
|
CUDA_ONE_CALL (cuCtxDestroy) \
|
||||||
|
CUDA_ONE_CALL (cuCtxGetCurrent) \
|
||||||
|
CUDA_ONE_CALL (cuCtxGetDevice) \
|
||||||
|
CUDA_ONE_CALL (cuCtxPopCurrent) \
|
||||||
|
CUDA_ONE_CALL (cuCtxPushCurrent) \
|
||||||
|
CUDA_ONE_CALL (cuCtxSynchronize) \
|
||||||
|
CUDA_ONE_CALL (cuDeviceGet) \
|
||||||
|
CUDA_ONE_CALL (cuDeviceGetAttribute) \
|
||||||
|
CUDA_ONE_CALL (cuDeviceGetCount) \
|
||||||
|
CUDA_ONE_CALL (cuEventCreate) \
|
||||||
|
CUDA_ONE_CALL (cuEventDestroy) \
|
||||||
|
CUDA_ONE_CALL (cuEventElapsedTime) \
|
||||||
|
CUDA_ONE_CALL (cuEventQuery) \
|
||||||
|
CUDA_ONE_CALL (cuEventRecord) \
|
||||||
|
CUDA_ONE_CALL (cuEventSynchronize) \
|
||||||
|
CUDA_ONE_CALL (cuFuncGetAttribute) \
|
||||||
|
CUDA_ONE_CALL (cuGetErrorString) \
|
||||||
|
CUDA_ONE_CALL (cuInit) \
|
||||||
|
CUDA_ONE_CALL (cuLaunchKernel) \
|
||||||
|
CUDA_ONE_CALL (cuLinkAddData) \
|
||||||
|
CUDA_ONE_CALL (cuLinkComplete) \
|
||||||
|
CUDA_ONE_CALL (cuLinkCreate) \
|
||||||
|
CUDA_ONE_CALL (cuLinkDestroy) \
|
||||||
|
CUDA_ONE_CALL (cuMemAlloc) \
|
||||||
|
CUDA_ONE_CALL (cuMemAllocHost) \
|
||||||
|
CUDA_ONE_CALL (cuMemcpy) \
|
||||||
|
CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
|
||||||
|
CUDA_ONE_CALL (cuMemcpyDtoH) \
|
||||||
|
CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
|
||||||
|
CUDA_ONE_CALL (cuMemcpyHtoD) \
|
||||||
|
CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
|
||||||
|
CUDA_ONE_CALL (cuMemFree) \
|
||||||
|
CUDA_ONE_CALL (cuMemFreeHost) \
|
||||||
|
CUDA_ONE_CALL (cuMemGetAddressRange) \
|
||||||
|
CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
|
||||||
|
CUDA_ONE_CALL (cuModuleGetFunction) \
|
||||||
|
CUDA_ONE_CALL (cuModuleGetGlobal) \
|
||||||
|
CUDA_ONE_CALL (cuModuleLoad) \
|
||||||
|
CUDA_ONE_CALL (cuModuleLoadData) \
|
||||||
|
CUDA_ONE_CALL (cuModuleUnload) \
|
||||||
|
CUDA_ONE_CALL (cuStreamCreate) \
|
||||||
|
CUDA_ONE_CALL (cuStreamDestroy) \
|
||||||
|
CUDA_ONE_CALL (cuStreamQuery) \
|
||||||
|
CUDA_ONE_CALL (cuStreamSynchronize) \
|
||||||
|
CUDA_ONE_CALL (cuStreamWaitEvent)
|
||||||
|
# define CUDA_ONE_CALL(call) \
|
||||||
|
__typeof (call) *call;
|
||||||
|
struct cuda_lib_s {
|
||||||
|
CUDA_CALLS
|
||||||
|
} cuda_lib;
|
||||||
|
|
||||||
|
/* -1 if init_cuda_lib has not been called yet, false
|
||||||
|
if it has been and failed, true if it has been and succeeded. */
|
||||||
|
/* Must be explicitly signed: plain char is unsigned on some ABIs, where the
   -1 "not yet called" sentinel would be stored as 255, never compare equal
   to -1, and be returned as "true" by the first init_cuda_lib call.  */
static signed char cuda_lib_inited = -1;
|
||||||
|
|
||||||
|
/* Dynamically load the CUDA runtime library and initialize function
|
||||||
|
pointers, return false if unsuccessful, true if successful. */
|
||||||
|
static bool
|
||||||
|
init_cuda_lib (void)
|
||||||
|
{
|
||||||
|
if (cuda_lib_inited != -1)
|
||||||
|
return cuda_lib_inited;
|
||||||
|
const char *cuda_runtime_lib = "libcuda.so.1";
|
||||||
|
void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
|
||||||
|
cuda_lib_inited = false;
|
||||||
|
if (h == NULL)
|
||||||
|
return false;
|
||||||
|
# undef CUDA_ONE_CALL
|
||||||
|
# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
|
||||||
|
# define CUDA_ONE_CALL_1(call) \
|
||||||
|
cuda_lib.call = dlsym (h, #call); \
|
||||||
|
if (cuda_lib.call == NULL) \
|
||||||
|
return false;
|
||||||
|
CUDA_CALLS
|
||||||
|
cuda_lib_inited = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
# undef CUDA_ONE_CALL
|
||||||
|
# undef CUDA_ONE_CALL_1
|
||||||
|
# define CUDA_CALL_PREFIX cuda_lib.
|
||||||
|
#else
|
||||||
|
# define CUDA_CALL_PREFIX
|
||||||
|
# define init_cuda_lib() true
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Convenience macros for the frequently used CUDA library call and
|
||||||
|
error handling sequence as well as CUDA library calls that
|
||||||
|
do the error checking themselves or don't do it at all. */
|
||||||
|
|
||||||
|
#define CUDA_CALL_ERET(ERET, FN, ...) \
|
||||||
|
do { \
|
||||||
|
unsigned __r \
|
||||||
|
= CUDA_CALL_PREFIX FN (__VA_ARGS__); \
|
||||||
|
if (__r != CUDA_SUCCESS) \
|
||||||
|
{ \
|
||||||
|
GOMP_PLUGIN_error (#FN " error: %s", \
|
||||||
|
cuda_error (__r)); \
|
||||||
|
return ERET; \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define CUDA_CALL(FN, ...) \
|
||||||
|
CUDA_CALL_ERET (false, FN, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define CUDA_CALL_ASSERT(FN, ...) \
|
||||||
|
do { \
|
||||||
|
unsigned __r \
|
||||||
|
= CUDA_CALL_PREFIX FN (__VA_ARGS__); \
|
||||||
|
if (__r != CUDA_SUCCESS) \
|
||||||
|
{ \
|
||||||
|
GOMP_PLUGIN_fatal (#FN " error: %s", \
|
||||||
|
cuda_error (__r)); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define CUDA_CALL_NOCHECK(FN, ...) \
|
||||||
|
CUDA_CALL_PREFIX FN (__VA_ARGS__)
|
||||||
|
|
||||||
static const char *
|
static const char *
|
||||||
cuda_error (CUresult r)
|
cuda_error (CUresult r)
|
||||||
{
|
{
|
||||||
|
@ -58,41 +181,13 @@ cuda_error (CUresult r)
|
||||||
#endif
|
#endif
|
||||||
const char *desc;
|
const char *desc;
|
||||||
|
|
||||||
r = cuGetErrorString (r, &desc);
|
r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
desc = "unknown cuda error";
|
desc = "unknown cuda error";
|
||||||
|
|
||||||
return desc;
|
return desc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convenience macros for the frequently used CUDA library call and
|
|
||||||
error handling sequence. This does not capture all the cases we
|
|
||||||
use in this file, but is common enough. */
|
|
||||||
|
|
||||||
#define CUDA_CALL_ERET(ERET, FN, ...) \
|
|
||||||
do { \
|
|
||||||
unsigned __r = FN (__VA_ARGS__); \
|
|
||||||
if (__r != CUDA_SUCCESS) \
|
|
||||||
{ \
|
|
||||||
GOMP_PLUGIN_error (#FN " error: %s", \
|
|
||||||
cuda_error (__r)); \
|
|
||||||
return ERET; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#define CUDA_CALL(FN, ...) \
|
|
||||||
CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
|
|
||||||
|
|
||||||
#define CUDA_CALL_ASSERT(FN, ...) \
|
|
||||||
do { \
|
|
||||||
unsigned __r = FN (__VA_ARGS__); \
|
|
||||||
if (__r != CUDA_SUCCESS) \
|
|
||||||
{ \
|
|
||||||
GOMP_PLUGIN_fatal (#FN " error: %s", \
|
|
||||||
cuda_error (__r)); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
static unsigned int instantiated_devices = 0;
|
static unsigned int instantiated_devices = 0;
|
||||||
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
|
@ -401,7 +496,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
|
||||||
|
|
||||||
ret &= map_fini (s);
|
ret &= map_fini (s);
|
||||||
|
|
||||||
CUresult r = cuStreamDestroy (s->stream);
|
CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
{
|
{
|
||||||
GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
|
GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
|
||||||
|
@ -484,7 +579,8 @@ select_stream_for_async (int async, pthread_t thread, bool create,
|
||||||
s->stream = existing;
|
s->stream = existing;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
|
r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
|
||||||
|
CU_STREAM_DEFAULT);
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
{
|
{
|
||||||
pthread_mutex_unlock (&ptx_dev->stream_lock);
|
pthread_mutex_unlock (&ptx_dev->stream_lock);
|
||||||
|
@ -554,10 +650,14 @@ nvptx_init (void)
|
||||||
if (instantiated_devices != 0)
|
if (instantiated_devices != 0)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
CUDA_CALL (cuInit, 0);
|
|
||||||
ptx_events = NULL;
|
ptx_events = NULL;
|
||||||
pthread_mutex_init (&ptx_event_lock, NULL);
|
pthread_mutex_init (&ptx_event_lock, NULL);
|
||||||
|
|
||||||
|
if (!init_cuda_lib ())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
CUDA_CALL (cuInit, 0);
|
||||||
|
|
||||||
CUDA_CALL (cuDeviceGetCount, &ndevs);
|
CUDA_CALL (cuDeviceGetCount, &ndevs);
|
||||||
ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
|
ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
|
||||||
* ndevs);
|
* ndevs);
|
||||||
|
@ -575,7 +675,7 @@ nvptx_attach_host_thread_to_device (int n)
|
||||||
struct ptx_device *ptx_dev;
|
struct ptx_device *ptx_dev;
|
||||||
CUcontext thd_ctx;
|
CUcontext thd_ctx;
|
||||||
|
|
||||||
r = cuCtxGetDevice (&dev);
|
r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
|
||||||
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
|
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
|
||||||
{
|
{
|
||||||
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
|
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
|
||||||
|
@ -623,7 +723,7 @@ nvptx_open_device (int n)
|
||||||
ptx_dev->dev = dev;
|
ptx_dev->dev = dev;
|
||||||
ptx_dev->ctx_shared = false;
|
ptx_dev->ctx_shared = false;
|
||||||
|
|
||||||
r = cuCtxGetDevice (&ctx_dev);
|
r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
|
||||||
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
|
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
|
||||||
{
|
{
|
||||||
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
|
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
|
||||||
|
@ -669,7 +769,7 @@ nvptx_open_device (int n)
|
||||||
&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
|
&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
|
||||||
ptx_dev->clock_khz = pi;
|
ptx_dev->clock_khz = pi;
|
||||||
|
|
||||||
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
|
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
|
||||||
&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
|
&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
|
||||||
ptx_dev->num_sms = pi;
|
ptx_dev->num_sms = pi;
|
||||||
|
|
||||||
|
@ -679,7 +779,7 @@ nvptx_open_device (int n)
|
||||||
|
|
||||||
/* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
|
/* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
|
||||||
in CUDA 6.0 and newer. */
|
in CUDA 6.0 and newer. */
|
||||||
r = cuDeviceGetAttribute (&pi, 82, dev);
|
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
|
||||||
/* Fallback: use limit of registers per block, which is usually equal. */
|
/* Fallback: use limit of registers per block, which is usually equal. */
|
||||||
if (r == CUDA_ERROR_INVALID_VALUE)
|
if (r == CUDA_ERROR_INVALID_VALUE)
|
||||||
pi = ptx_dev->regs_per_block;
|
pi = ptx_dev->regs_per_block;
|
||||||
|
@ -698,8 +798,8 @@ nvptx_open_device (int n)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
r = cuDeviceGetAttribute (&async_engines,
|
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
|
||||||
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
|
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
async_engines = 1;
|
async_engines = 1;
|
||||||
|
|
||||||
|
@ -746,7 +846,9 @@ nvptx_get_num_devices (void)
|
||||||
further initialization). */
|
further initialization). */
|
||||||
if (instantiated_devices == 0)
|
if (instantiated_devices == 0)
|
||||||
{
|
{
|
||||||
CUresult r = cuInit (0);
|
if (!init_cuda_lib ())
|
||||||
|
return 0;
|
||||||
|
CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
|
||||||
/* This is not an error: e.g. we may have CUDA libraries installed but
|
/* This is not an error: e.g. we may have CUDA libraries installed but
|
||||||
no devices available. */
|
no devices available. */
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
|
@ -797,8 +899,9 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
|
||||||
/* cuLinkAddData's 'data' argument erroneously omits the const
|
/* cuLinkAddData's 'data' argument erroneously omits the const
|
||||||
qualifier. */
|
qualifier. */
|
||||||
GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
|
GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
|
||||||
r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
|
r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
|
||||||
ptx_objs->size, 0, 0, 0, 0);
|
(char *) ptx_objs->code, ptx_objs->size,
|
||||||
|
0, 0, 0, 0);
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
{
|
{
|
||||||
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
|
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
|
||||||
|
@ -809,7 +912,7 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
|
||||||
}
|
}
|
||||||
|
|
||||||
GOMP_PLUGIN_debug (0, "Linking\n");
|
GOMP_PLUGIN_debug (0, "Linking\n");
|
||||||
r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
|
r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
|
||||||
|
|
||||||
GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
|
GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
|
||||||
GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
|
GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
|
||||||
|
@ -844,7 +947,7 @@ event_gc (bool memmap_lockable)
|
||||||
if (e->ord != nvthd->ptx_dev->ord)
|
if (e->ord != nvthd->ptx_dev->ord)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
r = cuEventQuery (*e->evt);
|
r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
|
||||||
if (r == CUDA_SUCCESS)
|
if (r == CUDA_SUCCESS)
|
||||||
{
|
{
|
||||||
bool append_async = false;
|
bool append_async = false;
|
||||||
|
@ -877,7 +980,7 @@ event_gc (bool memmap_lockable)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
cuEventDestroy (*te);
|
CUDA_CALL_NOCHECK (cuEventDestroy, *te);
|
||||||
free ((void *)te);
|
free ((void *)te);
|
||||||
|
|
||||||
/* Unlink 'e' from ptx_events list. */
|
/* Unlink 'e' from ptx_events list. */
|
||||||
|
@ -1015,10 +1118,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||||
cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
|
cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
|
||||||
cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
|
cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
|
||||||
|
|
||||||
if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
|
if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
|
||||||
&& cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
|
dev) == CUDA_SUCCESS
|
||||||
&& cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
|
&& CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
|
||||||
&& cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
|
dev) == CUDA_SUCCESS
|
||||||
|
&& CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
|
||||||
|
dev) == CUDA_SUCCESS
|
||||||
|
&& CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
|
||||||
|
dev) == CUDA_SUCCESS)
|
||||||
{
|
{
|
||||||
GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
|
GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
|
||||||
" dev_size=%d, cpu_size=%d\n",
|
" dev_size=%d, cpu_size=%d\n",
|
||||||
|
@ -1090,7 +1197,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||||
#ifndef DISABLE_ASYNC
|
#ifndef DISABLE_ASYNC
|
||||||
if (async < acc_async_noval)
|
if (async < acc_async_noval)
|
||||||
{
|
{
|
||||||
r = cuStreamSynchronize (dev_str->stream);
|
r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
|
||||||
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
||||||
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
|
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
|
||||||
maybe_abort_msg);
|
maybe_abort_msg);
|
||||||
|
@ -1103,7 +1210,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||||
|
|
||||||
e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
|
e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
|
||||||
|
|
||||||
r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
|
r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
|
||||||
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
||||||
GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
|
GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
|
||||||
maybe_abort_msg);
|
maybe_abort_msg);
|
||||||
|
@ -1117,7 +1224,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||||
event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
|
event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
r = cuCtxSynchronize ();
|
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
|
||||||
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
||||||
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
|
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
|
||||||
maybe_abort_msg);
|
maybe_abort_msg);
|
||||||
|
@ -1294,7 +1401,7 @@ nvptx_async_test (int async)
|
||||||
if (!s)
|
if (!s)
|
||||||
GOMP_PLUGIN_fatal ("unknown async %d", async);
|
GOMP_PLUGIN_fatal ("unknown async %d", async);
|
||||||
|
|
||||||
r = cuStreamQuery (s->stream);
|
r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
|
||||||
if (r == CUDA_SUCCESS)
|
if (r == CUDA_SUCCESS)
|
||||||
{
|
{
|
||||||
/* The oacc-parallel.c:goacc_wait function calls this hook to determine
|
/* The oacc-parallel.c:goacc_wait function calls this hook to determine
|
||||||
|
@ -1325,7 +1432,8 @@ nvptx_async_test_all (void)
|
||||||
for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
|
for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
|
||||||
{
|
{
|
||||||
if ((s->multithreaded || pthread_equal (s->host_thread, self))
|
if ((s->multithreaded || pthread_equal (s->host_thread, self))
|
||||||
&& cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
|
&& CUDA_CALL_NOCHECK (cuStreamQuery,
|
||||||
|
s->stream) == CUDA_ERROR_NOT_READY)
|
||||||
{
|
{
|
||||||
pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
|
pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1400,7 +1508,7 @@ nvptx_wait_all (void)
|
||||||
{
|
{
|
||||||
if (s->multithreaded || pthread_equal (s->host_thread, self))
|
if (s->multithreaded || pthread_equal (s->host_thread, self))
|
||||||
{
|
{
|
||||||
r = cuStreamQuery (s->stream);
|
r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
|
||||||
if (r == CUDA_SUCCESS)
|
if (r == CUDA_SUCCESS)
|
||||||
continue;
|
continue;
|
||||||
else if (r != CUDA_ERROR_NOT_READY)
|
else if (r != CUDA_ERROR_NOT_READY)
|
||||||
|
@ -1632,13 +1740,15 @@ static void
|
||||||
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
|
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
|
||||||
{
|
{
|
||||||
CUdeviceptr dptr;
|
CUdeviceptr dptr;
|
||||||
CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
|
CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
|
||||||
|
module, "__nvptx_clocktick");
|
||||||
if (r == CUDA_ERROR_NOT_FOUND)
|
if (r == CUDA_ERROR_NOT_FOUND)
|
||||||
return;
|
return;
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
|
GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
|
||||||
double __nvptx_clocktick = 1e-3 / dev->clock_khz;
|
double __nvptx_clocktick = 1e-3 / dev->clock_khz;
|
||||||
r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
|
r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
|
||||||
|
sizeof (__nvptx_clocktick));
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
|
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
|
||||||
}
|
}
|
||||||
|
@ -1761,7 +1871,7 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
|
||||||
if (image->target_data == target_data)
|
if (image->target_data == target_data)
|
||||||
{
|
{
|
||||||
*prev_p = image->next;
|
*prev_p = image->next;
|
||||||
if (cuModuleUnload (image->module) != CUDA_SUCCESS)
|
if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
|
||||||
ret = false;
|
ret = false;
|
||||||
free (image->fns);
|
free (image->fns);
|
||||||
free (image);
|
free (image);
|
||||||
|
@ -1974,7 +2084,7 @@ static void *
|
||||||
nvptx_stacks_alloc (size_t size, int num)
|
nvptx_stacks_alloc (size_t size, int num)
|
||||||
{
|
{
|
||||||
CUdeviceptr stacks;
|
CUdeviceptr stacks;
|
||||||
CUresult r = cuMemAlloc (&stacks, size * num);
|
CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
|
GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
|
||||||
return (void *) stacks;
|
return (void *) stacks;
|
||||||
|
@ -1985,7 +2095,7 @@ nvptx_stacks_alloc (size_t size, int num)
|
||||||
static void
|
static void
|
||||||
nvptx_stacks_free (void *p, int num)
|
nvptx_stacks_free (void *p, int num)
|
||||||
{
|
{
|
||||||
CUresult r = cuMemFree ((CUdeviceptr) p);
|
CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
|
GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
|
||||||
}
|
}
|
||||||
|
@ -2028,14 +2138,13 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
|
||||||
CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
|
CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
|
||||||
CU_LAUNCH_PARAM_END
|
CU_LAUNCH_PARAM_END
|
||||||
};
|
};
|
||||||
r = cuLaunchKernel (function,
|
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
|
||||||
teams, 1, 1,
|
32, threads, 1, 0, ptx_dev->null_stream->stream,
|
||||||
32, threads, 1,
|
NULL, config);
|
||||||
0, ptx_dev->null_stream->stream, NULL, config);
|
|
||||||
if (r != CUDA_SUCCESS)
|
if (r != CUDA_SUCCESS)
|
||||||
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
|
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
|
||||||
|
|
||||||
r = cuCtxSynchronize ();
|
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
|
||||||
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
if (r == CUDA_ERROR_LAUNCH_FAILED)
|
||||||
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
|
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
|
||||||
maybe_abort_msg);
|
maybe_abort_msg);
|
||||||
|
|
Loading…
Reference in New Issue