nvptx.c (PTX_GANG_DEFAULT): Set to zero.
gcc/ * config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Set to zero. libgomp/ * plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes to determine default geometry. * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Set gang dimension. Co-Authored-By: Nathan Sidwell <nathan@acm.org> From-SVN: r241803
This commit is contained in:
parent
bd17a9dfab
commit
6668eb4593
@ -1,3 +1,8 @@
|
||||
2016-11-02 Cesar Philippidis <cesar@codesourcery.com>
|
||||
Nathan Sidwell <nathan@acm.org>
|
||||
|
||||
* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Set to zero.
|
||||
|
||||
2016-11-02 Max Filippov <jcmvbkbc@gmail.com>
|
||||
|
||||
* config/xtensa/xtensa.c (xtensa_output_integer_literal_parts):
|
||||
|
@ -4174,7 +4174,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
|
||||
/* Define dimension sizes for known hardware. */
|
||||
#define PTX_VECTOR_LENGTH 32
|
||||
#define PTX_WORKER_LENGTH 32
|
||||
#define PTX_GANG_DEFAULT 32
|
||||
#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
|
||||
|
||||
/* Validate compute dimensions of an OpenACC offload or routine, fill
|
||||
in non-unity defaults. FN_LEVEL indicates the level at which a
|
||||
|
@ -1,3 +1,11 @@
|
||||
2016-11-02 Cesar Philippidis <cesar@codesourcery.com>
|
||||
Nathan Sidwell <nathan@acm.org>
|
||||
|
||||
* plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes
|
||||
to determine default geometry.
|
||||
* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Set gang
|
||||
dimension.
|
||||
|
||||
2016-11-01 Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
* hashtab.h: Use standard GPLv3 with runtime exception
|
||||
|
@ -45,6 +45,7 @@
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
|
||||
static const char *
|
||||
cuda_error (CUresult r)
|
||||
@ -932,9 +933,88 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||
|
||||
if (seen_zero)
|
||||
{
|
||||
/* See if the user provided GOMP_OPENACC_DIM environment
|
||||
variable to specify runtime defaults. */
|
||||
static int default_dims[GOMP_DIM_MAX];
|
||||
|
||||
pthread_mutex_lock (&ptx_dev_lock);
|
||||
if (!default_dims[0])
|
||||
{
|
||||
/* We only read the environment variable once. You can't
|
||||
change it in the middle of execution. The syntax is
|
||||
the same as for the -fopenacc-dim compilation option. */
|
||||
const char *env_var = getenv ("GOMP_OPENACC_DIM");
|
||||
if (env_var)
|
||||
{
|
||||
const char *pos = env_var;
|
||||
|
||||
for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
|
||||
{
|
||||
if (i && *pos++ != ':')
|
||||
break;
|
||||
if (*pos != ':')
|
||||
{
|
||||
const char *eptr;
|
||||
|
||||
errno = 0;
|
||||
long val = strtol (pos, (char **)&eptr, 10);
|
||||
if (errno || val < 0 || (unsigned)val != val)
|
||||
break;
|
||||
default_dims[i] = (int)val;
|
||||
pos = eptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int warp_size, block_size, dev_size, cpu_size;
|
||||
CUdevice dev = nvptx_thread()->ptx_dev->dev;
|
||||
/* 32 is the default for known hardware. */
|
||||
int gang = 0, worker = 32, vector = 32;
|
||||
CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
|
||||
|
||||
cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
|
||||
cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
|
||||
cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
|
||||
cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
|
||||
|
||||
if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
|
||||
&& cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
|
||||
&& cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
|
||||
&& cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
|
||||
{
|
||||
GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
|
||||
" dev_size=%d, cpu_size=%d\n",
|
||||
warp_size, block_size, dev_size, cpu_size);
|
||||
gang = (cpu_size / block_size) * dev_size;
|
||||
worker = block_size / warp_size;
|
||||
vector = warp_size;
|
||||
}
|
||||
|
||||
/* There is no upper bound on the gang size. The best size
|
||||
matches the hardware configuration. Logical gangs are
|
||||
scheduled onto physical hardware. To maximize usage, we
|
||||
should guess a large number. */
|
||||
if (default_dims[GOMP_DIM_GANG] < 1)
|
||||
default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
|
||||
/* The worker size must not exceed the hardware. */
|
||||
if (default_dims[GOMP_DIM_WORKER] < 1
|
||||
|| (default_dims[GOMP_DIM_WORKER] > worker && gang))
|
||||
default_dims[GOMP_DIM_WORKER] = worker;
|
||||
/* The vector size must exactly match the hardware. */
|
||||
if (default_dims[GOMP_DIM_VECTOR] < 1
|
||||
|| (default_dims[GOMP_DIM_VECTOR] != vector && gang))
|
||||
default_dims[GOMP_DIM_VECTOR] = vector;
|
||||
|
||||
GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
|
||||
default_dims[GOMP_DIM_GANG],
|
||||
default_dims[GOMP_DIM_WORKER],
|
||||
default_dims[GOMP_DIM_VECTOR]);
|
||||
}
|
||||
pthread_mutex_unlock (&ptx_dev_lock);
|
||||
|
||||
for (i = 0; i != GOMP_DIM_MAX; i++)
|
||||
if (!dims[i])
|
||||
dims[i] = /* TODO */ 32;
|
||||
if (!dims[i])
|
||||
dims[i] = default_dims[i];
|
||||
}
|
||||
|
||||
/* This reserves a chunk of a pre-allocated page of memory mapped on both
|
||||
@ -954,8 +1034,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
|
||||
mapnum * sizeof (void *));
|
||||
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
|
||||
" gangs=%u, workers=%u, vectors=%u\n",
|
||||
__FUNCTION__, targ_fn->launch->fn,
|
||||
dims[0], dims[1], dims[2]);
|
||||
__FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
|
||||
dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
|
||||
|
||||
// OpenACC CUDA
|
||||
//
|
||||
|
@ -2,6 +2,8 @@
|
||||
not optimized away at -O0, and then confuses the target assembler.
|
||||
{ dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
|
||||
|
||||
/* { dg-additional-options "-fopenacc-dim=32" } */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <openacc.h>
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user