nvptx.c (PTX_GANG_DEFAULT): New.
gcc/ * config/nvptx/nvptx.c (PTX_GANG_DEFAULT): New. (nvptx_goacc_validate_dims): Extend to handle global defaults. * target.def (OACC_VALIDATE_DIMS): Extend documentation. * doc/tm.texti: Rebuilt. * doc/invoke.texi (fopenacc-dim): Document. * lto-wrapper.c (merge_and_complain): Add OPT_fopenacc_dim_ case. (append_compiler_options): Likewise. * omp-low.c (oacc_default_dims, oacc_min_dims): New. (oacc_parse_default_dims): New. (oacc_validate_dims): Add USED arg. Select non-unity default when possible. (oacc_loop_fixed_partitions): Return mask of used partitions. (oacc_loop_auto_partitions): Emit dump info. (oacc_loop_partition): Return mask of used partitions. (execute_oacc_device_lower): Parse default dimension arg. Adjust loop partitioning and validation calls. gcc/c-family/ * c.opt (fopenacc-dim=): New option. gcc/fortran/ * lang.opt (fopenacc-dim=): New option. libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: New. * testsuite/libgomp.oacc-fortran/routine-7.f90: Serialize loop. From-SVN: r233041
This commit is contained in:
parent
ff86345f83
commit
b6adbb9faa
@ -1,3 +1,22 @@
|
||||
2016-02-01 Nathan Sidwell <nathan@codesourcery.com>
|
||||
|
||||
* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): New.
|
||||
(nvptx_goacc_validate_dims): Extend to handle global defaults.
|
||||
* target.def (OACC_VALIDATE_DIMS): Extend documentation.
|
||||
* doc/tm.texti: Rebuilt.
|
||||
* doc/invoke.texi (fopenacc-dim): Document.
|
||||
* lto-wrapper.c (merge_and_complain): Add OPT_fopenacc_dim_ case.
|
||||
(append_compiler_options): Likewise.
|
||||
* omp-low.c (oacc_default_dims, oacc_min_dims): New.
|
||||
(oacc_parse_default_dims): New.
|
||||
(oacc_validate_dims): Add USED arg. Select non-unity default when
|
||||
possible.
|
||||
(oacc_loop_fixed_partitions): Return mask of used partitions.
|
||||
(oacc_loop_auto_partitions): Emit dump info.
|
||||
(oacc_loop_partition): Return mask of used partitions.
|
||||
(execute_oacc_device_lower): Parse default dimension arg. Adjust
|
||||
loop partitioning and validation calls.
|
||||
|
||||
2016-02-01 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR middle-end/69556
|
||||
|
@ -1,3 +1,7 @@
|
||||
2016-02-01 Nathan Sidwell <nathan@codesourcery.com>
|
||||
|
||||
* c.opt (fopenacc-dim=): New option.
|
||||
|
||||
2016-01-27 Ryan Burn <contact@rnburn.com>
|
||||
|
||||
PR cilkplus/69267
|
||||
|
@ -1372,6 +1372,10 @@ fopenacc
|
||||
C ObjC C++ ObjC++ LTO Var(flag_openacc)
|
||||
Enable OpenACC.
|
||||
|
||||
fopenacc-dim=
|
||||
C ObjC C++ ObjC++ LTO Joined Var(flag_openacc_dims)
|
||||
Specify default OpenACC compute dimensions.
|
||||
|
||||
fopenmp
|
||||
C ObjC C++ ObjC++ LTO Var(flag_openmp)
|
||||
Enable OpenMP (implies -frecursive in Fortran).
|
||||
|
@ -4122,10 +4122,12 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
|
||||
/* Define dimension sizes for known hardware. */
|
||||
#define PTX_VECTOR_LENGTH 32
|
||||
#define PTX_WORKER_LENGTH 32
|
||||
#define PTX_GANG_DEFAULT 32
|
||||
|
||||
/* Validate compute dimensions of an OpenACC offload or routine, fill
|
||||
in non-unity defaults. FN_LEVEL indicates the level at which a
|
||||
routine might spawn a loop. It is negative for non-routines. */
|
||||
routine might spawn a loop. It is negative for non-routines. If
|
||||
DECL is null, we are validating the default dimensions. */
|
||||
|
||||
static bool
|
||||
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
|
||||
@ -4133,11 +4135,12 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
|
||||
bool changed = false;
|
||||
|
||||
/* The vector size must be 32, unless this is a SEQ routine. */
|
||||
if (fn_level <= GOMP_DIM_VECTOR
|
||||
if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
|
||||
&& dims[GOMP_DIM_VECTOR] >= 0
|
||||
&& dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
|
||||
{
|
||||
if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
|
||||
warning_at (DECL_SOURCE_LOCATION (decl), 0,
|
||||
if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
|
||||
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
|
||||
dims[GOMP_DIM_VECTOR]
|
||||
? "using vector_length (%d), ignoring %d"
|
||||
: "using vector_length (%d), ignoring runtime setting",
|
||||
@ -4149,13 +4152,23 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
|
||||
/* Check the num workers is not too large. */
|
||||
if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
|
||||
{
|
||||
warning_at (DECL_SOURCE_LOCATION (decl), 0,
|
||||
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
|
||||
"using num_workers (%d), ignoring %d",
|
||||
PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
|
||||
dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
|
||||
changed = true;
|
||||
}
|
||||
|
||||
if (!decl)
|
||||
{
|
||||
dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
|
||||
if (dims[GOMP_DIM_WORKER] < 0)
|
||||
dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
|
||||
if (dims[GOMP_DIM_GANG] < 0)
|
||||
dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
|
||||
changed = true;
|
||||
}
|
||||
|
||||
return changed;
|
||||
}
|
||||
|
||||
|
@ -1963,9 +1963,13 @@ Programming Interface v2.0 @w{@uref{http://www.openacc.org/}}. This option
|
||||
implies @option{-pthread}, and thus is only supported on targets that
|
||||
have support for @option{-pthread}.
|
||||
|
||||
Note that this is an experimental feature, incomplete, and subject to
|
||||
change in future versions of GCC. See
|
||||
@w{@uref{https://gcc.gnu.org/wiki/OpenACC}} for more information.
|
||||
@item -fopenacc-dim=@var{geom}
|
||||
@opindex fopenacc-dim
|
||||
@cindex OpenACC accelerator programming
|
||||
Specify default compute dimensions for parallel offload regions that do
|
||||
not explicitly specify. The @var{geom} value is a triple of
|
||||
':'-separated sizes, in order 'gang', 'worker' and, 'vector'. A size
|
||||
can be omitted, to use a target-specific default value.
|
||||
|
||||
@item -fopenmp
|
||||
@opindex fopenmp
|
||||
|
@ -5767,11 +5767,12 @@ to use it.
|
||||
@deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level})
|
||||
This hook should check the launch dimensions provided for an OpenACC
|
||||
compute region, or routine. Defaulted values are represented as -1
|
||||
and non-constant values as 0. The @var{fn_level} is negative for the
|
||||
and non-constant values as 0. The @var{fn_level} is negative for the
|
||||
function corresponding to the compute region. For a routine is is the
|
||||
outermost level at which partitioned execution may be spawned. It
|
||||
should fill in anything that needs to default to non-unity and verify
|
||||
non-defaults. Diagnostics should be issued as appropriate. Return
|
||||
outermost level at which partitioned execution may be spawned. The hook
|
||||
should verify non-default values. If DECL is NULL, global defaults
|
||||
are being validated and unspecified defaults should be filled in.
|
||||
Diagnostics should be issued as appropriate. Return
|
||||
true, if changes have been made. You must override this hook to
|
||||
provide dimensions larger than 1.
|
||||
@end deftypefn
|
||||
|
@ -1,3 +1,7 @@
|
||||
2016-02-02 Nathan Sidwell <nathan@codesourcery.com>
|
||||
|
||||
* lang.opt (fopenacc-dim=): New option.
|
||||
|
||||
2016-01-31 Paul Thomas <pault@gcc.gnu.org>
|
||||
|
||||
PR fortran/67564
|
||||
|
@ -578,6 +578,10 @@ fopenacc
|
||||
Fortran LTO
|
||||
; Documented in C
|
||||
|
||||
fopenacc-dim=
|
||||
Fortran LTO Joined Var(flag_openacc_dims)
|
||||
; Documented in C
|
||||
|
||||
fopenmp
|
||||
Fortran LTO
|
||||
; Documented in C
|
||||
|
@ -287,12 +287,25 @@ merge_and_complain (struct cl_decoded_option **decoded_options,
|
||||
append_option (decoded_options, decoded_options_count, foption);
|
||||
/* -fmath-errno > -fno-math-errno,
|
||||
-fsigned-zeros > -fno-signed-zeros,
|
||||
-ftrapping-math -> -fno-trapping-math,
|
||||
-ftrapping-math > -fno-trapping-math,
|
||||
-fwrapv > -fno-wrapv. */
|
||||
else if (foption->value > (*decoded_options)[j].value)
|
||||
(*decoded_options)[j] = *foption;
|
||||
break;
|
||||
|
||||
case OPT_fopenacc_dim_:
|
||||
/* Append or check identical. */
|
||||
for (j = 0; j < *decoded_options_count; ++j)
|
||||
if ((*decoded_options)[j].opt_index == foption->opt_index)
|
||||
break;
|
||||
if (j == *decoded_options_count)
|
||||
append_option (decoded_options, decoded_options_count, foption);
|
||||
else if (strcmp ((*decoded_options)[j].arg, foption->arg))
|
||||
fatal_error (input_location,
|
||||
"Option %s with different values",
|
||||
foption->orig_option_with_args_text);
|
||||
break;
|
||||
|
||||
case OPT_freg_struct_return:
|
||||
case OPT_fpcc_struct_return:
|
||||
case OPT_fshort_double:
|
||||
@ -506,6 +519,7 @@ append_compiler_options (obstack *argv_obstack, struct cl_decoded_option *opts,
|
||||
case OPT_fwrapv:
|
||||
case OPT_fopenmp:
|
||||
case OPT_fopenacc:
|
||||
case OPT_fopenacc_dim_:
|
||||
case OPT_fcilkplus:
|
||||
case OPT_ftrapv:
|
||||
case OPT_fstrict_overflow:
|
||||
|
152
gcc/omp-low.c
152
gcc/omp-low.c
@ -20238,13 +20238,80 @@ oacc_xform_loop (gcall *call)
|
||||
gsi_replace_with_seq (&gsi, seq, true);
|
||||
}
|
||||
|
||||
/* Default partitioned and minimum partitioned dimensions. */
|
||||
|
||||
static int oacc_default_dims[GOMP_DIM_MAX];
|
||||
static int oacc_min_dims[GOMP_DIM_MAX];
|
||||
|
||||
/* Parse the default dimension parameter. This is a set of
|
||||
:-separated optional compute dimensions. Each specified dimension
|
||||
is a positive integer. When device type support is added, it is
|
||||
planned to be a comma separated list of such compute dimensions,
|
||||
with all but the first prefixed by the colon-terminated device
|
||||
type. */
|
||||
|
||||
static void
|
||||
oacc_parse_default_dims (const char *dims)
|
||||
{
|
||||
int ix;
|
||||
|
||||
for (ix = GOMP_DIM_MAX; ix--;)
|
||||
{
|
||||
oacc_default_dims[ix] = -1;
|
||||
oacc_min_dims[ix] = 1;
|
||||
}
|
||||
|
||||
#ifndef ACCEL_COMPILER
|
||||
/* Cannot be overridden on the host. */
|
||||
dims = NULL;
|
||||
#endif
|
||||
if (dims)
|
||||
{
|
||||
const char *pos = dims;
|
||||
|
||||
for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
|
||||
{
|
||||
if (ix)
|
||||
{
|
||||
if (*pos != ':')
|
||||
goto malformed;
|
||||
pos++;
|
||||
}
|
||||
|
||||
if (*pos != ':')
|
||||
{
|
||||
long val;
|
||||
const char *eptr;
|
||||
|
||||
errno = 0;
|
||||
val = strtol (pos, CONST_CAST (char **, &eptr), 10);
|
||||
if (errno || val <= 0 || (unsigned)val != val)
|
||||
goto malformed;
|
||||
pos = eptr;
|
||||
oacc_default_dims[ix] = (int)val;
|
||||
}
|
||||
}
|
||||
if (*pos)
|
||||
{
|
||||
malformed:
|
||||
error_at (UNKNOWN_LOCATION,
|
||||
"-fopenacc-dim operand is malformed at '%s'", pos);
|
||||
}
|
||||
}
|
||||
|
||||
/* Allow the backend to validate the dimensions. */
|
||||
targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
|
||||
targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
|
||||
}
|
||||
|
||||
/* Validate and update the dimensions for offloaded FN. ATTRS is the
|
||||
raw attribute. DIMS is an array of dimensions, which is filled in.
|
||||
LEVEL is the partitioning level of a routine, or -1 for an offload
|
||||
region itself. */
|
||||
region itself. USED is the mask of partitioned execution in the
|
||||
function. */
|
||||
|
||||
static void
|
||||
oacc_validate_dims (tree fn, tree attrs, int *dims, int level)
|
||||
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
|
||||
{
|
||||
tree purpose[GOMP_DIM_MAX];
|
||||
unsigned ix;
|
||||
@ -20265,11 +20332,29 @@ oacc_validate_dims (tree fn, tree attrs, int *dims, int level)
|
||||
|
||||
bool changed = targetm.goacc.validate_dims (fn, dims, level);
|
||||
|
||||
/* Default anything left to 1. */
|
||||
/* Default anything left to 1 or a partitioned default. */
|
||||
for (ix = 0; ix != GOMP_DIM_MAX; ix++)
|
||||
if (dims[ix] < 0)
|
||||
{
|
||||
dims[ix] = 1;
|
||||
/* The OpenACC spec says 'If the [num_gangs] clause is not
|
||||
specified, an implementation-defined default will be used;
|
||||
the default may depend on the code within the construct.'
|
||||
(2.5.6). Thus an implementation is free to choose
|
||||
non-unity default for a parallel region that doesn't have
|
||||
any gang-partitioned loops. However, it appears that there
|
||||
is a sufficient body of user code that expects non-gang
|
||||
partitioned regions to not execute in gang-redundant mode.
|
||||
So we (a) don't warn about the non-portability and (b) pick
|
||||
the minimum permissible dimension size when there is no
|
||||
partitioned execution. Otherwise we pick the global
|
||||
default for the dimension, which the user can control. The
|
||||
same wording and logic applies to num_workers and
|
||||
vector_length, however the worker- or vector- single
|
||||
execution doesn't have the same impact as gang-redundant
|
||||
execution. (If the minimum gang-level partioning is not 1,
|
||||
the target is probably too confusing.) */
|
||||
dims[ix] = (used & GOMP_DIM_MASK (ix)
|
||||
? oacc_default_dims[ix] : oacc_min_dims[ix]);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
@ -20719,14 +20804,15 @@ oacc_loop_process (oacc_loop *loop)
|
||||
|
||||
/* Walk the OpenACC loop heirarchy checking and assigning the
|
||||
programmer-specified partitionings. OUTER_MASK is the partitioning
|
||||
this loop is contained within. Return true if we contain an
|
||||
auto-partitionable loop. */
|
||||
this loop is contained within. Return mask of partitioning
|
||||
encountered. If any auto loops are discovered, set GOMP_DIM_MAX
|
||||
bit. */
|
||||
|
||||
static bool
|
||||
static unsigned
|
||||
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
|
||||
{
|
||||
unsigned this_mask = loop->mask;
|
||||
bool has_auto = false;
|
||||
unsigned mask_all = 0;
|
||||
bool noisy = true;
|
||||
|
||||
#ifdef ACCEL_COMPILER
|
||||
@ -20760,7 +20846,7 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
|
||||
}
|
||||
}
|
||||
if (auto_par && (loop->flags & OLF_INDEPENDENT))
|
||||
has_auto = true;
|
||||
mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
|
||||
}
|
||||
|
||||
if (this_mask & outer_mask)
|
||||
@ -20814,16 +20900,16 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
|
||||
}
|
||||
|
||||
loop->mask = this_mask;
|
||||
mask_all |= this_mask;
|
||||
|
||||
if (loop->child)
|
||||
mask_all |= oacc_loop_fixed_partitions (loop->child,
|
||||
outer_mask | this_mask);
|
||||
|
||||
if (loop->child
|
||||
&& oacc_loop_fixed_partitions (loop->child, outer_mask | this_mask))
|
||||
has_auto = true;
|
||||
if (loop->sibling)
|
||||
mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
|
||||
|
||||
if (loop->sibling
|
||||
&& oacc_loop_fixed_partitions (loop->sibling, outer_mask))
|
||||
has_auto = true;
|
||||
|
||||
return has_auto;
|
||||
return mask_all;
|
||||
}
|
||||
|
||||
/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
|
||||
@ -20865,6 +20951,11 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
|
||||
warning_at (loop->loc, 0,
|
||||
"insufficient partitioning available to parallelize loop");
|
||||
|
||||
if (dump_file)
|
||||
fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
|
||||
LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
|
||||
this_mask);
|
||||
|
||||
loop->mask = this_mask;
|
||||
}
|
||||
inner_mask |= loop->mask;
|
||||
@ -20876,13 +20967,19 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
|
||||
}
|
||||
|
||||
/* Walk the OpenACC loop heirarchy to check and assign partitioning
|
||||
axes. */
|
||||
axes. Return mask of partitioning. */
|
||||
|
||||
static void
|
||||
static unsigned
|
||||
oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
|
||||
{
|
||||
if (oacc_loop_fixed_partitions (loop, outer_mask))
|
||||
oacc_loop_auto_partitions (loop, outer_mask);
|
||||
unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
|
||||
|
||||
if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
|
||||
{
|
||||
mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
|
||||
mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
|
||||
}
|
||||
return mask_all;
|
||||
}
|
||||
|
||||
/* Default fork/join early expander. Delete the function calls if
|
||||
@ -20958,6 +21055,13 @@ execute_oacc_device_lower ()
|
||||
/* Not an offloaded function. */
|
||||
return 0;
|
||||
|
||||
/* Parse the default dim argument exactly once. */
|
||||
if ((const void *)flag_openacc_dims != &flag_openacc_dims)
|
||||
{
|
||||
oacc_parse_default_dims (flag_openacc_dims);
|
||||
flag_openacc_dims = (char *)&flag_openacc_dims;
|
||||
}
|
||||
|
||||
/* Discover, partition and process the loops. */
|
||||
oacc_loop *loops = oacc_loop_discovery ();
|
||||
int fn_level = oacc_fn_attrib_level (attrs);
|
||||
@ -20969,10 +21073,10 @@ execute_oacc_device_lower ()
|
||||
: "Function is routine level %d\n", fn_level);
|
||||
|
||||
unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
|
||||
oacc_loop_partition (loops, outer_mask);
|
||||
|
||||
unsigned used_mask = oacc_loop_partition (loops, outer_mask);
|
||||
int dims[GOMP_DIM_MAX];
|
||||
oacc_validate_dims (current_function_decl, attrs, dims, fn_level);
|
||||
|
||||
oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
|
||||
|
||||
if (dump_file)
|
||||
{
|
||||
|
@ -1648,11 +1648,12 @@ DEFHOOK
|
||||
(validate_dims,
|
||||
"This hook should check the launch dimensions provided for an OpenACC\n\
|
||||
compute region, or routine. Defaulted values are represented as -1\n\
|
||||
and non-constant values as 0. The @var{fn_level} is negative for the\n\
|
||||
and non-constant values as 0. The @var{fn_level} is negative for the\n\
|
||||
function corresponding to the compute region. For a routine is is the\n\
|
||||
outermost level at which partitioned execution may be spawned. It\n\
|
||||
should fill in anything that needs to default to non-unity and verify\n\
|
||||
non-defaults. Diagnostics should be issued as appropriate. Return\n\
|
||||
outermost level at which partitioned execution may be spawned. The hook\n\
|
||||
should verify non-default values. If DECL is NULL, global defaults\n\
|
||||
are being validated and unspecified defaults should be filled in.\n\
|
||||
Diagnostics should be issued as appropriate. Return\n\
|
||||
true, if changes have been made. You must override this hook to\n\
|
||||
provide dimensions larger than 1.",
|
||||
bool, (tree decl, int *dims, int fn_level),
|
||||
|
@ -1,3 +1,8 @@
|
||||
2016-02-01 Nathan Sidwell <nathan@codesourcery.com>
|
||||
|
||||
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: New.
|
||||
* testsuite/libgomp.oacc-fortran/routine-7.f90: Serialize loop.
|
||||
|
||||
2016-01-26 Tom de Vries <tom@codesourcery.com>
|
||||
|
||||
PR tree-optimization/69110
|
||||
|
133
libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
Normal file
133
libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
Normal file
@ -0,0 +1,133 @@
|
||||
|
||||
/* { dg-additional-options "-O2 -fopenacc-dim=16:16" } */
|
||||
|
||||
#include <openacc.h>
|
||||
#include <alloca.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#pragma acc routine
|
||||
static int __attribute__ ((noinline)) coord ()
|
||||
{
|
||||
int res = 0;
|
||||
|
||||
if (acc_on_device (acc_device_nvidia))
|
||||
{
|
||||
int g = 0, w = 0, v = 0;
|
||||
|
||||
__asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
|
||||
__asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
|
||||
__asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
|
||||
res = (1 << 24) | (g << 16) | (w << 8) | v;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int check (const int *ary, int size, int gp, int wp, int vp)
|
||||
{
|
||||
int exit = 0;
|
||||
int ix;
|
||||
int *gangs = (int *)alloca (gp * sizeof (int));
|
||||
int *workers = (int *)alloca (wp * sizeof (int));
|
||||
int *vectors = (int *)alloca (vp * sizeof (int));
|
||||
int offloaded = 0;
|
||||
|
||||
memset (gangs, 0, gp * sizeof (int));
|
||||
memset (workers, 0, wp * sizeof (int));
|
||||
memset (vectors, 0, vp * sizeof (int));
|
||||
|
||||
for (ix = 0; ix < size; ix++)
|
||||
{
|
||||
int g = (ary[ix] >> 16) & 0xff;
|
||||
int w = (ary[ix] >> 8) & 0xff;
|
||||
int v = (ary[ix] >> 0) & 0xff;
|
||||
|
||||
if (g >= gp || w >= wp || v >= vp)
|
||||
{
|
||||
printf ("unexpected cpu %#x used\n", ary[ix]);
|
||||
exit = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
vectors[v]++;
|
||||
workers[w]++;
|
||||
gangs[g]++;
|
||||
}
|
||||
offloaded += ary[ix] >> 24;
|
||||
}
|
||||
|
||||
if (!offloaded)
|
||||
return 0;
|
||||
|
||||
if (offloaded != size)
|
||||
{
|
||||
printf ("offloaded %d times, expected %d\n", offloaded, size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (ix = 0; ix < gp; ix++)
|
||||
if (gangs[ix] != gangs[0])
|
||||
{
|
||||
printf ("gang %d not used %d times\n", ix, gangs[0]);
|
||||
exit = 1;
|
||||
}
|
||||
|
||||
for (ix = 0; ix < wp; ix++)
|
||||
if (workers[ix] != workers[0])
|
||||
{
|
||||
printf ("worker %d not used %d times\n", ix, workers[0]);
|
||||
exit = 1;
|
||||
}
|
||||
|
||||
for (ix = 0; ix < vp; ix++)
|
||||
if (vectors[ix] != vectors[0])
|
||||
{
|
||||
printf ("vector %d not used %d times\n", ix, vectors[0]);
|
||||
exit = 1;
|
||||
}
|
||||
|
||||
return exit;
|
||||
}
|
||||
|
||||
#define N (32 *32*32)
|
||||
|
||||
int test_1 (int gp, int wp, int vp)
|
||||
{
|
||||
int ary[N];
|
||||
int exit = 0;
|
||||
|
||||
#pragma acc parallel copyout (ary)
|
||||
{
|
||||
#pragma acc loop gang (static:1)
|
||||
for (int ix = 0; ix < N; ix++)
|
||||
ary[ix] = coord ();
|
||||
}
|
||||
|
||||
exit |= check (ary, N, gp, 1, 1);
|
||||
|
||||
#pragma acc parallel copyout (ary)
|
||||
{
|
||||
#pragma acc loop worker
|
||||
for (int ix = 0; ix < N; ix++)
|
||||
ary[ix] = coord ();
|
||||
}
|
||||
|
||||
exit |= check (ary, N, 1, wp, 1);
|
||||
|
||||
#pragma acc parallel copyout (ary)
|
||||
{
|
||||
#pragma acc loop vector
|
||||
for (int ix = 0; ix < N; ix++)
|
||||
ary[ix] = coord ();
|
||||
}
|
||||
|
||||
exit |= check (ary, N, 1, 1, vp);
|
||||
|
||||
return exit;
|
||||
}
|
||||
|
||||
int main ()
|
||||
{
|
||||
return test_1 (16, 16, 32);
|
||||
}
|
@ -41,7 +41,7 @@ program main
|
||||
end do
|
||||
|
||||
!$acc parallel copy (b)
|
||||
!$acc loop
|
||||
!$acc loop seq
|
||||
do i = 1, N
|
||||
call worker (b)
|
||||
end do
|
||||
@ -56,7 +56,7 @@ program main
|
||||
end do
|
||||
|
||||
!$acc parallel copy (a)
|
||||
!$acc loop
|
||||
!$acc loop seq
|
||||
do i = 1, N
|
||||
call vector (a)
|
||||
end do
|
||||
|
Loading…
Reference in New Issue
Block a user