nvptx.c (PTX_GANG_DEFAULT): New.

gcc/
	* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): New.
	(nvptx_goacc_validate_dims): Extend to handle global defaults.
	* target.def (OACC_VALIDATE_DIMS): Extend documentation.
	* doc/tm.texti: Rebuilt.
	* doc/invoke.texi (fopenacc-dim): Document.
	* lto-wrapper.c (merge_and_complain): Add OPT_fopenacc_dim_ case.
	(append_compiler_options): Likewise.
	* omp-low.c (oacc_default_dims, oacc_min_dims): New.
	(oacc_parse_default_dims): New.
	(oacc_validate_dims): Add USED arg.  Select non-unity default when
	possible.
	(oacc_loop_fixed_partitions): Return mask of used partitions.
	(oacc_loop_auto_partitions): Emit dump info.
	(oacc_loop_partition): Return mask of used partitions.
	(execute_oacc_device_lower): Parse default dimension arg.  Adjust
	loop partitioning and validation calls.

	gcc/c-family/
	* c.opt (fopenacc-dim=): New option.

	gcc/fortran/
	* lang.opt (fopenacc-dim=): New option.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: New.
	* testsuite/libgomp.oacc-fortran/routine-7.f90: Serialize loop.

From-SVN: r233041
This commit is contained in:
Nathan Sidwell 2016-02-01 16:20:13 +00:00 committed by Nathan Sidwell
parent ff86345f83
commit b6adbb9faa
14 changed files with 353 additions and 43 deletions

View File

@ -1,3 +1,22 @@
2016-02-01 Nathan Sidwell <nathan@codesourcery.com>
* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): New.
(nvptx_goacc_validate_dims): Extend to handle global defaults.
* target.def (OACC_VALIDATE_DIMS): Extend documentation.
* doc/tm.texti: Rebuilt.
* doc/invoke.texi (fopenacc-dim): Document.
* lto-wrapper.c (merge_and_complain): Add OPT_fopenacc_dim_ case.
(append_compiler_options): Likewise.
* omp-low.c (oacc_default_dims, oacc_min_dims): New.
(oacc_parse_default_dims): New.
(oacc_validate_dims): Add USED arg. Select non-unity default when
possible.
(oacc_loop_fixed_partitions): Return mask of used partitions.
(oacc_loop_auto_partitions): Emit dump info.
(oacc_loop_partition): Return mask of used partitions.
(execute_oacc_device_lower): Parse default dimension arg. Adjust
loop partitioning and validation calls.
2016-02-01 Richard Biener <rguenther@suse.de>
PR middle-end/69556

View File

@ -1,3 +1,7 @@
2016-02-01 Nathan Sidwell <nathan@codesourcery.com>
* c.opt (fopenacc-dim=): New option.
2016-01-27 Ryan Burn <contact@rnburn.com>
PR cilkplus/69267

View File

@ -1372,6 +1372,10 @@ fopenacc
C ObjC C++ ObjC++ LTO Var(flag_openacc)
Enable OpenACC.
fopenacc-dim=
C ObjC C++ ObjC++ LTO Joined Var(flag_openacc_dims)
Specify default OpenACC compute dimensions.
fopenmp
C ObjC C++ ObjC++ LTO Var(flag_openmp)
Enable OpenMP (implies -frecursive in Fortran).

View File

@ -4122,10 +4122,12 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
/* Define dimension sizes for known hardware. */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT 32
/* Validate compute dimensions of an OpenACC offload or routine, fill
in non-unity defaults. FN_LEVEL indicates the level at which a
routine might spawn a loop. It is negative for non-routines. */
routine might spawn a loop. It is negative for non-routines. If
DECL is null, we are validating the default dimensions. */
static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
@ -4133,11 +4135,12 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
bool changed = false;
/* The vector size must be 32, unless this is a SEQ routine. */
if (fn_level <= GOMP_DIM_VECTOR
if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
&& dims[GOMP_DIM_VECTOR] >= 0
&& dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
{
if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
warning_at (DECL_SOURCE_LOCATION (decl), 0,
if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
dims[GOMP_DIM_VECTOR]
? "using vector_length (%d), ignoring %d"
: "using vector_length (%d), ignoring runtime setting",
@ -4149,13 +4152,23 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
/* Check the num workers is not too large. */
if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
{
warning_at (DECL_SOURCE_LOCATION (decl), 0,
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
"using num_workers (%d), ignoring %d",
PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
changed = true;
}
if (!decl)
{
dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
if (dims[GOMP_DIM_WORKER] < 0)
dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
if (dims[GOMP_DIM_GANG] < 0)
dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
changed = true;
}
return changed;
}

View File

@ -1963,9 +1963,13 @@ Programming Interface v2.0 @w{@uref{http://www.openacc.org/}}. This option
implies @option{-pthread}, and thus is only supported on targets that
have support for @option{-pthread}.
Note that this is an experimental feature, incomplete, and subject to
change in future versions of GCC. See
@w{@uref{https://gcc.gnu.org/wiki/OpenACC}} for more information.
@item -fopenacc-dim=@var{geom}
@opindex fopenacc-dim
@cindex OpenACC accelerator programming
Specify default compute dimensions for parallel offload regions that do
not explicitly specify. The @var{geom} value is a triple of
':'-separated sizes, in order 'gang', 'worker' and, 'vector'. A size
can be omitted, to use a target-specific default value.
@item -fopenmp
@opindex fopenmp

View File

@ -5767,11 +5767,12 @@ to use it.
@deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level})
This hook should check the launch dimensions provided for an OpenACC
compute region, or routine. Defaulted values are represented as -1
and non-constant values as 0. The @var{fn_level} is negative for the
and non-constant values as 0. The @var{fn_level} is negative for the
function corresponding to the compute region. For a routine is is the
outermost level at which partitioned execution may be spawned. It
should fill in anything that needs to default to non-unity and verify
non-defaults. Diagnostics should be issued as appropriate. Return
outermost level at which partitioned execution may be spawned. The hook
should verify non-default values. If DECL is NULL, global defaults
are being validated and unspecified defaults should be filled in.
Diagnostics should be issued as appropriate. Return
true, if changes have been made. You must override this hook to
provide dimensions larger than 1.
@end deftypefn

View File

@ -1,3 +1,7 @@
2016-02-02 Nathan Sidwell <nathan@codesourcery.com>
* lang.opt (fopenacc-dim=): New option.
2016-01-31 Paul Thomas <pault@gcc.gnu.org>
PR fortran/67564

View File

@ -578,6 +578,10 @@ fopenacc
Fortran LTO
; Documented in C
fopenacc-dim=
Fortran LTO Joined Var(flag_openacc_dims)
; Documented in C
fopenmp
Fortran LTO
; Documented in C

View File

@ -287,12 +287,25 @@ merge_and_complain (struct cl_decoded_option **decoded_options,
append_option (decoded_options, decoded_options_count, foption);
/* -fmath-errno > -fno-math-errno,
-fsigned-zeros > -fno-signed-zeros,
-ftrapping-math -> -fno-trapping-math,
-ftrapping-math > -fno-trapping-math,
-fwrapv > -fno-wrapv. */
else if (foption->value > (*decoded_options)[j].value)
(*decoded_options)[j] = *foption;
break;
case OPT_fopenacc_dim_:
/* Append or check identical. */
for (j = 0; j < *decoded_options_count; ++j)
if ((*decoded_options)[j].opt_index == foption->opt_index)
break;
if (j == *decoded_options_count)
append_option (decoded_options, decoded_options_count, foption);
else if (strcmp ((*decoded_options)[j].arg, foption->arg))
fatal_error (input_location,
"Option %s with different values",
foption->orig_option_with_args_text);
break;
case OPT_freg_struct_return:
case OPT_fpcc_struct_return:
case OPT_fshort_double:
@ -506,6 +519,7 @@ append_compiler_options (obstack *argv_obstack, struct cl_decoded_option *opts,
case OPT_fwrapv:
case OPT_fopenmp:
case OPT_fopenacc:
case OPT_fopenacc_dim_:
case OPT_fcilkplus:
case OPT_ftrapv:
case OPT_fstrict_overflow:

View File

@ -20238,13 +20238,80 @@ oacc_xform_loop (gcall *call)
gsi_replace_with_seq (&gsi, seq, true);
}
/* Default partitioned and minimum partitioned dimensions. */
static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
/* Parse the default dimension parameter. This is a set of
:-separated optional compute dimensions. Each specified dimension
is a positive integer. When device type support is added, it is
planned to be a comma separated list of such compute dimensions,
with all but the first prefixed by the colon-terminated device
type. */
static void
oacc_parse_default_dims (const char *dims)
{
int ix;
for (ix = GOMP_DIM_MAX; ix--;)
{
oacc_default_dims[ix] = -1;
oacc_min_dims[ix] = 1;
}
#ifndef ACCEL_COMPILER
/* Cannot be overridden on the host. */
dims = NULL;
#endif
if (dims)
{
const char *pos = dims;
for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
{
if (ix)
{
if (*pos != ':')
goto malformed;
pos++;
}
if (*pos != ':')
{
long val;
const char *eptr;
errno = 0;
val = strtol (pos, CONST_CAST (char **, &eptr), 10);
if (errno || val <= 0 || (unsigned)val != val)
goto malformed;
pos = eptr;
oacc_default_dims[ix] = (int)val;
}
}
if (*pos)
{
malformed:
error_at (UNKNOWN_LOCATION,
"-fopenacc-dim operand is malformed at '%s'", pos);
}
}
/* Allow the backend to validate the dimensions. */
targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
}
/* Validate and update the dimensions for offloaded FN. ATTRS is the
raw attribute. DIMS is an array of dimensions, which is filled in.
LEVEL is the partitioning level of a routine, or -1 for an offload
region itself. */
region itself. USED is the mask of partitioned execution in the
function. */
static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level)
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
tree purpose[GOMP_DIM_MAX];
unsigned ix;
@ -20265,11 +20332,29 @@ oacc_validate_dims (tree fn, tree attrs, int *dims, int level)
bool changed = targetm.goacc.validate_dims (fn, dims, level);
/* Default anything left to 1. */
/* Default anything left to 1 or a partitioned default. */
for (ix = 0; ix != GOMP_DIM_MAX; ix++)
if (dims[ix] < 0)
{
dims[ix] = 1;
/* The OpenACC spec says 'If the [num_gangs] clause is not
specified, an implementation-defined default will be used;
the default may depend on the code within the construct.'
(2.5.6). Thus an implementation is free to choose
non-unity default for a parallel region that doesn't have
any gang-partitioned loops. However, it appears that there
is a sufficient body of user code that expects non-gang
partitioned regions to not execute in gang-redundant mode.
So we (a) don't warn about the non-portability and (b) pick
the minimum permissible dimension size when there is no
partitioned execution. Otherwise we pick the global
default for the dimension, which the user can control. The
same wording and logic applies to num_workers and
vector_length, however the worker- or vector- single
execution doesn't have the same impact as gang-redundant
execution. (If the minimum gang-level partioning is not 1,
the target is probably too confusing.) */
dims[ix] = (used & GOMP_DIM_MASK (ix)
? oacc_default_dims[ix] : oacc_min_dims[ix]);
changed = true;
}
@ -20719,14 +20804,15 @@ oacc_loop_process (oacc_loop *loop)
/* Walk the OpenACC loop heirarchy checking and assigning the
programmer-specified partitionings. OUTER_MASK is the partitioning
this loop is contained within. Return true if we contain an
auto-partitionable loop. */
this loop is contained within. Return mask of partitioning
encountered. If any auto loops are discovered, set GOMP_DIM_MAX
bit. */
static bool
static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
unsigned this_mask = loop->mask;
bool has_auto = false;
unsigned mask_all = 0;
bool noisy = true;
#ifdef ACCEL_COMPILER
@ -20760,7 +20846,7 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
}
}
if (auto_par && (loop->flags & OLF_INDEPENDENT))
has_auto = true;
mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
}
if (this_mask & outer_mask)
@ -20814,16 +20900,16 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
}
loop->mask = this_mask;
mask_all |= this_mask;
if (loop->child)
mask_all |= oacc_loop_fixed_partitions (loop->child,
outer_mask | this_mask);
if (loop->child
&& oacc_loop_fixed_partitions (loop->child, outer_mask | this_mask))
has_auto = true;
if (loop->sibling)
mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
if (loop->sibling
&& oacc_loop_fixed_partitions (loop->sibling, outer_mask))
has_auto = true;
return has_auto;
return mask_all;
}
/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
@ -20865,6 +20951,11 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
warning_at (loop->loc, 0,
"insufficient partitioning available to parallelize loop");
if (dump_file)
fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
this_mask);
loop->mask = this_mask;
}
inner_mask |= loop->mask;
@ -20876,13 +20967,19 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
}
/* Walk the OpenACC loop heirarchy to check and assign partitioning
axes. */
axes. Return mask of partitioning. */
static void
static unsigned
oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
{
if (oacc_loop_fixed_partitions (loop, outer_mask))
oacc_loop_auto_partitions (loop, outer_mask);
unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
{
mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
}
return mask_all;
}
/* Default fork/join early expander. Delete the function calls if
@ -20958,6 +21055,13 @@ execute_oacc_device_lower ()
/* Not an offloaded function. */
return 0;
/* Parse the default dim argument exactly once. */
if ((const void *)flag_openacc_dims != &flag_openacc_dims)
{
oacc_parse_default_dims (flag_openacc_dims);
flag_openacc_dims = (char *)&flag_openacc_dims;
}
/* Discover, partition and process the loops. */
oacc_loop *loops = oacc_loop_discovery ();
int fn_level = oacc_fn_attrib_level (attrs);
@ -20969,10 +21073,10 @@ execute_oacc_device_lower ()
: "Function is routine level %d\n", fn_level);
unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
oacc_loop_partition (loops, outer_mask);
unsigned used_mask = oacc_loop_partition (loops, outer_mask);
int dims[GOMP_DIM_MAX];
oacc_validate_dims (current_function_decl, attrs, dims, fn_level);
oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
if (dump_file)
{

View File

@ -1648,11 +1648,12 @@ DEFHOOK
(validate_dims,
"This hook should check the launch dimensions provided for an OpenACC\n\
compute region, or routine. Defaulted values are represented as -1\n\
and non-constant values as 0. The @var{fn_level} is negative for the\n\
and non-constant values as 0. The @var{fn_level} is negative for the\n\
function corresponding to the compute region. For a routine is is the\n\
outermost level at which partitioned execution may be spawned. It\n\
should fill in anything that needs to default to non-unity and verify\n\
non-defaults. Diagnostics should be issued as appropriate. Return\n\
outermost level at which partitioned execution may be spawned. The hook\n\
should verify non-default values. If DECL is NULL, global defaults\n\
are being validated and unspecified defaults should be filled in.\n\
Diagnostics should be issued as appropriate. Return\n\
true, if changes have been made. You must override this hook to\n\
provide dimensions larger than 1.",
bool, (tree decl, int *dims, int fn_level),

View File

@ -1,3 +1,8 @@
2016-02-01 Nathan Sidwell <nathan@codesourcery.com>
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: New.
* testsuite/libgomp.oacc-fortran/routine-7.f90: Serialize loop.
2016-01-26 Tom de Vries <tom@codesourcery.com>
PR tree-optimization/69110

View File

@ -0,0 +1,133 @@
/* { dg-additional-options "-O2 -fopenacc-dim=16:16" } */
#include <openacc.h>
#include <alloca.h>
#include <string.h>
#include <stdio.h>
#pragma acc routine
static int __attribute__ ((noinline)) coord ()
{
int res = 0;
if (acc_on_device (acc_device_nvidia))
{
int g = 0, w = 0, v = 0;
__asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
__asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
__asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
res = (1 << 24) | (g << 16) | (w << 8) | v;
}
return res;
}
int check (const int *ary, int size, int gp, int wp, int vp)
{
int exit = 0;
int ix;
int *gangs = (int *)alloca (gp * sizeof (int));
int *workers = (int *)alloca (wp * sizeof (int));
int *vectors = (int *)alloca (vp * sizeof (int));
int offloaded = 0;
memset (gangs, 0, gp * sizeof (int));
memset (workers, 0, wp * sizeof (int));
memset (vectors, 0, vp * sizeof (int));
for (ix = 0; ix < size; ix++)
{
int g = (ary[ix] >> 16) & 0xff;
int w = (ary[ix] >> 8) & 0xff;
int v = (ary[ix] >> 0) & 0xff;
if (g >= gp || w >= wp || v >= vp)
{
printf ("unexpected cpu %#x used\n", ary[ix]);
exit = 1;
}
else
{
vectors[v]++;
workers[w]++;
gangs[g]++;
}
offloaded += ary[ix] >> 24;
}
if (!offloaded)
return 0;
if (offloaded != size)
{
printf ("offloaded %d times, expected %d\n", offloaded, size);
return 1;
}
for (ix = 0; ix < gp; ix++)
if (gangs[ix] != gangs[0])
{
printf ("gang %d not used %d times\n", ix, gangs[0]);
exit = 1;
}
for (ix = 0; ix < wp; ix++)
if (workers[ix] != workers[0])
{
printf ("worker %d not used %d times\n", ix, workers[0]);
exit = 1;
}
for (ix = 0; ix < vp; ix++)
if (vectors[ix] != vectors[0])
{
printf ("vector %d not used %d times\n", ix, vectors[0]);
exit = 1;
}
return exit;
}
#define N (32 *32*32)
int test_1 (int gp, int wp, int vp)
{
int ary[N];
int exit = 0;
#pragma acc parallel copyout (ary)
{
#pragma acc loop gang (static:1)
for (int ix = 0; ix < N; ix++)
ary[ix] = coord ();
}
exit |= check (ary, N, gp, 1, 1);
#pragma acc parallel copyout (ary)
{
#pragma acc loop worker
for (int ix = 0; ix < N; ix++)
ary[ix] = coord ();
}
exit |= check (ary, N, 1, wp, 1);
#pragma acc parallel copyout (ary)
{
#pragma acc loop vector
for (int ix = 0; ix < N; ix++)
ary[ix] = coord ();
}
exit |= check (ary, N, 1, 1, vp);
return exit;
}
int main ()
{
return test_1 (16, 16, 32);
}

View File

@ -41,7 +41,7 @@ program main
end do
!$acc parallel copy (b)
!$acc loop
!$acc loop seq
do i = 1, N
call worker (b)
end do
@ -56,7 +56,7 @@ program main
end do
!$acc parallel copy (a)
!$acc loop
!$acc loop seq
do i = 1, N
call vector (a)
end do