omp-low.c (scan_sharing_clauses): Accept INDEPENDENT, AUTO & SEQ.

gcc/
	* gcc/omp-low.c (scan_sharing_clauses): Accept INDEPENDENT, AUTO &
	SEQ.
	(oacc_loop_fixed_partitions): Correct return type to bool.
	(oacc_loop_auto_partitions): New.
	(oacc_loop_partition): Take mask argument, call
	oacc_loop_auto_partitions.
	(execute_oacc_device_lower): Provide mask to oacc_loop_partition.

	gcc/testsuite/
	* c-c++-common/goacc/loop-auto-1.c: New.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: New.

From-SVN: r230354
This commit is contained in:
Nathan Sidwell 2015-11-13 21:51:32 +00:00 committed by Nathan Sidwell
parent 7bcc3c8608
commit c5a64cfec7
6 changed files with 532 additions and 17 deletions

View File

@ -1,3 +1,13 @@
2015-11-13 Nathan Sidwell <nathan@codesourcery.com>
* gcc/omp-low.c (scan_sharing_clauses): Accept INDEPENDENT, AUTO &
SEQ.
(oacc_loop_fixed_partitions): Correct return type to bool.
(oacc_loop_auto_partitions): New.
(oacc_loop_partition): Take mask argument, call
oacc_loop_auto_partitions.
(execute_oacc_device_lower): Provide mask to oacc_loop_partition.
2015-11-13 Michael Meissner <meissner@linux.vnet.ibm.com>
* config/rs6000/constraints.md (we constraint): New constraint for

View File

@ -2124,6 +2124,9 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
case OMP_CLAUSE_WORKER:
case OMP_CLAUSE_VECTOR:
case OMP_CLAUSE_TILE:
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
break;
case OMP_CLAUSE_ALIGNED:
@ -2136,9 +2139,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
case OMP_CLAUSE_DEVICE_RESIDENT:
case OMP_CLAUSE_USE_DEVICE:
case OMP_CLAUSE__CACHE_:
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
sorry ("Clause not supported yet");
break;
@ -2299,14 +2299,14 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
case OMP_CLAUSE_WORKER:
case OMP_CLAUSE_VECTOR:
case OMP_CLAUSE_TILE:
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
break;
case OMP_CLAUSE_DEVICE_RESIDENT:
case OMP_CLAUSE_USE_DEVICE:
case OMP_CLAUSE__CACHE_:
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
sorry ("Clause not supported yet");
break;
@ -19230,10 +19230,10 @@ oacc_loop_process (oacc_loop *loop)
/* Walk the OpenACC loop heirarchy checking and assigning the
programmer-specified partitionings. OUTER_MASK is the partitioning
this loop is contained within. Return partitiong mask used within
this loop nest. */
this loop is contained within. Return true if we contain an
auto-partitionable loop. */
static unsigned
static bool
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
unsigned this_mask = loop->mask;
@ -19337,18 +19337,63 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
return has_auto;
}
/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
OUTER_MASK is the partitioning this loop is contained within.
Return the cumulative partitioning used by this loop, siblings and
children. */
static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
{
unsigned inner_mask = 0;
bool noisy = true;
#ifdef ACCEL_COMPILER
/* When device_type is supported, we want the device compiler to be
noisy, if the loop parameters are device_type-specific. */
noisy = false;
#endif
if (loop->child)
inner_mask |= oacc_loop_auto_partitions (loop->child,
outer_mask | loop->mask);
if ((loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT))
{
unsigned this_mask = 0;
/* Determine the outermost partitioning used within this loop. */
this_mask = inner_mask | GOMP_DIM_MASK (GOMP_DIM_MAX);
this_mask = (this_mask & -this_mask);
/* Pick the partitioning just inside that one. */
this_mask >>= 1;
/* And avoid picking one use by an outer loop. */
this_mask &= ~outer_mask;
if (!this_mask && noisy)
warning_at (loop->loc, 0,
"insufficient partitioning available to parallelize loop");
loop->mask = this_mask;
}
inner_mask |= loop->mask;
if (loop->sibling)
inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
return inner_mask;
}
/* Walk the OpenACC loop heirarchy to check and assign partitioning
axes. */
static void
oacc_loop_partition (oacc_loop *loop, int fn_level)
oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
{
unsigned outer_mask = 0;
if (fn_level >= 0)
outer_mask = GOMP_DIM_MASK (fn_level) - 1;
oacc_loop_fixed_partitions (loop, outer_mask);
if (oacc_loop_fixed_partitions (loop, outer_mask))
oacc_loop_auto_partitions (loop, outer_mask);
}
/* Default fork/join early expander. Delete the function calls if
@ -19429,7 +19474,8 @@ execute_oacc_device_lower ()
/* Discover, partition and process the loops. */
oacc_loop *loops = oacc_loop_discovery ();
oacc_loop_partition (loops, fn_level);
unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
oacc_loop_partition (loops, outer_mask);
oacc_loop_process (loops);
if (dump_file)
{

View File

@ -1,5 +1,7 @@
2015-11-13 Nathan Sidwell <nathan@codesourcery.com>
* c-c++-common/goacc/loop-auto-1.c: New.
* lib/target-supports.exp (check_effective_target_offload_nvptx): New.
* gcc.dg/goacc/nvptx-merged-loop.c: New.

View File

@ -0,0 +1,230 @@
void Foo ()
{
#pragma acc parallel num_gangs(10) num_workers(32) vector_length(32)
{
#pragma acc loop vector
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop seq
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++) {}
}
#pragma acc loop worker
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop vector
for (int kx = 0; kx < 10; kx++) {}
}
}
#pragma acc loop gang
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
for (int kx = 0; kx < 10; kx++) {}
}
#pragma acc loop worker
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
for (int kx = 0; kx < 10; kx++) {}
}
#pragma acc loop vector
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int kx = 0; kx < 10; kx++) {}
}
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop vector
for (int kx = 0; kx < 10; kx++) {}
}
}
#pragma acc loop auto
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
for (int kx = 0; kx < 10; kx++) {}
}
}
}
}
#pragma acc routine gang
void Gang (void)
{
#pragma acc loop vector
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop seq
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++) {}
}
#pragma acc loop worker
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop vector
for (int kx = 0; kx < 10; kx++) {}
}
}
#pragma acc loop gang
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
for (int kx = 0; kx < 10; kx++) {}
}
#pragma acc loop worker
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
for (int kx = 0; kx < 10; kx++) {}
}
#pragma acc loop vector
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int kx = 0; kx < 10; kx++) {}
}
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop vector
for (int kx = 0; kx < 10; kx++) {}
}
}
#pragma acc loop auto
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
for (int kx = 0; kx < 10; kx++) {}
}
}
}
#pragma acc routine worker
void Worker (void)
{
#pragma acc loop vector
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop seq
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++) {}
}
#pragma acc loop worker
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop vector
for (int kx = 0; kx < 10; kx++) {}
}
}
#pragma acc loop auto
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++)
{
#pragma acc loop auto
for (int kx = 0; kx < 10; kx++) {}
}
}
}
#pragma acc routine vector
void Vector (void)
{
#pragma acc loop vector
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop seq
for (int jx = 0; jx < 10; jx++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int jx = 0; jx < 10; jx++) {}
}
#pragma acc loop auto
for (int ix = 0; ix < 10; ix++) {}
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int ix = 0; ix < 10; ix++)
{
#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
}
}
#pragma acc routine seq
void Seq (void)
{
#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int ix = 0; ix < 10; ix++) {}
}

View File

@ -1,5 +1,7 @@
2015-11-13 Nathan Sidwell <nathan@codesourcery.com>
* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: New.
* testsuite/libgomp.oacc-c-c++-common/collapse-2.c: Sequential
loop is sequential.

View File

@ -0,0 +1,225 @@
/* { dg-do run } */
/* { dg-additional-options "-O2" */
#include <stdio.h>
#include <openacc.h>
int check (const int *ary, int size, int gp, int wp, int vp)
{
int exit = 0;
int ix;
int gangs[32], workers[32], vectors[32];
for (ix = 0; ix < 32; ix++)
gangs[ix] = workers[ix] = vectors[ix] = 0;
for (ix = 0; ix < size; ix++)
{
vectors[ary[ix] & 0xff]++;
workers[(ary[ix] >> 8) & 0xff]++;
gangs[(ary[ix] >> 16) & 0xff]++;
}
for (ix = 0; ix < 32; ix++)
{
if (gp)
{
int expect = gangs[0];
if (gangs[ix] != expect)
{
exit = 1;
printf ("gang %d not used %d times\n", ix, expect);
}
}
else if (ix && gangs[ix])
{
exit = 1;
printf ("gang %d unexpectedly used\n", ix);
}
if (wp)
{
int expect = workers[0];
if (workers[ix] != expect)
{
exit = 1;
printf ("worker %d not used %d times\n", ix, expect);
}
}
else if (ix && workers[ix])
{
exit = 1;
printf ("worker %d unexpectedly used\n", ix);
}
if (vp)
{
int expect = vectors[0];
if (vectors[ix] != expect)
{
exit = 1;
printf ("vector %d not used %d times\n", ix, expect);
}
}
else if (ix && vectors[ix])
{
exit = 1;
printf ("vector %d unexpectedly used\n", ix);
}
}
return exit;
}
#pragma acc routine seq
static int __attribute__((noinline)) place ()
{
int r = 0;
if (acc_on_device (acc_device_nvidia))
{
int g = 0, w = 0, v = 0;
__asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
__asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
__asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
r = (g << 16) | (w << 8) | v;
}
return r;
}
static void clear (int *ary, int size)
{
int ix;
for (ix = 0; ix < size; ix++)
ary[ix] = -1;
}
int vector_1 (int *ary, int size)
{
clear (ary, size);
#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
#pragma acc loop auto
for (int ix = 0; ix < size; ix++)
ary[ix] = place ();
}
return check (ary, size, 0, 0, 1);
}
int vector_2 (int *ary, int size)
{
clear (ary, size);
#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
#pragma acc loop worker
for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop auto
for (int ix = 0; ix < 64; ix++)
ary[ix + jx * 64] = place ();
}
return check (ary, size, 0, 1, 1);
}
int worker_1 (int *ary, int size)
{
clear (ary, size);
#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
#pragma acc loop auto
for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop vector
for (int ix = 0; ix < 64; ix++)
ary[ix + jx * 64] = place ();
}
return check (ary, size, 0, 1, 1);
}
int worker_2 (int *ary, int size)
{
clear (ary, size);
#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
#pragma acc loop auto
for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop auto
for (int ix = 0; ix < 64; ix++)
ary[ix + jx * 64] = place ();
}
return check (ary, size, 0, 1, 1);
}
int gang_1 (int *ary, int size)
{
clear (ary, size);
#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
#pragma acc loop auto
for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop worker
for (int ix = 0; ix < 64; ix++)
ary[ix + jx * 64] = place ();
}
return check (ary, size, 1, 1, 0);
}
int gang_2 (int *ary, int size)
{
clear (ary, size);
#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
{
#pragma acc loop auto
for (int kx = 0; kx < size / (32 * 32); kx++)
#pragma acc loop auto
for (int jx = 0; jx < 32; jx++)
#pragma acc loop auto
for (int ix = 0; ix < 32; ix++)
ary[ix + jx * 32 + kx * 32 * 32] = place ();
}
return check (ary, size, 1, 1, 1);
}
#define N (32*32*32)
int main ()
{
int ondev = 0;
#pragma acc parallel copy(ondev)
{
ondev = acc_on_device (acc_device_not_host);
}
if (!ondev)
return 0;
int ary[N];
if (vector_1 (ary, N))
return 1;
if (vector_2 (ary, N))
return 1;
if (worker_1 (ary, N))
return 1;
if (worker_2 (ary, N))
return 1;
if (gang_1 (ary, N))
return 1;
if (gang_2 (ary, N))
return 1;
return 0;
}