31dd69b7ff
gcc/testsuite/ * c-c++-common/goacc/deviceptr-4.c: New file. * c-c++-common/goacc/kernels-counter-var-redundant-load.c: Likewise. * c-c++-common/goacc/kernels-loop-data-2.c: Likewise. * c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: Likewise. * c-c++-common/goacc/kernels-loop-data-enter-exit.c: Likewise. * c-c++-common/goacc/kernels-loop-data-update.c: Likewise. * c-c++-common/goacc/kernels-loop-data.c: Likewise. * c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c: Likewise. * c-c++-common/goacc/parallel-reduction.c: Likewise. * c-c++-common/goacc/private-reduction-1.c: Likewise. * gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95: Likewise. * gfortran.dg/goacc/modules.f95: Likewise. * gfortran.dg/goacc/routine-8.f90: Likewise. * gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise. libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Don't force "-O2". * testsuite/libgomp.oacc-c-c++-common/data-2.c: Update. * testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: Likewise. * testsuite/libgomp.oacc-fortran/data-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/data-2.f90: Likewise. * testsuite/libgomp.oacc-c++/non-scalar-data.C: New file. * testsuite/libgomp.oacc-c-c++-common/declare-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/enter-data.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.h: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h: Likewise. * testsuite/libgomp.oacc-fortran/cublas-fixed.h: Likewise. * testsuite/libgomp.oacc-fortran/dummy-array.f90: Likewise. * testsuite/libgomp.oacc-fortran/host_data-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/host_data-3.f: Likewise. * testsuite/libgomp.oacc-fortran/host_data-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-collapse-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-collapse-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-independent.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-loop-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-map-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90: Likewise. * testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-13.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-14.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-15.f90: Likewise. * testsuite/libgomp.oacc-fortran/parallel-loop-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/reference-reductions.f90: Likewise. * testsuite/libgomp.oacc-fortran/vector-routine.f90: Likewise. Co-Authored-By: James Norris <jnorris@codesourcery.com> Co-Authored-By: Julian Brown <julian@codesourcery.com> Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com> Co-Authored-By: Tom de Vries <tom@codesourcery.com> From-SVN: r261884
1166 lines
21 KiB
C
1166 lines
21 KiB
C
/* Miscellaneous test cases for gang/worker/vector mode transitions. */
|
|
|
|
#include <assert.h>
|
|
#include <stdbool.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include <openacc.h>
|
|
|
|
|
|
/* Test basic vector-partitioned mode transitions. */
|
|
|
|
void t1()
|
|
{
|
|
int n = 0, arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(1) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
n++;
|
|
#pragma acc loop vector
|
|
for (j = 0; j < 32; j++)
|
|
arr[j]++;
|
|
n++;
|
|
}
|
|
|
|
assert (n == 2);
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test vector-partitioned, gang-partitioned mode. */
|
|
|
|
void t2()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j, k;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test conditional vector-partitioned loops. */
|
|
|
|
void t3()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j, k;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
if ((j % 2) == 0)
|
|
{
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
}
|
|
else
|
|
{
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]--;
|
|
}
|
|
}
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i % 64) < 32) ? 1 : -1);
|
|
}
|
|
|
|
|
|
/* Test conditions inside vector-partitioned loops. */
|
|
|
|
void t4()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j, k;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
if ((arr[j * 32 + k] % 2) != 0)
|
|
arr[j * 32 + k] *= 2;
|
|
}
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i % 2) == 0 ? i : i * 2));
|
|
}
|
|
|
|
|
|
/* Test conditions inside gang-partitioned/vector-partitioned loops. */
|
|
|
|
void t5()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang vector
|
|
for (j = 0; j < 1024; j++)
|
|
if ((arr[j] % 2) != 0)
|
|
arr[j] *= 2;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i % 2) == 0 ? i : i * 2));
|
|
}
|
|
|
|
|
|
/* Test switch containing vector-partitioned loops inside gang-partitioned
|
|
loops. */
|
|
|
|
void t6()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = i % 5;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j, k;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
switch (n[j])
|
|
{
|
|
case 1:
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k] += 1;
|
|
break;
|
|
|
|
case 2:
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k] += 2;
|
|
break;
|
|
|
|
case 3:
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k] += 3;
|
|
break;
|
|
|
|
case 4:
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k] += 4;
|
|
break;
|
|
|
|
case 5:
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k] += 5;
|
|
break;
|
|
|
|
default:
|
|
abort ();
|
|
}
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == (i % 5) + 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i / 32) % 5) + 1);
|
|
}
|
|
|
|
|
|
/* Test trivial operation of vector-single mode. */
|
|
|
|
void t7()
|
|
{
|
|
int n = 0;
|
|
#pragma acc parallel copy(n) \
|
|
num_gangs(1) num_workers(1) vector_length(32)
|
|
{
|
|
n++;
|
|
}
|
|
assert (n == 1);
|
|
}
|
|
|
|
|
|
/* Test vector-single, gang-partitioned mode. */
|
|
|
|
void t8()
|
|
{
|
|
int arr[1024];
|
|
int gangs;
|
|
|
|
for (gangs = 1; gangs <= 1024; gangs <<= 1)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(gangs) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 1024; j++)
|
|
arr[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test conditions in vector-single mode. */
|
|
|
|
void t9()
|
|
{
|
|
int arr[1024];
|
|
int gangs;
|
|
|
|
for (gangs = 1; gangs <= 1024; gangs <<= 1)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(gangs) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 1024; j++)
|
|
if ((j % 3) == 0)
|
|
arr[j]++;
|
|
else
|
|
arr[j] += 2;
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i % 3) == 0) ? 1 : 2);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test switch in vector-single mode. */
|
|
|
|
void t10()
|
|
{
|
|
int arr[1024];
|
|
int gangs;
|
|
|
|
for (gangs = 1; gangs <= 1024; gangs <<= 1)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(gangs) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 1024; j++)
|
|
switch (j % 5)
|
|
{
|
|
case 0: arr[j] += 1; break;
|
|
case 1: arr[j] += 2; break;
|
|
case 2: arr[j] += 3; break;
|
|
case 3: arr[j] += 4; break;
|
|
case 4: arr[j] += 5; break;
|
|
default: arr[j] += 99;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == (i % 5) + 1);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test switch in vector-single mode, initialise array on device. */
|
|
|
|
void t11()
|
|
{
|
|
int arr[1024];
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 99;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(1024) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
/* This loop and the one following must be distributed to available gangs
|
|
in the same way to ensure data dependencies are not violated (hence the
|
|
"static" clauses). */
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 1024; j++)
|
|
arr[j] = 0;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 1024; j++)
|
|
switch (j % 5)
|
|
{
|
|
case 0: arr[j] += 1; break;
|
|
case 1: arr[j] += 2; break;
|
|
case 2: arr[j] += 3; break;
|
|
case 3: arr[j] += 4; break;
|
|
case 4: arr[j] += 5; break;
|
|
default: arr[j] += 99;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == (i % 5) + 1);
|
|
}
|
|
|
|
|
|
/* Test multiple conditions in vector-single mode. */
|
|
|
|
#define NUM_GANGS 4096
|
|
void t12()
|
|
{
|
|
bool fizz[NUM_GANGS], buzz[NUM_GANGS], fizzbuzz[NUM_GANGS];
|
|
int i;
|
|
|
|
#pragma acc parallel copyout(fizz, buzz, fizzbuzz) \
|
|
num_gangs(NUM_GANGS) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
/* This loop and the one following must be distributed to available gangs
|
|
in the same way to ensure data dependencies are not violated (hence the
|
|
"static" clauses). */
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < NUM_GANGS; j++)
|
|
fizz[j] = buzz[j] = fizzbuzz[j] = 0;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < NUM_GANGS; j++)
|
|
{
|
|
if ((j % 3) == 0 && (j % 5) == 0)
|
|
fizzbuzz[j] = 1;
|
|
else
|
|
{
|
|
if ((j % 3) == 0)
|
|
fizz[j] = 1;
|
|
else if ((j % 5) == 0)
|
|
buzz[j] = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < NUM_GANGS; i++)
|
|
{
|
|
assert (fizzbuzz[i] == ((i % 3) == 0 && (i % 5) == 0));
|
|
assert (fizz[i] == ((i % 3) == 0 && (i % 5) != 0));
|
|
assert (buzz[i] == ((i % 3) != 0 && (i % 5) == 0));
|
|
}
|
|
}
|
|
#undef NUM_GANGS
|
|
|
|
|
|
/* Test worker-partitioned/vector-single mode. */
|
|
|
|
void t13()
|
|
{
|
|
int arr[32 * 8], i;
|
|
|
|
for (i = 0; i < 32 * 8; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 8; k++)
|
|
arr[j * 8 + k] += j * 8 + k;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 8; i++)
|
|
assert (arr[i] == i);
|
|
}
|
|
|
|
|
|
/* Test condition in worker-partitioned mode. */
|
|
|
|
void t14()
|
|
{
|
|
int arr[32 * 32 * 8], i;
|
|
|
|
for (i = 0; i < 32 * 32 * 8; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 8; k++)
|
|
{
|
|
int m;
|
|
if ((k % 2) == 0)
|
|
{
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
arr[j * 32 * 8 + k * 32 + m]++;
|
|
}
|
|
else
|
|
{
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
arr[j * 32 * 8 + k * 32 + m] += 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32 * 8; i++)
|
|
assert (arr[i] == i + ((i / 32) % 2) + 1);
|
|
}
|
|
|
|
|
|
/* Test switch in worker-partitioned mode. */
|
|
|
|
void t15()
|
|
{
|
|
int arr[32 * 32 * 8], i;
|
|
|
|
for (i = 0; i < 32 * 32 * 8; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 8; k++)
|
|
{
|
|
int m;
|
|
switch ((j * 32 + k) % 3)
|
|
{
|
|
case 0:
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
arr[j * 32 * 8 + k * 32 + m]++;
|
|
break;
|
|
|
|
case 1:
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
arr[j * 32 * 8 + k * 32 + m] += 2;
|
|
break;
|
|
|
|
case 2:
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
arr[j * 32 * 8 + k * 32 + m] += 3;
|
|
break;
|
|
|
|
default: ;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32 * 8; i++)
|
|
assert (arr[i] == i + ((i / 32) % 3) + 1);
|
|
}
|
|
|
|
|
|
/* Test worker-single/worker-partitioned transitions. */
|
|
|
|
void t16()
|
|
{
|
|
int n[32], arr[32 * 32], i;
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
arr[i] = 0;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(8) num_workers(16) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
n[j]++;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
n[j]++;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
n[j]++;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
n[j]++;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 4);
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (arr[i] == 3);
|
|
}
|
|
|
|
|
|
/* Test correct synchronisation between worker-partitioned loops. */
|
|
|
|
void t17()
|
|
{
|
|
int arr_a[32 * 32], arr_b[32 * 32], i;
|
|
int num_workers, num_gangs;
|
|
|
|
for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
|
|
for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
|
|
{
|
|
for (i = 0; i < 32 * 32; i++)
|
|
arr_a[i] = i;
|
|
|
|
#pragma acc parallel copyin(arr_a) copyout(arr_b) \
|
|
num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr_a[j * 32 + (31 - k)] = arr_b[j * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (arr_b[i] == (i ^ 31) * 8);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test correct synchronisation between worker+vector-partitioned loops. */
|
|
|
|
void t18()
|
|
{
|
|
int arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
|
|
int num_workers, num_gangs;
|
|
|
|
for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
|
|
for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
|
|
{
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
arr_a[i] = i;
|
|
|
|
#pragma acc parallel copyin(arr_a) copyout(arr_b) \
|
|
num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
#pragma acc loop worker vector
|
|
for (k = 0; k < 32 * 32; k++)
|
|
arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker vector
|
|
for (k = 0; k < 32 * 32; k++)
|
|
arr_a[j * 32 * 32 + (1023 - k)] = arr_b[j * 32 * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker vector
|
|
for (k = 0; k < 32 * 32; k++)
|
|
arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
assert (arr_b[i] == (i ^ 1023) * 8);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test correct synchronisation between vector-partitioned loops in
|
|
worker-partitioned mode. */
|
|
|
|
void t19()
|
|
{
|
|
int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
|
|
int num_workers, num_gangs;
|
|
|
|
for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
|
|
for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
|
|
{
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
arr_a[i] = i;
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
|
|
num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
{
|
|
int m;
|
|
|
|
n[j * 32 + k]++;
|
|
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
{
|
|
if (((j * 1024 + k * 32 + m) % 2) == 0)
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 2;
|
|
else
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 3;
|
|
}
|
|
|
|
/* Test returning to vector-single mode... */
|
|
n[j * 32 + k]++;
|
|
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
{
|
|
if (((j * 1024 + k * 32 + m) % 3) == 0)
|
|
arr_a[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_b[j * 1024 + k * 32 + m] * 5;
|
|
else
|
|
arr_a[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_b[j * 1024 + k * 32 + m] * 7;
|
|
}
|
|
|
|
/* ...and back-to-back vector loops. */
|
|
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
{
|
|
if (((j * 1024 + k * 32 + m) % 2) == 0)
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 3;
|
|
else
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
{
|
|
int m = 6 * ((i % 3) == 0 ? 5 : 7);
|
|
assert (arr_b[i] == (i ^ 31) * m);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* With -O0, variables are on the stack, not in registers. Check that worker
|
|
state propagation handles the stack frame. */
|
|
|
|
void t20()
|
|
{
|
|
int w0 = 0;
|
|
int w1 = 0;
|
|
int w2 = 0;
|
|
int w3 = 0;
|
|
int w4 = 0;
|
|
int w5 = 0;
|
|
int w6 = 0;
|
|
int w7 = 0;
|
|
|
|
int i;
|
|
|
|
#pragma acc parallel copy (w0, w1, w2, w3, w4, w5, w6, w7) \
|
|
num_gangs (1) num_workers (8)
|
|
{
|
|
int internal = 100;
|
|
|
|
#pragma acc loop worker
|
|
for (i = 0; i < 8; i++)
|
|
{
|
|
switch (i)
|
|
{
|
|
case 0: w0 = internal; break;
|
|
case 1: w1 = internal; break;
|
|
case 2: w2 = internal; break;
|
|
case 3: w3 = internal; break;
|
|
case 4: w4 = internal; break;
|
|
case 5: w5 = internal; break;
|
|
case 6: w6 = internal; break;
|
|
case 7: w7 = internal; break;
|
|
default: break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (w0 != 100
|
|
|| w1 != 100
|
|
|| w2 != 100
|
|
|| w3 != 100
|
|
|| w4 != 100
|
|
|| w5 != 100
|
|
|| w6 != 100
|
|
|| w7 != 100)
|
|
__builtin_abort ();
|
|
}
|
|
|
|
|
|
/* Test worker-single/vector-single mode. */
|
|
|
|
void t21()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
arr[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test worker-single/vector-single mode. */
|
|
|
|
void t22()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j]++;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test condition in worker-single/vector-single mode. */
|
|
|
|
void t23()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
if ((arr[j] % 2) != 0)
|
|
arr[j]++;
|
|
else
|
|
arr[j] += 2;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == ((i % 2) != 0) ? i + 1 : i + 2);
|
|
}
|
|
|
|
|
|
/* Test switch in worker-single/vector-single mode. */
|
|
|
|
void t24()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
switch (arr[j] % 5)
|
|
{
|
|
case 0: arr[j] += 1; break;
|
|
case 1: arr[j] += 2; break;
|
|
case 2: arr[j] += 3; break;
|
|
case 3: arr[j] += 4; break;
|
|
case 4: arr[j] += 5; break;
|
|
default: arr[j] += 99;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == i + (i % 5) + 1);
|
|
}
|
|
|
|
|
|
/* Test worker-single/vector-partitioned mode. */
|
|
|
|
void t25()
|
|
{
|
|
int arr[32 * 32], i;
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j * 32 + k]++;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (arr[i] == i + 1);
|
|
}
|
|
|
|
|
|
/* Test multiple conditional vector-partitioned loops in worker-single
|
|
mode. */
|
|
|
|
void t26()
|
|
{
|
|
int arr[32 * 32], i;
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
if ((j % 3) == 0)
|
|
{
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j * 32 + k] += 3;
|
|
}
|
|
}
|
|
else if ((j % 3) == 1)
|
|
{
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j * 32 + k] += 7;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
{
|
|
int j = (i / 32) % 3;
|
|
assert (arr[i] == i + ((j == 0) ? 3 : (j == 1) ? 7 : 0));
|
|
}
|
|
}
|
|
|
|
|
|
/* Test worker-single, vector-partitioned, gang-redundant mode. */
|
|
|
|
#define ACTUAL_GANGS 8
|
|
void t27()
|
|
{
|
|
int n, arr[32], i;
|
|
int ondev;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
n = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) copyout(ondev) \
|
|
num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
ondev = acc_on_device (acc_device_not_host);
|
|
|
|
#pragma acc atomic
|
|
n++;
|
|
|
|
#pragma acc loop vector
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j] += 1;
|
|
}
|
|
|
|
#pragma acc atomic
|
|
n++;
|
|
}
|
|
|
|
int m = ondev ? ACTUAL_GANGS : 1;
|
|
|
|
assert (n == m * 2);
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == m);
|
|
}
|
|
#undef ACTUAL_GANGS
|
|
|
|
|
|
/* Check if worker-single variables get broadcastd to vectors. */
|
|
|
|
#pragma acc routine
|
|
float t28_routine ()
|
|
{
|
|
return 2.71;
|
|
}
|
|
|
|
#define N 32
|
|
void t28()
|
|
{
|
|
float threads[N], v1 = 3.14;
|
|
|
|
for (int i = 0; i < N; i++)
|
|
threads[i] = -1;
|
|
|
|
#pragma acc parallel num_gangs (1) vector_length (32) copy (v1)
|
|
{
|
|
float val = t28_routine ();
|
|
|
|
#pragma acc loop vector
|
|
for (int i = 0; i < N; i++)
|
|
threads[i] = val + v1*i;
|
|
}
|
|
|
|
for (int i = 0; i < N; i++)
|
|
assert (fabs (threads[i] - (t28_routine () + v1*i)) < 0.0001);
|
|
}
|
|
#undef N
|
|
|
|
|
|
int main()
|
|
{
|
|
t1();
|
|
t2();
|
|
t3();
|
|
t4();
|
|
t5();
|
|
t6();
|
|
t7();
|
|
t8();
|
|
t9();
|
|
t10();
|
|
t11();
|
|
t12();
|
|
t13();
|
|
t14();
|
|
t15();
|
|
t16();
|
|
t17();
|
|
t18();
|
|
t19();
|
|
t20();
|
|
t21();
|
|
t22();
|
|
t23();
|
|
t24();
|
|
t25();
|
|
t26();
|
|
t27();
|
|
t28();
|
|
|
|
return 0;
|
|
}
|