2620c80db0
gcc/testsuite/ * c-c++-common/goacc/combined-directives.c: Clean up dg-* directives. * c-c++-common/goacc/loop-clauses.c: Likewise. * g++.dg/goacc/template.C: Likewise. * gfortran.dg/goacc/combined-directives.f90: Likewise. * gfortran.dg/goacc/loop-1.f95: Likewise. * gfortran.dg/goacc/loop-5.f95: Likewise. * gfortran.dg/goacc/loop-6.f95: Likewise. * gfortran.dg/goacc/loop-tree-1.f90: Likewise. * c-c++-common/goacc-gomp/nesting-1.c: Update. * c-c++-common/goacc-gomp/nesting-fail-1.c: Likewise. * c-c++-common/goacc/clauses-fail.c: Likewise. * c-c++-common/goacc/parallel-1.c: Likewise. * c-c++-common/goacc/reduction-1.c: Likewise. * c-c++-common/goacc/reduction-2.c: Likewise. * c-c++-common/goacc/reduction-3.c: Likewise. * c-c++-common/goacc/reduction-4.c: Likewise. * c-c++-common/goacc/routine-3.c: Likewise. * c-c++-common/goacc/routine-4.c: Likewise. * c-c++-common/goacc/routine-5.c: Likewise. * c-c++-common/goacc/tile.c: Likewise. * g++.dg/goacc/template.C: Likewise. * gfortran.dg/goacc/combined-directives.f90: Likewise. * c-c++-common/goacc/nesting-1.c: Move dg-error test cases into... * c-c++-common/goacc/nesting-fail-1.c: ... this file. Update. * c-c++-common/goacc/kernels-1.c: Update. Incorporate... * c-c++-common/goacc/kernels-empty.c: ... this file, and... * c-c++-common/goacc/kernels-eternal.c: ... this file, and... * c-c++-common/goacc/kernels-noreturn.c: ... this file. * c-c++-common/goacc/host_data-1.c: New file. Incorporate... * c-c++-common/goacc/use_device-1.c: ... this file. * c-c++-common/goacc/host_data-2.c: New file. Incorporate... * c-c++-common/goacc/host_data-5.c: ... this file, and... * c-c++-common/goacc/host_data-6.c: ... this file. * c-c++-common/goacc/loop-2-kernels.c: New file. * c-c++-common/goacc/loop-2-parallel.c: Likewise. * c-c++-common/goacc/loop-3.c: Likewise. * g++.dg/goacc/reference.C: Likewise. * g++.dg/goacc/routine-1.C: Likewise. * g++.dg/goacc/routine-2.C: Likewise. libgomp/ * testsuite/libgomp.oacc-c-c++-common/clauses-1.c: Update. * testsuite/libgomp.oacc-c-c++-common/deviceptr-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/if-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/vector-loop.c: Likewise. * testsuite/libgomp.oacc-fortran/asyncwait-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/asyncwait-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/asyncwait-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/declare-1.f90: Likewise. * testsuite/libgomp.oacc-c-c++-common/asyncwait-1.c: Likewise. XFAIL. * testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c: Update. Incorporate... * testsuite/libgomp.oacc-c-c++-common/firstprivate-2.c: ... this file. * testsuite/libgomp.oacc-c++/template-reduction.C: New file. * testsuite/libgomp.oacc-c-c++-common/gang-static-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/gang-static-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-loop-clauses.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/private-variables.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Likewise. * testsuite/libgomp.oacc-fortran/clauses-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/default-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/firstprivate-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/gang-static-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/if-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/implicit-firstprivate-ref.f90: Likewise. * testsuite/libgomp.oacc-fortran/pr68813.f90: Likewise. * testsuite/libgomp.oacc-fortran/private-variables.f90: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-1.c: Merge this file... * testsuite/libgomp.oacc-c-c++-common/parallel-1.c: ..., and this file into... * testsuite/libgomp.oacc-c-c++-common/data-clauses.h: ... this new file. Update. * testsuite/libgomp.oacc-c-c++-common/data-clauses-kernels.c: New file. * testsuite/libgomp.oacc-c-c++-common/data-clauses-parallel.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-2.c: Rename to... * testsuite/libgomp.oacc-c-c++-common/data-clauses-kernels-ipa-pta.c: ... this new file. Update. * testsuite/libgomp.oacc-c-c++-common/parallel-2.c: Rename to... * testsuite/libgomp.oacc-c-c++-common/data-clauses-parallel-ipa-pta.c: ... this new file. Update. * testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: New file. Incorporate... * testsuite/libgomp.oacc-c-c++-common/worker-single-1a.c: ... this file, and... * testsuite/libgomp.oacc-c-c++-common/worker-single-4.c: ... this file, and... * testsuite/libgomp.oacc-c-c++-common/worker-single-6.c: ... this file. * testsuite/libgomp.oacc-c-c++-common/update-1-2.c: Remove file. Co-Authored-By: Cesar Philippidis <cesar@codesourcery.com> Co-Authored-By: Chung-Lin Tang <cltang@codesourcery.com> Co-Authored-By: James Norris <jnorris@codesourcery.com> Co-Authored-By: Julian Brown <julian@codesourcery.com> Co-Authored-By: Nathan Sidwell <nathan@codesourcery.com> Co-Authored-By: Tom de Vries <tom@codesourcery.com> From-SVN: r234575
896 lines
16 KiB
C
896 lines
16 KiB
C
/* Miscellaneous test cases for gang/worker/vector mode transitions. */
|
|
|
|
#include <assert.h>
|
|
#include <stdbool.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include <openacc.h>
|
|
|
|
|
|
/* Test basic vector-partitioned mode transitions. */
|
|
|
|
void t1()
|
|
{
|
|
int n = 0, arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(1) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
n++;
|
|
#pragma acc loop vector
|
|
for (j = 0; j < 32; j++)
|
|
arr[j]++;
|
|
n++;
|
|
}
|
|
|
|
assert (n == 2);
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test vector-partitioned, gang-partitioned mode. */
|
|
|
|
void t2()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j, k;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test conditions inside vector-partitioned loops. */
|
|
|
|
void t4()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j, k;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
if ((arr[j * 32 + k] % 2) != 0)
|
|
arr[j * 32 + k] *= 2;
|
|
}
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i % 2) == 0 ? i : i * 2));
|
|
}
|
|
|
|
|
|
/* Test conditions inside gang-partitioned/vector-partitioned loops. */
|
|
|
|
void t5()
|
|
{
|
|
int n[32], arr[1024], i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(32) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
|
|
#pragma acc loop gang vector
|
|
for (j = 0; j < 1024; j++)
|
|
if ((arr[j] % 2) != 0)
|
|
arr[j] *= 2;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 32; j++)
|
|
n[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i % 2) == 0 ? i : i * 2));
|
|
}
|
|
|
|
|
|
/* Test trivial operation of vector-single mode. */
|
|
|
|
void t7()
|
|
{
|
|
int n = 0;
|
|
#pragma acc parallel copy(n) \
|
|
num_gangs(1) num_workers(1) vector_length(32)
|
|
{
|
|
n++;
|
|
}
|
|
assert (n == 1);
|
|
}
|
|
|
|
|
|
/* Test vector-single, gang-partitioned mode. */
|
|
|
|
void t8()
|
|
{
|
|
int arr[1024];
|
|
int gangs;
|
|
|
|
for (gangs = 1; gangs <= 1024; gangs <<= 1)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(gangs) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 1024; j++)
|
|
arr[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test conditions in vector-single mode. */
|
|
|
|
void t9()
|
|
{
|
|
int arr[1024];
|
|
int gangs;
|
|
|
|
for (gangs = 1; gangs <= 1024; gangs <<= 1)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(gangs) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 1024; j++)
|
|
if ((j % 3) == 0)
|
|
arr[j]++;
|
|
else
|
|
arr[j] += 2;
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == ((i % 3) == 0) ? 1 : 2);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test switch in vector-single mode. */
|
|
|
|
void t10()
|
|
{
|
|
int arr[1024];
|
|
int gangs;
|
|
|
|
for (gangs = 1; gangs <= 1024; gangs <<= 1)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(gangs) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 1024; j++)
|
|
switch (j % 5)
|
|
{
|
|
case 0: arr[j] += 1; break;
|
|
case 1: arr[j] += 2; break;
|
|
case 2: arr[j] += 3; break;
|
|
case 3: arr[j] += 4; break;
|
|
case 4: arr[j] += 5; break;
|
|
default: arr[j] += 99;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == (i % 5) + 1);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test switch in vector-single mode, initialise array on device. */
|
|
|
|
void t11()
|
|
{
|
|
int arr[1024];
|
|
int i;
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
arr[i] = 99;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(1024) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
/* This loop and the one following must be distributed to available gangs
|
|
in the same way to ensure data dependencies are not violated (hence the
|
|
"static" clauses). */
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 1024; j++)
|
|
arr[j] = 0;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < 1024; j++)
|
|
switch (j % 5)
|
|
{
|
|
case 0: arr[j] += 1; break;
|
|
case 1: arr[j] += 2; break;
|
|
case 2: arr[j] += 3; break;
|
|
case 3: arr[j] += 4; break;
|
|
case 4: arr[j] += 5; break;
|
|
default: arr[j] += 99;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 1024; i++)
|
|
assert (arr[i] == (i % 5) + 1);
|
|
}
|
|
|
|
|
|
/* Test multiple conditions in vector-single mode. */
|
|
|
|
#define NUM_GANGS 4096
|
|
void t12()
|
|
{
|
|
bool fizz[NUM_GANGS], buzz[NUM_GANGS], fizzbuzz[NUM_GANGS];
|
|
int i;
|
|
|
|
#pragma acc parallel copyout(fizz, buzz, fizzbuzz) \
|
|
num_gangs(NUM_GANGS) num_workers(1) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
/* This loop and the one following must be distributed to available gangs
|
|
in the same way to ensure data dependencies are not violated (hence the
|
|
"static" clauses). */
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < NUM_GANGS; j++)
|
|
fizz[j] = buzz[j] = fizzbuzz[j] = 0;
|
|
|
|
#pragma acc loop gang(static:*)
|
|
for (j = 0; j < NUM_GANGS; j++)
|
|
{
|
|
if ((j % 3) == 0 && (j % 5) == 0)
|
|
fizzbuzz[j] = 1;
|
|
else
|
|
{
|
|
if ((j % 3) == 0)
|
|
fizz[j] = 1;
|
|
else if ((j % 5) == 0)
|
|
buzz[j] = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < NUM_GANGS; i++)
|
|
{
|
|
assert (fizzbuzz[i] == ((i % 3) == 0 && (i % 5) == 0));
|
|
assert (fizz[i] == ((i % 3) == 0 && (i % 5) != 0));
|
|
assert (buzz[i] == ((i % 3) != 0 && (i % 5) == 0));
|
|
}
|
|
}
|
|
#undef NUM_GANGS
|
|
|
|
|
|
/* Test worker-partitioned/vector-single mode. */
|
|
|
|
void t13()
|
|
{
|
|
int arr[32 * 8], i;
|
|
|
|
for (i = 0; i < 32 * 8; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 8; k++)
|
|
arr[j * 8 + k] += j * 8 + k;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 8; i++)
|
|
assert (arr[i] == i);
|
|
}
|
|
|
|
|
|
/* Test worker-single/worker-partitioned transitions. */
|
|
|
|
void t16()
|
|
{
|
|
int n[32], arr[32 * 32], i;
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
arr[i] = 0;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) \
|
|
num_gangs(8) num_workers(16) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
n[j]++;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
n[j]++;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
n[j]++;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr[j * 32 + k]++;
|
|
|
|
n[j]++;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (n[i] == 4);
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (arr[i] == 3);
|
|
}
|
|
|
|
|
|
/* Test correct synchronisation between worker-partitioned loops. */
|
|
|
|
void t17()
|
|
{
|
|
int arr_a[32 * 32], arr_b[32 * 32], i;
|
|
int num_workers, num_gangs;
|
|
|
|
for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
|
|
for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
|
|
{
|
|
for (i = 0; i < 32 * 32; i++)
|
|
arr_a[i] = i;
|
|
|
|
#pragma acc parallel copyin(arr_a) copyout(arr_b) \
|
|
num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr_a[j * 32 + (31 - k)] = arr_b[j * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (arr_b[i] == (i ^ 31) * 8);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test correct synchronisation between worker+vector-partitioned loops. */
|
|
|
|
void t18()
|
|
{
|
|
int arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
|
|
int num_workers, num_gangs;
|
|
|
|
for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
|
|
for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
|
|
{
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
arr_a[i] = i;
|
|
|
|
#pragma acc parallel copyin(arr_a) copyout(arr_b) \
|
|
num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
#pragma acc loop worker vector
|
|
for (k = 0; k < 32 * 32; k++)
|
|
arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker vector
|
|
for (k = 0; k < 32 * 32; k++)
|
|
arr_a[j * 32 * 32 + (1023 - k)] = arr_b[j * 32 * 32 + k] * 2;
|
|
|
|
#pragma acc loop worker vector
|
|
for (k = 0; k < 32 * 32; k++)
|
|
arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
assert (arr_b[i] == (i ^ 1023) * 8);
|
|
}
|
|
}
|
|
|
|
|
|
/* Test correct synchronisation between vector-partitioned loops in
|
|
worker-partitioned mode. */
|
|
|
|
void t19()
|
|
{
|
|
int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
|
|
int num_workers, num_gangs;
|
|
|
|
for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
|
|
for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
|
|
{
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
arr_a[i] = i;
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
n[i] = 0;
|
|
|
|
#pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
|
|
num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
|
|
#pragma acc loop worker
|
|
for (k = 0; k < 32; k++)
|
|
{
|
|
int m;
|
|
|
|
n[j * 32 + k]++;
|
|
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
{
|
|
if (((j * 1024 + k * 32 + m) % 2) == 0)
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 2;
|
|
else
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 3;
|
|
}
|
|
|
|
/* Test returning to vector-single mode... */
|
|
n[j * 32 + k]++;
|
|
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
{
|
|
if (((j * 1024 + k * 32 + m) % 3) == 0)
|
|
arr_a[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_b[j * 1024 + k * 32 + m] * 5;
|
|
else
|
|
arr_a[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_b[j * 1024 + k * 32 + m] * 7;
|
|
}
|
|
|
|
/* ...and back-to-back vector loops. */
|
|
|
|
#pragma acc loop vector
|
|
for (m = 0; m < 32; m++)
|
|
{
|
|
if (((j * 1024 + k * 32 + m) % 2) == 0)
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 3;
|
|
else
|
|
arr_b[j * 1024 + k * 32 + (31 - m)]
|
|
= arr_a[j * 1024 + k * 32 + m] * 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (n[i] == 2);
|
|
|
|
for (i = 0; i < 32 * 32 * 32; i++)
|
|
{
|
|
int m = 6 * ((i % 3) == 0 ? 5 : 7);
|
|
assert (arr_b[i] == (i ^ 31) * m);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* With -O0, variables are on the stack, not in registers. Check that worker
|
|
state propagation handles the stack frame. */
|
|
|
|
void t20()
|
|
{
|
|
int w0 = 0;
|
|
int w1 = 0;
|
|
int w2 = 0;
|
|
int w3 = 0;
|
|
int w4 = 0;
|
|
int w5 = 0;
|
|
int w6 = 0;
|
|
int w7 = 0;
|
|
|
|
int i;
|
|
|
|
#pragma acc parallel copy (w0, w1, w2, w3, w4, w5, w6, w7) \
|
|
num_gangs (1) num_workers (8)
|
|
{
|
|
int internal = 100;
|
|
|
|
#pragma acc loop worker
|
|
for (i = 0; i < 8; i++)
|
|
{
|
|
switch (i)
|
|
{
|
|
case 0: w0 = internal; break;
|
|
case 1: w1 = internal; break;
|
|
case 2: w2 = internal; break;
|
|
case 3: w3 = internal; break;
|
|
case 4: w4 = internal; break;
|
|
case 5: w5 = internal; break;
|
|
case 6: w6 = internal; break;
|
|
case 7: w7 = internal; break;
|
|
default: break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (w0 != 100
|
|
|| w1 != 100
|
|
|| w2 != 100
|
|
|| w3 != 100
|
|
|| w4 != 100
|
|
|| w5 != 100
|
|
|| w6 != 100
|
|
|| w7 != 100)
|
|
__builtin_abort ();
|
|
}
|
|
|
|
|
|
/* Test worker-single/vector-single mode. */
|
|
|
|
void t21()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
arr[j]++;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test worker-single/vector-single mode. */
|
|
|
|
void t22()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j]++;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == 1);
|
|
}
|
|
|
|
|
|
/* Test condition in worker-single/vector-single mode. */
|
|
|
|
void t23()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
if ((arr[j] % 2) != 0)
|
|
arr[j]++;
|
|
else
|
|
arr[j] += 2;
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == ((i % 2) != 0) ? i + 1 : i + 2);
|
|
}
|
|
|
|
|
|
/* Test switch in worker-single/vector-single mode. */
|
|
|
|
void t24()
|
|
{
|
|
int arr[32], i;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
switch (arr[j] % 5)
|
|
{
|
|
case 0: arr[j] += 1; break;
|
|
case 1: arr[j] += 2; break;
|
|
case 2: arr[j] += 3; break;
|
|
case 3: arr[j] += 4; break;
|
|
case 4: arr[j] += 5; break;
|
|
default: arr[j] += 99;
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == i + (i % 5) + 1);
|
|
}
|
|
|
|
|
|
/* Test worker-single/vector-partitioned mode. */
|
|
|
|
void t25()
|
|
{
|
|
int arr[32 * 32], i;
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
arr[i] = i;
|
|
|
|
#pragma acc parallel copy(arr) \
|
|
num_gangs(8) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
#pragma acc loop gang
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
int k;
|
|
#pragma acc loop vector
|
|
for (k = 0; k < 32; k++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j * 32 + k]++;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < 32 * 32; i++)
|
|
assert (arr[i] == i + 1);
|
|
}
|
|
|
|
|
|
/* Test worker-single, vector-partitioned, gang-redundant mode. */
|
|
|
|
#define ACTUAL_GANGS 8
|
|
void t27()
|
|
{
|
|
int n, arr[32], i;
|
|
int ondev;
|
|
|
|
for (i = 0; i < 32; i++)
|
|
arr[i] = 0;
|
|
|
|
n = 0;
|
|
|
|
#pragma acc parallel copy(n, arr) copyout(ondev) \
|
|
num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32)
|
|
{
|
|
int j;
|
|
|
|
ondev = acc_on_device (acc_device_not_host);
|
|
|
|
#pragma acc atomic
|
|
n++;
|
|
|
|
#pragma acc loop vector
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
#pragma acc atomic
|
|
arr[j] += 1;
|
|
}
|
|
|
|
#pragma acc atomic
|
|
n++;
|
|
}
|
|
|
|
int m = ondev ? ACTUAL_GANGS : 1;
|
|
|
|
assert (n == m * 2);
|
|
|
|
for (i = 0; i < 32; i++)
|
|
assert (arr[i] == m);
|
|
}
|
|
#undef ACTUAL_GANGS
|
|
|
|
|
|
/* Check if worker-single variables get broadcastd to vectors. */
|
|
|
|
#pragma acc routine
|
|
float t28_routine ()
|
|
{
|
|
return 2.71;
|
|
}
|
|
|
|
#define N 32
|
|
void t28()
|
|
{
|
|
float threads[N], v1 = 3.14;
|
|
|
|
for (int i = 0; i < N; i++)
|
|
threads[i] = -1;
|
|
|
|
#pragma acc parallel num_gangs (1) vector_length (32) copy (v1)
|
|
{
|
|
float val = t28_routine ();
|
|
|
|
#pragma acc loop vector
|
|
for (int i = 0; i < N; i++)
|
|
threads[i] = val + v1*i;
|
|
}
|
|
|
|
for (int i = 0; i < N; i++)
|
|
assert (fabs (threads[i] - (t28_routine () + v1*i)) < 0.0001);
|
|
}
|
|
#undef N
|
|
|
|
|
|
int main()
|
|
{
|
|
t1();
|
|
t2();
|
|
t4();
|
|
t5();
|
|
t7();
|
|
t8();
|
|
t9();
|
|
t10();
|
|
t11();
|
|
t12();
|
|
t13();
|
|
t16();
|
|
t17();
|
|
t18();
|
|
t19();
|
|
t20();
|
|
t21();
|
|
t22();
|
|
t23();
|
|
t24();
|
|
t25();
|
|
t27();
|
|
t28();
|
|
|
|
return 0;
|
|
}
|