gcc/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c
Cesar Philippidis 31dd69b7ff Update OpenACC testcases
gcc/testsuite/
	* c-c++-common/goacc/deviceptr-4.c: New file.
	* c-c++-common/goacc/kernels-counter-var-redundant-load.c:
	Likewise.
	* c-c++-common/goacc/kernels-loop-data-2.c: Likewise.
	* c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: Likewise.
	* c-c++-common/goacc/kernels-loop-data-enter-exit.c: Likewise.
	* c-c++-common/goacc/kernels-loop-data-update.c: Likewise.
	* c-c++-common/goacc/kernels-loop-data.c: Likewise.
	* c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c:
	Likewise.
	* c-c++-common/goacc/parallel-reduction.c: Likewise.
	* c-c++-common/goacc/private-reduction-1.c: Likewise.
	* gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95:
	Likewise.
	* gfortran.dg/goacc/modules.f95: Likewise.
	* gfortran.dg/goacc/routine-8.f90: Likewise.
	* gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise.
	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Don't force "-O2".
	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Update.
	* testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: Likewise.
	* testsuite/libgomp.oacc-fortran/data-1.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/data-2.f90: Likewise.
	* testsuite/libgomp.oacc-c++/non-scalar-data.C: New file.
	* testsuite/libgomp.oacc-c-c++-common/declare-3.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/enter-data.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-2.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-3.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-4.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-local-worker-5.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-2.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-3.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-4.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-5.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-gang-6.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-vector-2.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-2.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-3.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-4.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-5.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-6.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-private-vars-loop-worker-7.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-loop-1.h: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-loop-2.h: Likewise.
	* testsuite/libgomp.oacc-fortran/cublas-fixed.h: Likewise.
	* testsuite/libgomp.oacc-fortran/dummy-array.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/host_data-2.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/host_data-3.f: Likewise.
	* testsuite/libgomp.oacc-fortran/host_data-4.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction-2.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-acc-loop-reduction.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-collapse-3.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-collapse-4.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-independent.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-loop-1.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-map-1.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-1.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-2.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-3.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-gang-6.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-1.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-vector-2.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-1.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-2.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-3.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-4.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-5.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-6.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-private-vars-loop-worker-7.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90:
	Likewise.
	* testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-13.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-14.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/lib-15.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/parallel-loop-1.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reference-reductions.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/vector-routine.f90: Likewise.

Co-Authored-By: James Norris <jnorris@codesourcery.com>
Co-Authored-By: Julian Brown <julian@codesourcery.com>
Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
Co-Authored-By: Tom de Vries <tom@codesourcery.com>

From-SVN: r261884
2018-06-22 12:04:14 +02:00

1166 lines
21 KiB
C

/* Miscellaneous test cases for gang/worker/vector mode transitions. */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <math.h>
#include <openacc.h>
/* Basic vector-partitioned mode transitions: a scalar update, a vector loop
   and another scalar update, all inside a single-gang, single-worker
   region.  */
void t1()
{
  int counter = 0, lanes[32];

  for (int idx = 0; idx < 32; idx++)
    lanes[idx] = 0;

  #pragma acc parallel copy(counter, lanes) \
          num_gangs(1) num_workers(1) vector_length(32)
  {
    /* One bump before the partitioned loop...  */
    counter++;

    #pragma acc loop vector
    for (int lane = 0; lane < 32; lane++)
      lanes[lane]++;

    /* ...and one after it.  */
    counter++;
  }

  /* Both scalar bumps must have taken effect exactly once, and every
     element must have been touched exactly once.  */
  assert (counter == 2);
  for (int idx = 0; idx < 32; idx++)
    assert (lanes[idx] == 1);
}
/* Vector-partitioned loop nested in a gang-partitioned loop, bracketed by
   two gang-partitioned loops that each bump a per-iteration counter.  */
void t2()
{
  int hits[32], cells[1024];

  for (int idx = 0; idx < 1024; idx++)
    cells[idx] = 0;
  for (int idx = 0; idx < 32; idx++)
    hits[idx] = 0;

  #pragma acc parallel copy(hits, cells) \
          num_gangs(32) num_workers(1) vector_length(32)
  {
    int row, col;

    #pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      hits[row]++;

    #pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        #pragma acc loop vector
        for (col = 0; col < 32; col++)
          cells[row * 32 + col]++;
      }

    #pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      hits[row]++;
  }

  /* Each counter was bumped by both bracketing loops; each cell once.  */
  for (int idx = 0; idx < 32; idx++)
    assert (hits[idx] == 2);
  for (int idx = 0; idx < 1024; idx++)
    assert (cells[idx] == 1);
}
/* Test conditional vector-partitioned loops.

   Inside a gang-partitioned loop, even iterations run a vector loop that
   increments their 32-element slice of ARR and odd iterations run a vector
   loop that decrements it.  Two gang-partitioned loops before and after bump
   the per-iteration counters in N, so each must end up at 2.  */
void t3()
{
  int n[32], arr[1024], i;

  for (i = 0; i < 1024; i++)
    arr[i] = 0;
  for (i = 0; i < 32; i++)
    n[i] = 0;

  #pragma acc parallel copy(n, arr) \
          num_gangs(32) num_workers(1) vector_length(32)
  {
    int j, k;

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;

    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        if ((j % 2) == 0)
          {
            #pragma acc loop vector
            for (k = 0; k < 32; k++)
              arr[j * 32 + k]++;
          }
        else
          {
            #pragma acc loop vector
            for (k = 0; k < 32; k++)
              arr[j * 32 + k]--;
          }
      }

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;
  }

  for (i = 0; i < 32; i++)
    assert (n[i] == 2);

  /* Element I belongs to slice I / 32; that slice index is even exactly when
     (I % 64) < 32.  The conditional expression is parenthesised as a whole
     because `==' binds tighter than `?:': without the outer parentheses the
     assertion argument is `(arr[i] == cond) ? 1 : -1', which is always
     non-zero and therefore can never fail.  */
  for (i = 0; i < 1024; i++)
    assert (arr[i] == (((i % 64) < 32) ? 1 : -1));
}
/* Conditional statement in the body of a vector-partitioned loop: elements
   holding an odd value are doubled, the others are left untouched.  */
void t4()
{
  int hits[32], data[1024];

  for (int pos = 0; pos < 1024; pos++)
    data[pos] = pos;
  for (int pos = 0; pos < 32; pos++)
    hits[pos] = 0;

  #pragma acc parallel copy(hits, data) \
          num_gangs(32) num_workers(1) vector_length(32)
  {
    int g, v;

    #pragma acc loop gang(static:*)
    for (g = 0; g < 32; g++)
      hits[g]++;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        #pragma acc loop vector
        for (v = 0; v < 32; v++)
          {
            if ((data[g * 32 + v] % 2) != 0)
              data[g * 32 + v] *= 2;
          }
      }

    #pragma acc loop gang(static:*)
    for (g = 0; g < 32; g++)
      hits[g]++;
  }

  for (int pos = 0; pos < 32; pos++)
    assert (hits[pos] == 2);

  for (int pos = 0; pos < 1024; pos++)
    {
      /* data[pos] started out equal to pos, so parity of pos decides.  */
      int expected = pos;
      if ((pos % 2) != 0)
        expected = pos * 2;
      assert (data[pos] == expected);
    }
}
/* Conditional statement in the body of a single loop that is both gang- and
   vector-partitioned: odd values are doubled, even values kept.  */
void t5()
{
  int hits[32], data[1024];

  for (int pos = 0; pos < 1024; pos++)
    data[pos] = pos;
  for (int pos = 0; pos < 32; pos++)
    hits[pos] = 0;

  #pragma acc parallel copy(hits, data) \
          num_gangs(32) num_workers(1) vector_length(32)
  {
    int e;

    #pragma acc loop gang(static:*)
    for (e = 0; e < 32; e++)
      hits[e]++;

    #pragma acc loop gang vector
    for (e = 0; e < 1024; e++)
      {
        if ((data[e] % 2) != 0)
          data[e] *= 2;
      }

    #pragma acc loop gang(static:*)
    for (e = 0; e < 32; e++)
      hits[e]++;
  }

  for (int pos = 0; pos < 32; pos++)
    assert (hits[pos] == 2);

  for (int pos = 0; pos < 1024; pos++)
    {
      int expected = pos;
      if ((pos % 2) != 0)
        expected = pos * 2;
      assert (data[pos] == expected);
    }
}
/* A switch statement whose arms each contain a vector-partitioned loop,
   placed inside a gang-partitioned loop.  The selector is read from an array
   that an earlier gang loop has just updated.  */
void t6()
{
  int sel[32], cells[1024];

  for (int idx = 0; idx < 1024; idx++)
    cells[idx] = 0;
  for (int idx = 0; idx < 32; idx++)
    sel[idx] = idx % 5;

  #pragma acc parallel copy(sel, cells) \
          num_gangs(32) num_workers(1) vector_length(32)
  {
    int row, col;

    /* Moves every selector from the range 0..4 into 1..5.  */
    #pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      sel[row]++;

    #pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      {
        switch (sel[row])
          {
          case 1:
            #pragma acc loop vector
            for (col = 0; col < 32; col++)
              cells[row * 32 + col] += 1;
            break;

          case 2:
            #pragma acc loop vector
            for (col = 0; col < 32; col++)
              cells[row * 32 + col] += 2;
            break;

          case 3:
            #pragma acc loop vector
            for (col = 0; col < 32; col++)
              cells[row * 32 + col] += 3;
            break;

          case 4:
            #pragma acc loop vector
            for (col = 0; col < 32; col++)
              cells[row * 32 + col] += 4;
            break;

          case 5:
            #pragma acc loop vector
            for (col = 0; col < 32; col++)
              cells[row * 32 + col] += 5;
            break;

          default:
            /* No other selector value is possible after the bump above.  */
            abort ();
          }
      }

    #pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      sel[row]++;
  }

  /* Selectors were bumped twice; each cell received its row's selector as
     it stood after the first bump.  */
  for (int idx = 0; idx < 32; idx++)
    assert (sel[idx] == (idx % 5) + 2);
  for (int idx = 0; idx < 1024; idx++)
    assert (cells[idx] == ((idx / 32) % 5) + 1);
}
/* Trivial vector-single operation: one scalar increment in a region with a
   single gang and a single worker.  */
void t7()
{
  int counter = 0;

  #pragma acc parallel copy(counter) \
          num_gangs(1) num_workers(1) vector_length(32)
  {
    counter++;
  }

  /* The increment must be observed exactly once.  */
  assert (counter == 1);
}
/* Vector-single, gang-partitioned mode: one gang loop over 1024 elements,
   repeated for every power-of-two gang count from 1 up to 1024.  */
void t8()
{
  int data[1024];

  for (int gangs = 1; gangs <= 1024; gangs *= 2)
    {
      for (int pos = 0; pos < 1024; pos++)
        data[pos] = 0;

      #pragma acc parallel copy(data) \
              num_gangs(gangs) num_workers(1) vector_length(32)
      {
        int e;

        #pragma acc loop gang
        for (e = 0; e < 1024; e++)
          data[e]++;
      }

      /* Whatever the gang count, every element is bumped exactly once.  */
      for (int pos = 0; pos < 1024; pos++)
        assert (data[pos] == 1);
    }
}
/* Test conditions in vector-single mode.

   A gang-partitioned loop adds 1 to every element whose index is a multiple
   of three and 2 to every other element.  The region is run once for each
   power-of-two gang count from 1 to 1024.  */
void t9()
{
  int arr[1024];
  int gangs;

  for (gangs = 1; gangs <= 1024; gangs <<= 1)
    {
      int i;

      for (i = 0; i < 1024; i++)
        arr[i] = 0;

      #pragma acc parallel copy(arr) \
              num_gangs(gangs) num_workers(1) vector_length(32)
      {
        int j;

        #pragma acc loop gang
        for (j = 0; j < 1024; j++)
          if ((j % 3) == 0)
            arr[j]++;
          else
            arr[j] += 2;
      }

      /* The conditional expression is parenthesised as a whole because `=='
         binds tighter than `?:': without the outer parentheses the assertion
         argument is `(arr[i] == cond) ? 1 : 2', which is always non-zero and
         therefore can never fail.  */
      for (i = 0; i < 1024; i++)
        assert (arr[i] == (((i % 3) == 0) ? 1 : 2));
    }
}
/* Switch statement in vector-single mode: a gang-partitioned loop adds
   (index % 5) + 1 to each element through a five-way switch.  Repeated for
   every power-of-two gang count from 1 up to 1024.  */
void t10()
{
  int data[1024];

  for (int gangs = 1; gangs <= 1024; gangs *= 2)
    {
      for (int pos = 0; pos < 1024; pos++)
        data[pos] = 0;

      #pragma acc parallel copy(data) \
              num_gangs(gangs) num_workers(1) vector_length(32)
      {
        int e;

        #pragma acc loop gang
        for (e = 0; e < 1024; e++)
          {
            switch (e % 5)
              {
              case 0:
                data[e] += 1;
                break;
              case 1:
                data[e] += 2;
                break;
              case 2:
                data[e] += 3;
                break;
              case 3:
                data[e] += 4;
                break;
              case 4:
                data[e] += 5;
                break;
              default:
                /* Not reachable for a non-negative index; a stray 99 would
                   trip the check below.  */
                data[e] += 99;
              }
          }
      }

      for (int pos = 0; pos < 1024; pos++)
        assert (data[pos] == (pos % 5) + 1);
    }
}
/* Switch statement in vector-single mode where the array is reset inside the
   compute region rather than before it.  The host fills the array with 99 so
   that a skipped reset is detected.  */
void t11()
{
  int data[1024];

  for (int pos = 0; pos < 1024; pos++)
    data[pos] = 99;

  #pragma acc parallel copy(data) \
          num_gangs(1024) num_workers(1) vector_length(32)
  {
    int e;

    /* Both loops carry a "static" gang clause so that they are split across
       the gangs identically; the second loop reads what the first one wrote,
       so a given element has to be handled by the same gang in both.  */
    #pragma acc loop gang(static:*)
    for (e = 0; e < 1024; e++)
      data[e] = 0;

    #pragma acc loop gang(static:*)
    for (e = 0; e < 1024; e++)
      {
        switch (e % 5)
          {
          case 0:
            data[e] += 1;
            break;
          case 1:
            data[e] += 2;
            break;
          case 2:
            data[e] += 3;
            break;
          case 3:
            data[e] += 4;
            break;
          case 4:
            data[e] += 5;
            break;
          default:
            data[e] += 99;
          }
      }
  }

  for (int pos = 0; pos < 1024; pos++)
    assert (data[pos] == (pos % 5) + 1);
}
/* Multiple, nested conditions in vector-single mode: a fizz-buzz style
   classification of every index below NUM_GANGS into three flag arrays.  */
#define NUM_GANGS 4096
void t12()
{
  bool fizz[NUM_GANGS], buzz[NUM_GANGS], fizzbuzz[NUM_GANGS];

  #pragma acc parallel copyout(fizz, buzz, fizzbuzz) \
          num_gangs(NUM_GANGS) num_workers(1) vector_length(32)
  {
    int e;

    /* The clearing loop and the classifying loop both use a "static" gang
       clause so that they are distributed over the gangs in the same way;
       the second loop depends on the flags cleared by the first.  */
    #pragma acc loop gang(static:*)
    for (e = 0; e < NUM_GANGS; e++)
      {
        fizzbuzz[e] = 0;
        buzz[e] = 0;
        fizz[e] = 0;
      }

    #pragma acc loop gang(static:*)
    for (e = 0; e < NUM_GANGS; e++)
      {
        if ((e % 3) == 0 && (e % 5) == 0)
          fizzbuzz[e] = 1;
        else if ((e % 3) == 0)
          fizz[e] = 1;
        else if ((e % 5) == 0)
          buzz[e] = 1;
      }
  }

  for (int pos = 0; pos < NUM_GANGS; pos++)
    {
      int by3 = (pos % 3) == 0;
      int by5 = (pos % 5) == 0;

      /* Exactly one flag is set for multiples of 3 and/or 5, none otherwise.  */
      assert (fizzbuzz[pos] == (by3 && by5));
      assert (fizz[pos] == (by3 && !by5));
      assert (buzz[pos] == (!by3 && by5));
    }
}
#undef NUM_GANGS
/* Worker-partitioned, vector-single mode: a worker loop nested in a gang
   loop adds each element's own flat index to it.  */
void t13()
{
  int data[32 * 8];

  for (int pos = 0; pos < 32 * 8; pos++)
    data[pos] = 0;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        int w;

        #pragma acc loop worker
        for (w = 0; w < 8; w++)
          data[g * 8 + w] += g * 8 + w;
      }
  }

  /* Starting from zero, each element now equals its index.  */
  for (int pos = 0; pos < 32 * 8; pos++)
    assert (data[pos] == pos);
}
/* Condition in worker-partitioned mode: each worker iteration picks one of
   two vector loops depending on the parity of its index.  Even iterations
   add 1 to their 32-element slice, odd iterations add 2.  */
void t14()
{
  int data[32 * 32 * 8];

  for (int pos = 0; pos < 32 * 32 * 8; pos++)
    data[pos] = pos;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        int w;

        #pragma acc loop worker
        for (w = 0; w < 8; w++)
          {
            int v;

            if ((w % 2) != 0)
              {
                #pragma acc loop vector
                for (v = 0; v < 32; v++)
                  data[g * 32 * 8 + w * 32 + v] += 2;
              }
            else
              {
                #pragma acc loop vector
                for (v = 0; v < 32; v++)
                  data[g * 32 * 8 + w * 32 + v]++;
              }
          }
      }
  }

  /* pos / 32 is the flat slice number g * 8 + w, whose parity equals that
     of w.  */
  for (int pos = 0; pos < 32 * 32 * 8; pos++)
    {
      int bump = ((pos / 32) % 2) + 1;
      assert (data[pos] == pos + bump);
    }
}
/* Switch in worker-partitioned mode: each worker iteration selects one of
   three vector loops, which add 1, 2 or 3 to its 32-element slice.  */
void t15()
{
  int data[32 * 32 * 8];

  for (int pos = 0; pos < 32 * 32 * 8; pos++)
    data[pos] = pos;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        int w;

        #pragma acc loop worker
        for (w = 0; w < 8; w++)
          {
            int v;

            switch ((g * 32 + w) % 3)
              {
              case 0:
                #pragma acc loop vector
                for (v = 0; v < 32; v++)
                  data[g * 32 * 8 + w * 32 + v]++;
                break;

              case 1:
                #pragma acc loop vector
                for (v = 0; v < 32; v++)
                  data[g * 32 * 8 + w * 32 + v] += 2;
                break;

              case 2:
                #pragma acc loop vector
                for (v = 0; v < 32; v++)
                  data[g * 32 * 8 + w * 32 + v] += 3;
                break;

              default:
                ;
              }
          }
      }
  }

  /* pos / 32 is g * 8 + w, which is congruent to the selector g * 32 + w
     modulo 3 because 8 and 32 leave the same remainder.  */
  for (int pos = 0; pos < 32 * 32 * 8; pos++)
    {
      int bump = ((pos / 32) % 3) + 1;
      assert (data[pos] == pos + bump);
    }
}
/* Transitions between worker-single and worker-partitioned code: inside a
   gang loop, four scalar counter bumps alternate with three worker loops.  */
void t16()
{
  int hits[32], cells[32 * 32];

  for (int idx = 0; idx < 32 * 32; idx++)
    cells[idx] = 0;
  for (int idx = 0; idx < 32; idx++)
    hits[idx] = 0;

  #pragma acc parallel copy(hits, cells) \
          num_gangs(8) num_workers(16) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        int w;

        hits[g]++;

        #pragma acc loop worker
        for (w = 0; w < 32; w++)
          cells[g * 32 + w]++;

        hits[g]++;

        #pragma acc loop worker
        for (w = 0; w < 32; w++)
          cells[g * 32 + w]++;

        hits[g]++;

        #pragma acc loop worker
        for (w = 0; w < 32; w++)
          cells[g * 32 + w]++;

        hits[g]++;
      }
  }

  /* Four scalar bumps per gang iteration, three loop bumps per cell.  */
  for (int idx = 0; idx < 32; idx++)
    assert (hits[idx] == 4);
  for (int idx = 0; idx < 32 * 32; idx++)
    assert (cells[idx] == 3);
}
/* Synchronisation between consecutive worker-partitioned loops.  Each loop
   reads the array the previous one wrote, at the mirrored position within a
   32-element row, so a loop must not start before its predecessor is
   complete.  Run for all power-of-two gang and worker counts up to 32.  */
void t17()
{
  int src[32 * 32], dst[32 * 32];

  for (int workers = 1; workers <= 32; workers *= 2)
    {
      for (int gangs = 1; gangs <= 32; gangs *= 2)
        {
          for (int pos = 0; pos < 32 * 32; pos++)
            src[pos] = pos;

          #pragma acc parallel copyin(src) copyout(dst) \
                  num_gangs(gangs) num_workers(workers) vector_length(32)
          {
            int g;

            #pragma acc loop gang
            for (g = 0; g < 32; g++)
              {
                int w;

                #pragma acc loop worker
                for (w = 0; w < 32; w++)
                  dst[g * 32 + (31 - w)] = src[g * 32 + w] * 2;

                #pragma acc loop worker
                for (w = 0; w < 32; w++)
                  src[g * 32 + (31 - w)] = dst[g * 32 + w] * 2;

                #pragma acc loop worker
                for (w = 0; w < 32; w++)
                  dst[g * 32 + (31 - w)] = src[g * 32 + w] * 2;
              }
          }

          /* Three doublings give a factor of 8; an odd number of mirrorings
             leaves each row reversed, i.e. the low five index bits
             flipped.  */
          for (int pos = 0; pos < 32 * 32; pos++)
            assert (dst[pos] == (pos ^ 31) * 8);
        }
    }
}
/* Synchronisation between consecutive loops that are partitioned over both
   workers and vectors.  Each loop reads what the previous one wrote, at the
   mirrored position within a 1024-element block.  Run for all power-of-two
   gang and worker counts up to 32.  */
void t18()
{
  int src[32 * 32 * 32], dst[32 * 32 * 32];

  for (int workers = 1; workers <= 32; workers *= 2)
    {
      for (int gangs = 1; gangs <= 32; gangs *= 2)
        {
          for (int pos = 0; pos < 32 * 32 * 32; pos++)
            src[pos] = pos;

          #pragma acc parallel copyin(src) copyout(dst) \
                  num_gangs(gangs) num_workers(workers) vector_length(32)
          {
            int g;

            #pragma acc loop gang
            for (g = 0; g < 32; g++)
              {
                int e;

                #pragma acc loop worker vector
                for (e = 0; e < 32 * 32; e++)
                  dst[g * 32 * 32 + (1023 - e)] = src[g * 32 * 32 + e] * 2;

                #pragma acc loop worker vector
                for (e = 0; e < 32 * 32; e++)
                  src[g * 32 * 32 + (1023 - e)] = dst[g * 32 * 32 + e] * 2;

                #pragma acc loop worker vector
                for (e = 0; e < 32 * 32; e++)
                  dst[g * 32 * 32 + (1023 - e)] = src[g * 32 * 32 + e] * 2;
              }
          }

          /* Three doublings give a factor of 8; an odd number of mirrorings
             leaves each block reversed, i.e. the low ten index bits
             flipped.  */
          for (int pos = 0; pos < 32 * 32 * 32; pos++)
            assert (dst[pos] == (pos ^ 1023) * 8);
        }
    }
}
/* Test correct synchronisation between vector-partitioned loops in
worker-partitioned mode.

Inside a gang loop and a worker loop, three vector loops run over one
32-element slice.  Each loop reads the array the previous loop wrote and
stores to the mirrored position (31 - m) of the other array, multiplying by a
factor chosen by a condition on the flat source index.  Two scalar increments
of n[j * 32 + k] are interleaved with the vector loops.  The whole region is
repeated for every power-of-two num_workers and num_gangs from 1 to 32.  */
void t19()
{
int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
int num_workers, num_gangs;
for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
{
/* Reset the inputs for every configuration: arr_a is overwritten by the
second vector loop below, and n accumulates.  */
for (i = 0; i < 32 * 32 * 32; i++)
arr_a[i] = i;
for (i = 0; i < 32 * 32; i++)
n[i] = 0;
#pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
{
int j;
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
int k;
#pragma acc loop worker
for (k = 0; k < 32; k++)
{
int m;
/* First scalar increment, before any vector loop.  */
n[j * 32 + k]++;
/* Loop 1: arr_a -> arr_b, mirrored; factor 2 for an even source index,
3 for an odd one.  */
#pragma acc loop vector
for (m = 0; m < 32; m++)
{
if (((j * 1024 + k * 32 + m) % 2) == 0)
arr_b[j * 1024 + k * 32 + (31 - m)]
= arr_a[j * 1024 + k * 32 + m] * 2;
else
arr_b[j * 1024 + k * 32 + (31 - m)]
= arr_a[j * 1024 + k * 32 + m] * 3;
}
/* Test returning to vector-single mode... */
n[j * 32 + k]++;
/* Loop 2: arr_b -> arr_a, mirrored; factor 5 when the source index is a
multiple of three, 7 otherwise.  */
#pragma acc loop vector
for (m = 0; m < 32; m++)
{
if (((j * 1024 + k * 32 + m) % 3) == 0)
arr_a[j * 1024 + k * 32 + (31 - m)]
= arr_b[j * 1024 + k * 32 + m] * 5;
else
arr_a[j * 1024 + k * 32 + (31 - m)]
= arr_b[j * 1024 + k * 32 + m] * 7;
}
/* ...and back-to-back vector loops. */
/* Loop 3: arr_a -> arr_b, mirrored; the parity factors are swapped relative
to loop 1 (3 for even, 2 for odd), so loops 1 and 3 together always
contribute 2 * 3 = 6.  */
#pragma acc loop vector
for (m = 0; m < 32; m++)
{
if (((j * 1024 + k * 32 + m) % 2) == 0)
arr_b[j * 1024 + k * 32 + (31 - m)]
= arr_a[j * 1024 + k * 32 + m] * 3;
else
arr_b[j * 1024 + k * 32 + (31 - m)]
= arr_a[j * 1024 + k * 32 + m] * 2;
}
}
}
}
/* Each (j, k) counter was incremented exactly twice.  */
for (i = 0; i < 32 * 32; i++)
assert (n[i] == 2);
/* After three mirrorings, arr_b[i] derives from the original arr_a value at
the mirrored index i ^ 31.  Loop 2 read from index i on the way, so i % 3
selects the 5-or-7 factor; the remaining factor is the constant 6.  */
for (i = 0; i < 32 * 32 * 32; i++)
{
int m = 6 * ((i % 3) == 0 ? 5 : 7);
assert (arr_b[i] == (i ^ 31) * m);
}
}
}
/* With -O0, variables are on the stack, not in registers. Check that worker
state propagation handles the stack frame.

"internal" is set once inside the parallel region but outside the worker
loop; each of the eight worker-loop iterations then copies it into its own
result variable w0..w7.  Every result must therefore end up as 100.  */
void t20()
{
int w0 = 0;
int w1 = 0;
int w2 = 0;
int w3 = 0;
int w4 = 0;
int w5 = 0;
int w6 = 0;
int w7 = 0;
/* The loop index is declared here, outside the parallel region.  */
int i;
#pragma acc parallel copy (w0, w1, w2, w3, w4, w5, w6, w7) \
num_gangs (1) num_workers (8)
{
/* Assigned before the worker loop starts; read by every iteration.  */
int internal = 100;
#pragma acc loop worker
for (i = 0; i < 8; i++)
{
/* Iteration i stores into result variable w<i>.  */
switch (i)
{
case 0: w0 = internal; break;
case 1: w1 = internal; break;
case 2: w2 = internal; break;
case 3: w3 = internal; break;
case 4: w4 = internal; break;
case 5: w5 = internal; break;
case 6: w6 = internal; break;
case 7: w7 = internal; break;
default: break;
}
}
}
/* Unlike the other tests in this file, failure is reported with
__builtin_abort rather than assert.  */
if (w0 != 100
|| w1 != 100
|| w2 != 100
|| w3 != 100
|| w4 != 100
|| w5 != 100
|| w6 != 100
|| w7 != 100)
__builtin_abort ();
}
/* Worker-single, vector-single mode: a plain gang loop in a region that was
   launched with several workers and a full vector.  */
void t21()
{
  int data[32];

  for (int pos = 0; pos < 32; pos++)
    data[pos] = 0;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      data[g]++;
  }

  /* Each element must have been bumped exactly once.  */
  for (int pos = 0; pos < 32; pos++)
    assert (data[pos] == 1);
}
/* Worker-single, vector-single mode, this time with the update performed
   through an atomic construct.  */
void t22()
{
  int data[32];

  for (int pos = 0; pos < 32; pos++)
    data[pos] = 0;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        #pragma acc atomic
        data[g]++;
      }
  }

  /* Each element must have been bumped exactly once.  */
  for (int pos = 0; pos < 32; pos++)
    assert (data[pos] == 1);
}
/* Test condition in worker-single/vector-single mode.

   ARR starts out as the identity (arr[i] == i).  A gang-partitioned loop
   adds 1 to every element holding an odd value and 2 to every element
   holding an even value.  */
void t23()
{
  int arr[32], i;

  for (i = 0; i < 32; i++)
    arr[i] = i;

  #pragma acc parallel copy(arr) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int j;

    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      if ((arr[j] % 2) != 0)
        arr[j]++;
      else
        arr[j] += 2;
  }

  /* The conditional expression is parenthesised as a whole because `=='
     binds tighter than `?:': without the outer parentheses the assertion
     argument is `(arr[i] == cond) ? i + 1 : i + 2', which is non-zero for
     every I in range and therefore can never fail.  */
  for (i = 0; i < 32; i++)
    assert (arr[i] == (((i % 2) != 0) ? i + 1 : i + 2));
}
/* Switch in worker-single, vector-single mode: a gang loop adds
   (value % 5) + 1 to each element through a five-way switch on the value
   currently stored there.  */
void t24()
{
  int data[32];

  for (int pos = 0; pos < 32; pos++)
    data[pos] = pos;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        switch (data[g] % 5)
          {
          case 0:
            data[g] += 1;
            break;
          case 1:
            data[g] += 2;
            break;
          case 2:
            data[g] += 3;
            break;
          case 3:
            data[g] += 4;
            break;
          case 4:
            data[g] += 5;
            break;
          default:
            data[g] += 99;
          }
      }
  }

  /* Elements started as the identity, so the selector was pos % 5.  */
  for (int pos = 0; pos < 32; pos++)
    assert (data[pos] == pos + (pos % 5) + 1);
}
/* Worker-single, vector-partitioned mode: a vector loop nested directly in a
   gang loop, with no worker loop in between, in a region launched with
   eight workers.  The update goes through an atomic construct.  */
void t25()
{
  int data[32 * 32];

  for (int pos = 0; pos < 32 * 32; pos++)
    data[pos] = pos;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        int v;

        #pragma acc loop vector
        for (v = 0; v < 32; v++)
          {
            #pragma acc atomic
            data[g * 32 + v]++;
          }
      }
  }

  /* Every element must have been bumped exactly once.  */
  for (int pos = 0; pos < 32 * 32; pos++)
    assert (data[pos] == pos + 1);
}
/* Several conditionally executed vector-partitioned loops in worker-single
   mode.  Depending on the gang index modulo three, a 32-element slice gets
   3 added, 7 added, or is left alone.  */
void t26()
{
  int data[32 * 32];

  for (int pos = 0; pos < 32 * 32; pos++)
    data[pos] = pos;

  #pragma acc parallel copy(data) \
          num_gangs(8) num_workers(8) vector_length(32)
  {
    int g;

    #pragma acc loop gang
    for (g = 0; g < 32; g++)
      {
        int v;

        if ((g % 3) == 0)
          {
            #pragma acc loop vector
            for (v = 0; v < 32; v++)
              {
                #pragma acc atomic
                data[g * 32 + v] += 3;
              }
          }
        else if ((g % 3) == 1)
          {
            #pragma acc loop vector
            for (v = 0; v < 32; v++)
              {
                #pragma acc atomic
                data[g * 32 + v] += 7;
              }
          }
        /* Remainder 2: no vector loop runs for this slice.  */
      }
  }

  for (int pos = 0; pos < 32 * 32; pos++)
    {
      int which = (pos / 32) % 3;
      int bump = 0;

      if (which == 0)
        bump = 3;
      else if (which == 1)
        bump = 7;

      assert (data[pos] == pos + bump);
    }
}
/* Worker-single, vector-partitioned, gang-redundant mode: none of the code in
   the region is gang-partitioned.  The expected totals below are scaled by
   the gang count when the region reports that it ran on a non-host device,
   and by one otherwise.  */
#define ACTUAL_GANGS 8
void t27()
{
  int total = 0, lanes[32];
  int ondev;

  for (int idx = 0; idx < 32; idx++)
    lanes[idx] = 0;

  #pragma acc parallel copy(total, lanes) copyout(ondev) \
          num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32)
  {
    int v;

    /* Record where the region ran so the host can pick the expected
       values.  */
    ondev = acc_on_device (acc_device_not_host);

    #pragma acc atomic
    total++;

    #pragma acc loop vector
    for (v = 0; v < 32; v++)
      {
        #pragma acc atomic
        lanes[v] += 1;
      }

    #pragma acc atomic
    total++;
  }

  int scale;
  if (ondev)
    scale = ACTUAL_GANGS;
  else
    scale = 1;

  /* Two scalar bumps and one bump per lane, each multiplied by SCALE.  */
  assert (total == scale * 2);
  for (int idx = 0; idx < 32; idx++)
    assert (lanes[idx] == scale);
}
#undef ACTUAL_GANGS
/* Check that worker-single variables get broadcast to vectors.  */

/* Helper callable from the compute region; returns a fixed value.  */
#pragma acc routine
float t28_routine ()
{
  return 2.71;
}

#define N 32
/* Computes a scalar once outside the vector loop, then uses it in every
   iteration of a vector loop together with a copied-in scalar.  */
void t28()
{
  float out[N], slope = 3.14;

  /* Pre-fill with a value the region never produces.  */
  for (int idx = 0; idx < N; idx++)
    out[idx] = -1;

  #pragma acc parallel num_gangs (1) vector_length (32) copy (slope)
  {
    /* Assigned before the vector loop; read by every iteration of it.  */
    float val = t28_routine ();

    #pragma acc loop vector
    for (int lane = 0; lane < N; lane++)
      out[lane] = val + slope*lane;
  }

  /* Compare against the same expression evaluated on the host, within a
     small tolerance.  */
  for (int idx = 0; idx < N; idx++)
    {
      double err = fabs (out[idx] - (t28_routine () + slope*idx));
      assert (err < 0.0001);
    }
}
#undef N
/* Run every test case in order.  The tests report failure by aborting, so
   reaching the return statement means all of them passed.  */
int main()
{
  /* Vector-partitioned loops under gang partitioning.  */
  t1 ();
  t2 ();
  t3 ();
  t4 ();
  t5 ();
  t6 ();

  /* Vector-single mode.  */
  t7 ();
  t8 ();
  t9 ();
  t10 ();
  t11 ();
  t12 ();

  /* Worker-partitioned mode and synchronisation between loops.  */
  t13 ();
  t14 ();
  t15 ();
  t16 ();
  t17 ();
  t18 ();
  t19 ();
  t20 ();

  /* Worker-single mode.  */
  t21 ();
  t22 ();
  t23 ();
  t24 ();
  t25 ();
  t26 ();
  t27 ();

  /* Broadcast of worker-single values to vector lanes.  */
  t28 ();

  return 0;
}