d43fbc7d3f
When running libgomp test-case broadcast-many.c on an nvptx accelerator
(T400, driver version 470.86), I run into:
...
libgomp: The Nvidia accelerator has insufficient resources to launch \
'main$_omp_fn$0' with num_workers = 32 and vector_length = 32; \
recompile the program with 'num_workers = x and vector_length = y' on \
that offloaded region or '-fopenacc-dim=:x:y' where x * y <= 896.
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/broadcast-many.c \
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \
-O0 execution test
...
The error does not occur when using GOMP_NVPTX_JIT=-O0.
Fix this by using 896 / 32 == 28 workers for ACC_DEVICE_TYPE_nvidia.
Likewise for some other test-cases.
Tested libgomp on x86_64 with nvptx accelerator.
libgomp/ChangeLog:
2022-01-27 Tom de Vries <tdevries@suse.de>
* testsuite/libgomp.oacc-c-c++-common/broadcast-many.c: Reduce
num_workers for nvidia accelerator to fix libgomp error 'insufficient
resources'.
* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c:
Same.
* testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Same.
495 lines
11 KiB
C
495 lines
11 KiB
C
/* Tests of reduction on loop directive. */
|
|
|
|
/* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
|
|
aspects of that functionality. */
|
|
|
|
#include <assert.h>
|
|
|
|
|
|
/* Test of reduction on loop directive (gangs, non-private reduction
   variable).  Exercises a '+' reduction over 1024 elements and a '*'
   reduction over 12 elements, each checked against a sequential host
   computation of the same value.  */

void g_np_1()
{
  int i, arr[1024], res = 0, hres = 0;

  for (i = 0; i < 1024; i++)
    arr[i] = i;

  /* Sum reduction on a gang-partitioned loop.  */
  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang reduction(+:res)
    for (i = 0; i < 1024; i++)
      res += arr[i];
  }

  /* Host reference for the sum.  */
  for (i = 0; i < 1024; i++)
    hres += arr[i];

  assert (res == hres);

  /* Multiplicative identity for the product reduction below.  */
  res = hres = 1;

  /* Product reduction on a gang-partitioned loop.  The factors are
     arr[1] .. arr[12] rather than arr[0] .. arr[11]: arr[0] is 0, which
     would force both products to 0 and make the comparison pass no matter
     how the partial products are combined.  The result is 12!
     (479001600), which still fits in a 32-bit int.  */
  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang reduction(*:res)
    for (i = 1; i <= 12; i++)
      res *= arr[i];
  }

  /* Host reference for the product, over the same index range.  */
  for (i = 1; i <= 12; i++)
    hres *= arr[i];

  assert (res == hres);
}
|
|
|
|
|
|
/* Test of reduction on loop directive (gangs and vectors, non-private
   reduction variable).  A single gang+vector partitioned loop sums 1024
   integers into 'res'; the host recomputes the sum for comparison.  */

void gv_np_1()
{
  int arr[1024];
  int res = 0;
  int hres = 0;
  int i;

  /* Input data: arr[n] == n.  */
  i = 0;
  while (i < 1024)
    {
      arr[i] = i;
      i++;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
  {
    #pragma acc loop gang vector reduction(+:res)
    for (i = 0; i < 1024; i++)
      res += arr[i];
  }

  /* Sequential host reference.  */
  i = 0;
  while (i < 1024)
    {
      hres += arr[i];
      i++;
    }

  assert (res == hres);
}
|
|
|
|
|
|
/* Test of reduction on loop directive (gangs and workers, non-private
   reduction variable).  A single gang+worker partitioned loop sums 1024
   integers into 'res'; the host recomputes the sum for comparison.  */

void gw_np_1()
{
  int arr[1024];
  int res = 0;
  int hres = 0;
  int i;

  for (i = 0; i < 1024; i++)
    arr[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-1 } */
  {
    #pragma acc loop gang worker reduction(+:res)
    for (i = 0; i < 1024; i++)
      res += arr[i];
  }

  /* Host reference.  Integer addition is order-independent here, so the
     array is simply walked from the top down.  */
  for (i = 1024; i-- > 0; )
    hres += arr[i];

  assert (res == hres);
}
|
|
|
|
|
|
/* Test of reduction on loop directive (gangs, workers and vectors, non-private
   reduction variable).  One loop partitioned over all three levels sums 1024
   integers into 'res'; the host recomputes the sum for comparison.  */

void gwv_np_1()
{
  int arr[1024];
  int res = 0;
  int hres = 0;
  int i;

  for (i = 0; i < 1024; i++)
    {
      arr[i] = i;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  {
    #pragma acc loop gang worker vector reduction(+:res)
    for (i = 0; i < 1024; i++)
      res += arr[i];
  }

  /* Sequential host reference.  */
  for (i = 0; i < 1024; i++)
    {
      hres = hres + arr[i];
    }

  assert (res == hres);
}
|
|
|
|
|
|
/* Test of reduction on loop directive (gangs, workers and vectors, non-private
   reduction variable: separate gang and worker/vector loops).  The outer
   gang loop and the inner worker/vector loop both reduce into the same
   integer 'res'.  */

void gwv_np_2()
{
  int arr[32768];
  int res = 0;
  int hres = 0;
  int i, j;

  i = 0;
  while (i < 32768)
    {
      arr[i] = i;
      i++;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  {
    #pragma acc loop gang reduction(+:res)
    for (j = 0; j < 32; j++)
      {
        #pragma acc loop worker vector reduction(+:res)
        for (i = 0; i < 1024; i++)
          res += arr[j * 1024 + i];
      }
    /* "res" is non-private, and is not available until after the parallel
       region.  */
  }

  /* Host reference, walked in the same 32 x 1024 tiling as the loops
     above.  */
  for (j = 0; j < 32; j++)
    for (i = 0; i < 1024; i++)
      hres += arr[j * 1024 + i];

  assert (res == hres);
}
|
|
|
|
|
|
/* Test of reduction on loop directive (gangs, workers and vectors, non-private
   reduction variable: separate gang and worker/vector loops).  Same loop
   nest as the integer variant, but with 'double' data and an explicit
   copyin of the input array.  */

void gwv_np_3()
{
  double arr[32768];
  double res = 0;
  double hres = 0;
  int i, j;

  i = 0;
  while (i < 32768)
    {
      arr[i] = i;
      i++;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       copyin(arr)
  {
    #pragma acc loop gang reduction(+:res)
    for (j = 0; j < 32; j++)
      {
        #pragma acc loop worker vector reduction(+:res)
        for (i = 0; i < 1024; i++)
          res += arr[j * 1024 + i];
      }
  }

  /* Sequential host reference, accumulated in ascending index order.  */
  i = 0;
  while (i < 32768)
    {
      hres += arr[i];
      i++;
    }

  assert (res == hres);
}
|
|
|
|
#if !ACC_DEVICE_TYPE_nvidia
#define NUM_WORKERS 32
#else
/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'.  */
#define NUM_WORKERS 28
#endif

/* Test of reduction on loop directive (gangs, workers and vectors, multiple
   non-private reduction variables, float type).  A '+' reduction and a 'max'
   reduction are carried together through an outer gang loop and two inner
   worker/vector loops; the second inner loop visits each row in reverse.  */

void gwv_np_4()
{
  float arr[32768];
  float res = 0, mres = 0;
  float hres = 0, hmres = 0;
  int i, j;

  /* Values cycle through 0 .. 511.  */
  for (i = 0; i < 32768; i++)
    arr[i] = i % (32768 / 64);

  #pragma acc parallel num_gangs(32) num_workers(NUM_WORKERS) vector_length(32)
  {
    #pragma acc loop gang reduction(+:res) reduction(max:mres)
    for (j = 0; j < 32; j++)
      {
        #pragma acc loop worker vector reduction(+:res) reduction(max:mres)
        for (i = 0; i < 1024; i++)
          {
            res += arr[j * 1024 + i];
            if (arr[j * 1024 + i] > mres)
              mres = arr[j * 1024 + i];
          }

        #pragma acc loop worker vector reduction(+:res) reduction(max:mres)
        for (i = 0; i < 1024; i++)
          {
            res += arr[j * 1024 + (1023 - i)];
            if (arr[j * 1024 + (1023 - i)] > mres)
              mres = arr[j * 1024 + (1023 - i)];
          }
      }
  }

  /* Host reference: every element contributes once in forward order and once
     in reverse order, matching the two inner loops above.  */
  for (j = 0; j < 32; j++)
    for (i = 0; i < 1024; i++)
      {
        float fwd = arr[j * 1024 + i];
        float rev = arr[j * 1024 + (1023 - i)];

        hres += fwd;
        hres += rev;

        if (fwd > hmres)
          hmres = fwd;
        if (rev > hmres)
          hmres = rev;
      }

  /* 16777216 is 2^24; presumably these bounds guard that the values stay in
     the range where float holds integers exactly, so that the '=='
     comparisons below are meaningful.  */
  assert (hres <= 16777216);
  assert (res == hres);

  assert (hmres <= 16777216);
  assert (mres == hmres);
}

#undef NUM_WORKERS
|
|
|
|
/* Test of reduction on loop directive (vectors, private reduction
   variable).  Each iteration of the gang loop sums one 32-element row of
   'arr' with a vector reduction into the private 'res' and stores the result
   in out[j].  */

void v_p_1()
{
  int arr[1024], out[32];
  int res = 0, hres = 0;
  int i, j;

  i = 0;
  while (i < 1024)
    {
      arr[i] = i;
      i++;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(res) copyout(out)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        res = 0;

        #pragma acc loop vector reduction(+:res)
        for (i = 0; i < 32; i++)
          res += arr[j * 32 + i];

        out[j] = res;
      }
  }

  /* Verify every row sum against a sequential host computation.  */
  for (j = 0; j < 32; j++)
    {
      const int *row = arr + j * 32;

      hres = 0;
      for (i = 0; i < 32; i++)
        hres += row[i];

      assert (out[j] == hres);
    }
}
|
|
|
|
|
|
/* Test of reduction on loop directive (vector reduction in
   gang-partitioned/worker-partitioned mode, private reduction variable).
   Treating the arrays as row-major 32 x 32 matrices, this computes
   out = ina * inb, with the inner dot product done as a vector reduction
   into the private 'acc'.  */

void v_p_2()
{
  double ina[1024], inb[1024], out[1024];
  double acc;
  int i, j, k;

  /* ina holds 2.0 on the diagonal and 0.0 elsewhere; inb[j][i] is i + j.  */
  for (j = 0; j < 32; j++)
    for (i = 0; i < 32; i++)
      {
        int idx = j * 32 + i;

        if (i == j)
          ina[idx] = 2.0;
        else
          ina[idx] = 0.0;

        inb[idx] = (double) (i + j);
      }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(acc) copyin(ina, inb) copyout(out)
  {
    #pragma acc loop gang worker
    for (k = 0; k < 32; k++)
      for (j = 0; j < 32; j++)
        {
          acc = 0;

          #pragma acc loop vector reduction(+:acc)
          for (i = 0; i < 32; i++)
            acc += ina[k * 32 + i] * inb[i * 32 + j];

          out[k * 32 + j] = acc;
        }
  }

  /* With ina being twice the identity, each output element must be twice the
     corresponding inb element.  */
  for (j = 0; j < 32; j++)
    {
      for (i = 0; i < 32; i++)
        {
          assert (out[j * 32 + i] == (i + j) * 2);
        }
    }
}
|
|
|
|
|
|
/* Test of reduction on loop directive (workers, private reduction
   variable).  Each iteration of the gang loop sums one 32-element row of
   'arr' with a worker reduction into the private 'res' and stores the result
   in out[j].  */

void w_p_1()
{
  int arr[1024], out[32];
  int res = 0, hres = 0;
  int i, j;

  for (i = 0; i < 1024; i++)
    {
      arr[i] = i;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(res) copyout(out)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        res = 0;

        #pragma acc loop worker reduction(+:res)
        for (i = 0; i < 32; i++)
          res += arr[j * 32 + i];

        out[j] = res;
      }
  }

  /* Verify every row sum against a sequential host computation.  */
  j = 0;
  while (j < 32)
    {
      hres = 0;
      for (i = 0; i < 32; i++)
        hres += arr[j * 32 + i];

      assert (out[j] == hres);
      j++;
    }
}
|
|
|
|
|
|
/* Test of reduction on loop directive (workers and vectors, private reduction
   variable).  Each iteration of the gang loop sums one 32-element row of
   'arr' with a combined worker/vector reduction into the private 'res' and
   stores the result in out[j].  */

void wv_p_1()
{
  int arr[1024], out[32];
  int res = 0, hres = 0;
  int i, j;

  for (i = 0; i < 1024; i++)
    arr[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(res) copyout(out)
  {
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        res = 0;

        #pragma acc loop worker vector reduction(+:res)
        for (i = 0; i < 32; i++)
          res += arr[j * 32 + i];

        out[j] = res;
      }
  }

  /* Verify every row sum against a sequential host computation.  */
  for (j = 0; j < 32; j++)
    {
      int base = j * 32;

      hres = 0;
      for (i = 0; i < 32; i++)
        hres += arr[base + i];

      assert (out[j] == hres);
    }
}
|
|
|
|
|
|
/* Test of reduction on loop directive (workers and vectors, private reduction
   variable).  Within each gang-loop iteration the private 'res' starts at j
   and is then accumulated by two consecutive reductions over the same
   1024-element row: first a worker loop counting up, then a vector loop
   counting down.  */

void wv_p_2()
{
  int arr[32768], out[32];
  int res = 0, hres = 0;
  int i, j;

  i = 0;
  while (i < 32768)
    {
      arr[i] = i;
      i++;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(res) copyout(out)
  {
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        res = j;

        #pragma acc loop worker reduction(+:res)
        for (i = 0; i < 1024; i++)
          res += arr[j * 1024 + i];

        #pragma acc loop vector reduction(+:res)
        for (i = 1023; i >= 0; i--)
          res += arr[j * 1024 + i];

        out[j] = res;
      }
  }

  /* Host reference: the row is added twice on the device, so each element
     counts double here.  */
  for (j = 0; j < 32; j++)
    {
      const int *row = arr + j * 1024;

      hres = j;
      for (i = 0; i < 1024; i++)
        hres += row[i] * 2;

      assert (out[j] == hres);
    }
}
|
|
|
|
|
|
/* Test of reduction on loop directive (workers and vectors, private reduction
   variable: gang-redundant mode).  The worker/vector reduction is not nested
   in a gang loop; its result is then written to out[] by a gang loop and
   every entry is compared with the host sum.  */

void wv_p_3()
{
  int arr[1024], out[32];
  int res = 0, hres = 0;
  int i;

  i = 0;
  while (i < 1024)
    {
      arr[i] = i ^ 33;
      i++;
    }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(res) copyin(arr) copyout(out)
  {
    /* Private variables aren't initialized by default in openacc.  */
    res = 0;

    /* "res" should be available at the end of the following loop (and should
       have the same value redundantly in each gang).  */
    #pragma acc loop worker vector reduction(+:res)
    for (i = 0; i < 1024; i++)
      res += arr[i];

    #pragma acc loop gang (static: 1)
    for (i = 0; i < 32; i++)
      out[i] = res;
  }

  /* Sequential host reference for the full-array sum.  */
  i = 0;
  while (i < 1024)
    {
      hres += arr[i];
      i++;
    }

  /* Every out[] entry must hold that same sum.  */
  for (i = 0; i < 32; i++)
    {
      assert (out[i] == hres);
    }
}
|
|
|
|
|
|
int main()
|
|
{
|
|
g_np_1();
|
|
gv_np_1();
|
|
gw_np_1();
|
|
gwv_np_1();
|
|
gwv_np_2();
|
|
gwv_np_3();
|
|
gwv_np_4();
|
|
v_p_1();
|
|
v_p_2();
|
|
w_p_1();
|
|
wv_p_1();
|
|
wv_p_2();
|
|
wv_p_3();
|
|
|
|
return 0;
|
|
}
|