From d43fbc7d3f7621e1c8f153c1471d2a5cd20bfdc8 Mon Sep 17 00:00:00 2001 From: Tom de Vries Date: Sun, 23 Jan 2022 06:29:58 +0100 Subject: [PATCH] [libgomp, testsuite] Fix insufficient resources in test-cases When running libgomp test-case broadcast-many.c on an nvptx accelerator (T400, driver version 470.86), I run into: ... libgomp: The Nvidia accelerator has insufficient resources to launch \ 'main$_omp_fn$0' with num_workers = 32 and vector_length = 32; \ recompile the program with 'num_workers = x and vector_length = y' on \ that offloaded region or '-fopenacc-dim=:x:y' where x * y <= 896. FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/broadcast-many.c \ -DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none \ -O0 execution test ... The error does not occur when using GOMP_NVPTX_JIT=-O0. Fix this by using 896 / 32 == 28 workers for ACC_DEVICE_TYPE_nvidia. Likewise for some other test-cases. Tested libgomp on x86_64 with nvptx accelerator. libgomp/ChangeLog: 2022-01-27 Tom de Vries * testsuite/libgomp.oacc-c-c++-common/broadcast-many.c: Reduce num_workers for nvidia accelerator to fix libgomp error 'insufficient resources'. * testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c: Same. * testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Same. --- .../libgomp.oacc-c-c++-common/broadcast-many.c | 9 ++++++++- .../par-loop-comb-reduction-4.c | 10 +++++++++- .../testsuite/libgomp.oacc-c-c++-common/reduction-7.c | 9 ++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c index 37839edfb09..08e026960e6 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c @@ -5,6 +5,13 @@ #include #include +#if ACC_DEVICE_TYPE_nvidia +/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'. */ +#define NUM_WORKERS 28 +#else +#define NUM_WORKERS 32 +#endif + #define LOCAL(n) double n = input; #define LOCALS(n) LOCAL(n##1) LOCAL(n##2) LOCAL(n##3) LOCAL(n##4) \ LOCAL(n##5) LOCAL(n##6) LOCAL(n##7) LOCAL(n##8) @@ -23,7 +30,7 @@ int main (void) int ret; int input = 1; - #pragma acc parallel num_gangs(1) num_workers(32) copyout(ret) + #pragma acc parallel num_gangs(1) num_workers(NUM_WORKERS) copyout(ret) { int w = 0; LOCALS2(h); diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c index c3cc12fa953..4c66dc7bfea 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c @@ -1,5 +1,12 @@ #include +#if ACC_DEVICE_TYPE_nvidia +/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'. */ +#define NUM_WORKERS 24 +#else +#define NUM_WORKERS 32 +#endif + /* Test of reduction on both parallel and loop directives (workers and vectors together in gang-partitioned mode, float type, multiple reductions). */ @@ -13,7 +20,8 @@ main (int argc, char *argv[]) for (i = 0; i < 32768; i++) arr[i] = i % (32768 / 64); - #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \ + #pragma acc parallel \ + num_gangs(32) num_workers(NUM_WORKERS) vector_length(32) \ reduction(+:res) reduction(max:mres) copy(res, mres) { #pragma acc loop gang /* { dg-warning "nested loop in reduction needs reduction clause for 'm\?res'" "TODO" } */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c index c2fb922a7f1..b4fe2300581 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c @@ -181,6 +181,12 @@ void gwv_np_3() assert (res == hres); } +#if ACC_DEVICE_TYPE_nvidia +/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'. */ +#define NUM_WORKERS 28 +#else +#define NUM_WORKERS 32 +#endif /* Test of reduction on loop directive (gangs, workers and vectors, multiple non-private reduction variables, float type). */ @@ -194,7 +200,7 @@ void gwv_np_4() for (i = 0; i < 32768; i++) arr[i] = i % (32768 / 64); - #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) + #pragma acc parallel num_gangs(32) num_workers(NUM_WORKERS) vector_length(32) { #pragma acc loop gang reduction(+:res) reduction(max:mres) for (j = 0; j < 32; j++) @@ -235,6 +241,7 @@ void gwv_np_4() assert (mres == hmres); } +#undef NUM_WORKERS /* Test of reduction on loop directive (vectors, private reduction variable). */