Strengthen a few OpenACC test cases

Rather than rubber-stamp whatever requested vs. actual device kernel launch configuration happens, actually (again) verify the requested values (modulo expected variations). This better highlights that "AMD GCN has an upper limit of 'num_workers(16)'", and the deficiency that "AMD GCN uses the autovectorizer for the vector dimension: the use of a function call in vector-partitioned code [...] is not currently supported". And, this removes several instances of race conditions, where variables are concurrently written to in OpenACC gang-redundant mode. libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Strengthen. * testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
2022-01-21 12:48:28 +01:00 · 2022-01-21 12:48:28 +01:00 · 087e545747
parent 23be9f83bb
commit 087e545747
15 changed files with 202 additions and 57 deletions
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@ -19,9 +19,12 @@ int main ()

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
-  
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
-	    copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
+
+#define NG 32
+#define NW 32
+#define VL 32
+#pragma acc parallel num_gangs(NG) num_workers(NW) vector_length(VL) \
+	    copy(ary) copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  {
 #pragma acc loop gang worker vector
@ -45,11 +48,19 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
-
-    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  gangsize = NG;
+  workersize = NW;
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
@ -46,14 +46,17 @@ int main ()
  int ary[N];
  int ix;
  int exit = 0;
-  int gangsize = 0, workersize = 0, vectorsize = 0;
+  int gangsize, workersize, vectorsize;
  int *gangdist, *workerdist, *vectordist;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;

-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
-	    copy(ary) copyout(gangsize, workersize, vectorsize)
+#define NG 32
+#define NW 32
+#define VL 32
+#pragma acc parallel num_gangs(NG) num_workers(NW) vector_length(VL) \
+	    copy(ary)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  {
 #pragma acc loop gang worker vector
@ -71,11 +74,23 @@ int main ()

 	ary[ix] = (g << 16) | (w << 8) | v;
      }
-
-    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  gangsize = NG;
+  workersize = NW;
+  vectorsize = VL;
+#if defined ACC_DEVICE_TYPE_host
+  gangsize = 1;
+  workersize = 1;
+  vectorsize = 1;
+#elif defined ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  gangdist = (int *) __builtin_alloca (gangsize * sizeof (int));
  workerdist = (int *) __builtin_alloca (workersize * sizeof (int));
@ -92,6 +107,11 @@ int main ()
      int w = (ary[ix] >> 8) & 255;
      int v = ary[ix] & 255;

+      if (g >= gangsize
+	  || w >= workersize
+	  || v >= vectorsize)
+	__builtin_abort ();
+
      gangdist[g]++;
      workerdist[w]++;
      vectordist[v]++;
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@ -16,8 +16,11 @@ int main ()
  int t = 0, h = 0;
  int gangsize, workersize, vectorsize;

-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
-	copy(ondev) copyout(gangsize, workersize, vectorsize)
+#define NG 32
+#define NW 32
+#define VL 32
+#pragma acc parallel num_gangs(NG) num_workers(NW) vector_length(VL) \
+	copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  {
 #pragma acc loop gang worker vector reduction(+:t)
@ -42,10 +45,19 @@ int main ()
 	  }
 	t += val;
      }
-    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  gangsize = NG;
+  workersize = NW;
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
@ -17,7 +17,8 @@ int main ()
  int t = 0,  h = 0;
  int vectorsize;

-#pragma acc parallel vector_length(32) copy(ondev) copyout(vectorsize)
+#define VL 32
+#pragma acc parallel vector_length(VL) copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-1 } */
  {
 #pragma acc loop vector reduction (+:t)
@ -42,8 +43,14 @@ int main ()
 	  }
 	t += val;
      }
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
@ -17,7 +17,8 @@ int main ()
  int q = 0,  h = 0;
  int vectorsize;

-#pragma acc parallel vector_length(32) copy(q) copy(ondev) copyout(vectorsize)
+#define VL 32
+#pragma acc parallel vector_length(VL) copy(q) copy(ondev)
  /* { dg-note {variable 't' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-1 } */
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  {
@ -46,8 +47,14 @@ int main ()
 	t += val;
      }
    q = t;
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
@ -19,8 +19,10 @@ int main ()
  int t = 0,  h = 0;
  int workersize;

-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
-	    copyout(workersize)
+#define NW 32
+#define VL 32
+#pragma acc parallel num_workers(NW) vector_length(VL) \
+	    copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-3 } */
  {
@ -46,8 +48,13 @@ int main ()
 	  }
 	t += val;
      }
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }
+  workersize = NW;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
@ -19,8 +19,10 @@ int main ()
  int q = 0,  h = 0;
  int workersize;

-#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev) \
-	    copyout(workersize)
+#define NW 32
+#define VL 32
+#pragma acc parallel num_workers(NW) vector_length(VL) \
+	    copy(q) copy(ondev)
  /* { dg-note {variable 't' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-3 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-4 } */
@ -50,8 +52,13 @@ int main ()
 	t += val;
      }
    q = t;
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }
+  workersize = NW;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@ -16,8 +16,10 @@ int main ()
  int t = 0, h = 0;
  int workersize, vectorsize;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
-	    copyout(workersize, vectorsize)
+#define NW 32
+#define VL 32
+#pragma acc parallel num_workers(NW) vector_length(VL) \
+	    copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  {
 #pragma acc loop worker vector reduction (+:t)
@ -42,9 +44,18 @@ int main ()
 	  }
 	t += val;
      }
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  workersize = NW;
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
@ -20,8 +20,9 @@ int main ()
  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
-	    copyout(vectorsize)
+#define VL 32
+#pragma acc parallel vector_length(VL) \
+	    copy(ary) copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  {
 #pragma acc loop vector
@ -44,8 +45,14 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
@ -23,8 +23,10 @@ int main ()
  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
-	    copyout(workersize)
+#define NW 32
+#define VL 32
+#pragma acc parallel num_workers(NW) vector_length(VL) \
+	    copy(ary) copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } .-3 } */
  {
@ -48,8 +50,13 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }
+  workersize = NW;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@ -20,8 +20,10 @@ int main ()
  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
-	    copyout(workersize, vectorsize)
+#define NW 32
+#define VL 32
+#pragma acc parallel num_workers(NW) vector_length(VL) \
+	    copy(ary) copy(ondev)
  /* { dg-note {variable 'ix' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 } */
  {
 #pragma acc loop worker vector
@ -44,9 +46,18 @@ int main ()
 	else
 	  ary[ix] = ix;
      }
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  workersize = NW;
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@ -35,14 +35,27 @@ int main ()
  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
+#define NG 32
+#define NW 32
+#define VL 32
+#pragma acc parallel num_gangs(NG) num_workers(NW) vector_length(VL) \
+  copy(ary) copy(ondev)
  {
    ondev = acc_on_device (acc_device_not_host);
    gang (ary);
-    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  gangsize = NG;
+  workersize = NW;
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
@ -35,13 +35,20 @@ int main ()
  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
-	    copyout(vectorsize)
+#define VL 32
+#pragma acc parallel vector_length(VL) \
+	    copy(ary) copy(ondev)
  {
    ondev = acc_on_device (acc_device_not_host);
    vector (ary);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
@ -39,13 +39,20 @@ int main ()
  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
-	    copyout(workersize)
+#define NW 32
+#define VL 32
+#pragma acc parallel num_workers(NW) vector_length(VL) \
+	    copy(ary) copy(ondev)
  {
    ondev = acc_on_device (acc_device_not_host);
    worker (ary);
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
  }
+  workersize = NW;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+#endif

  for (ix = 0; ix < N; ix++)
    {
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@ -35,14 +35,25 @@ int main ()
  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
-	    copyout(workersize, vectorsize)
+#define NW 32
+#define VL 32
+#pragma acc parallel num_workers(NW) vector_length(VL) \
+	    copy(ary) copy(ondev)
  {
    ondev = acc_on_device (acc_device_not_host);
    worker (ary);
-    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
-    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
  }
+  workersize = NW;
+  vectorsize = VL;
+#ifdef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN has an upper limit of 'num_workers(16)'.  */
+  if (workersize > 16)
+    workersize = 16;
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
+  vectorsize = 1;
+#endif

  for (ix = 0; ix < N; ix++)
    {