[omp, simt] Handle alternative IV

Consider the test-case libgomp.c/pr81778.c added in this commit, with
this core loop (note: CANARY_SIZE set to 0 for simplicity):
...
  int s = 1;
  #pragma omp target simd
  for (int i = N - 1; i > -1; i -= s)
    a[i] = 1;
...
which, given that N is 32, sets a[0..31] to 1.

After omp-expand, this looks like:
...
  <bb 5> :
  simduid.7 = .GOMP_SIMT_ENTER (simduid.7);
  .omp_simt.8 = .GOMP_SIMT_ENTER_ALLOC (simduid.7);
  D.3193 = -s;
  s.9 = s;
  D.3204 = .GOMP_SIMT_LANE ();
  D.3205 = -s.9;
  D.3206 = (int) D.3204;
  D.3207 = D.3205 * D.3206;
  i = D.3207 + 31;
  D.3209 = 0;
  D.3210 = -s.9;
  D.3211 = D.3210 - i;
  D.3210 = -s.9;
  D.3212 = D.3211 / D.3210;
  D.3213 = (unsigned int) D.3212;
  D.3213 = i >= 0 ? D.3213 : 0;

  <bb 19> :
  if (D.3209 < D.3213)
    goto <bb 6>; [87.50%]
  else
    goto <bb 7>; [12.50%]

  <bb 6> :
  a[i] = 1;
  D.3215 = -s.9;
  D.3219 = .GOMP_SIMT_VF ();
  D.3216 = (int) D.3219;
  D.3220 = D.3215 * D.3216;
  i = D.3220 + i;
  D.3209 = D.3209 + 1;
  goto <bb 19>; [100.00%]
...

On nvptx, the first time bb6 is executed, i is in the 0..31 range (depending
on the lane that is executing) at bb entry.

So we have the following sequence:
- a[0..31] is set to 1
- i is updated to -32..-1
- D.3209 is updated to 1 (being 0 initially)
- bb19 is executed, and if condition (D.3209 < D.3213) == (1 < 32) evaluates
  to true
- bb6 is once more executed, which should not happen because all the elements
  that needed to be handled were already handled.
- consequently, elements that should not be written are written
- with CANARY_SIZE == 0, we may run into a libgomp error:
  ...
  libgomp: cuCtxSynchronize error: an illegal memory access was encountered
  ...
  and with CANARY_SIZE unmodified, we run into:
  ...
  Expected 0, got 1 at base[-961]
  Aborted (core dumped)
  ...

The cause of this is as follows:
- because the step s is a variable rather than a constant, an alternative
  IV (D.3209 in our example) is generated in expand_omp_simd, and the
  loop condition is tested in terms of the alternative IV rather than
  the original IV (i in our example).
- the SIMT code in expand_omp_simd works by modifying step and initial value.
- The initial value fd->loop.n1 is loaded into a variable n1, which is
  modified by the SIMT code and then used there-after.
- The step fd->loop.step is loaded into a variable step, which is modified
  by the SIMT code, but afterwards there are uses of both step and
  fd->loop.step.
- There are uses of fd->loop.step in the alternative IV handling code,
  which should use step instead.

Fix this by introducing an additional variable orig_step, which is not
modified by the SIMT code and replacing all remaining uses of fd->loop.step
by either step or orig_step.

Build on x86_64-linux with nvptx accelerator, tested libgomp.

This fixes for-5.c and for-6.c FAILs I'm currently seeing on a quadro m1200
with driver 450.66.

gcc/ChangeLog:

2020-10-02  Tom de Vries  <tdevries@suse.de>

	* omp-expand.c (expand_omp_simd): Add step_orig, and replace uses of
	fd->loop.step by either step or orig_step.

libgomp/ChangeLog:

2020-10-02  Tom de Vries  <tdevries@suse.de>

	* testsuite/libgomp.c/pr81778.c: New test.
This commit is contained in:
Tom de Vries 2020-10-01 18:11:02 +02:00
parent 7d6f7aa409
commit fc14ff6111
2 changed files with 54 additions and 5 deletions

View File

@ -6360,6 +6360,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
n2 = OMP_CLAUSE_DECL (innerc);
}
tree step = fd->loop.step;
tree orig_step = step; /* May be different from step if is_simt. */
bool is_simt = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
OMP_CLAUSE__SIMT_);
@ -6510,7 +6511,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
tree altv = NULL_TREE, altn2 = NULL_TREE;
if (fd->collapse == 1
&& !broken_loop
&& TREE_CODE (fd->loops[0].step) != INTEGER_CST)
&& TREE_CODE (orig_step) != INTEGER_CST)
{
/* The vectorizer currently punts on loops with non-constant steps
for the main IV (can't compute number of iterations and gives up
@ -6526,7 +6527,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
itype = signed_type_for (itype);
t = build_int_cst (itype, (fd->loop.cond_code == LT_EXPR ? -1 : 1));
t = fold_build2 (PLUS_EXPR, itype,
fold_convert (itype, fd->loop.step), t);
fold_convert (itype, step), t);
t = fold_build2 (PLUS_EXPR, itype, t, fold_convert (itype, n2));
t = fold_build2 (MINUS_EXPR, itype, t,
fold_convert (itype, fd->loop.v));
@ -6534,10 +6535,10 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
t = fold_build2 (TRUNC_DIV_EXPR, itype,
fold_build1 (NEGATE_EXPR, itype, t),
fold_build1 (NEGATE_EXPR, itype,
fold_convert (itype, fd->loop.step)));
fold_convert (itype, step)));
else
t = fold_build2 (TRUNC_DIV_EXPR, itype, t,
fold_convert (itype, fd->loop.step));
fold_convert (itype, step));
t = fold_convert (TREE_TYPE (altv), t);
altn2 = create_tmp_var (TREE_TYPE (altv));
expand_omp_build_assign (&gsi, altn2, t);
@ -6685,7 +6686,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
if (is_simt)
{
gsi = gsi_start_bb (l2_bb);
step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), fd->loop.step, step);
step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), orig_step, step);
if (POINTER_TYPE_P (type))
t = fold_build_pointer_plus (fd->loop.v, step);
else

View File

@ -0,0 +1,48 @@
/* Minimized from for-5.c. */
#include <stdio.h>
#include <stdlib.h>
/* Size of array we want to write. */
#define N 32
/* Size of extra space before and after. */
#define CANARY_SIZE (N * 32)
/* Start of array we want to write. */
#define BASE (CANARY_SIZE)
// Total size to be allocated.
#define ALLOC_SIZE (CANARY_SIZE + N + CANARY_SIZE)
#pragma omp declare target
int a[ALLOC_SIZE];
#pragma omp end declare target
int
main (void)
{
/* Use variable step in for loop. */
int s = 1;
#pragma omp target update to(a)
/* Write a[BASE] .. a[BASE + N - 1]. */
#pragma omp target simd
for (int i = N - 1; i > -1; i -= s)
a[BASE + i] = 1;
#pragma omp target update from(a)
for (int i = 0; i < ALLOC_SIZE; i++)
{
int expected = (BASE <= i && i < BASE + N) ? 1 : 0;
if (a[i] == expected)
continue;
printf ("Expected %d, got %d at base[%d]\n", expected, a[i], i - BASE);
abort ();
}
return 0;
}