gcc/libgomp/testsuite/libgomp.c/scan-18.c

/* { dg-require-effective-target size32plus } */
/* { dg-additional-options "-O2 -fopenmp -fdump-tree-vect-details" } */
/* { dg-additional-options "-msse2" { target sse2_runtime } } */
/* { dg-additional-options "-mavx" { target avx_runtime } } */
/* { dg-final { scan-tree-dump-times "vectorized \[2-6] loops" 2 "vect" { target sse2_runtime } } } */

/* abort is declared by hand; this file includes no headers.  */
extern void abort (void);
/* File-scope test state.  r, r2 and r3 are the scan accumulators used by
   foo and baz, in int, unsigned short and unsigned char width.  a is the
   input array; b, b2 and b3 receive the per-iteration scan results in the
   same three widths.  */
int r, a[1024], b[1024];
unsigned short r2, b2[1024];
unsigned char r3, b3[1024];

/* Exclusive scan over a[0..1023] into the file-scope accumulators r, r2
   and r3, using an OpenMP inscan reduction on a worksharing `for simd'
   loop.

   For each i, the values of r, r2 and r3 are stored to b[i], b2[i] and
   b3[i] before a[i] is added to them (the stores precede the
   `scan exclusive' directive, the additions follow it).

   The function has no parallel construct of its own; main calls it from
   inside `#pragma omp parallel'.  The accumulators are not reset here.

   a, b, b2, b3: arrays indexed 0..1023; these parameters shadow the
   file-scope arrays of the same names.

   noipa is a GCC attribute; presumably used so the call is not inlined or
   specialised into main -- see the GCC function attribute docs.  */
__attribute__((noipa)) void
foo (int *a, int *b, unsigned short *b2, unsigned char *b3)
{
  #pragma omp for simd reduction (inscan, +:r, r2, r3)
  for (int i = 0; i < 1024; i++)
    {
      {
	b[i] = r;
	b2[i] = r2;
	b3[i] = r3;
      }
      #pragma omp scan exclusive(r, r2, r3)
      { r += a[i]; r2 += a[i]; r3 += a[i]; }
    }
}

/* Exclusive scan of 2 * a[i] over i = 0..1023 using local accumulators
   s, s2 and s3 (int, unsigned short, unsigned char), all starting at 0.

   Unlike foo, this function opens its own parallel region and reads and
   writes the file-scope arrays a, b, b2 and b3 directly.  For each i the
   accumulator values are stored to b[i], b2[i] and b3[i] before
   2 * a[i] is added.

   s2p, s3p: receive the final values of s2 and s3.
   Returns the final value of s.  */
__attribute__((noipa)) int
bar (unsigned short *s2p, unsigned char *s3p)
{
  int s = 0;
  unsigned short s2 = 0;
  unsigned char s3 = 0;
  #pragma omp parallel
  #pragma omp for simd reduction (inscan, +:s, s2, s3)
  for (int i = 0; i < 1024; i++)
    {
      { b[i] = s; b2[i] = s2; b3[i] = s3; }
      #pragma omp scan exclusive(s, s2, s3)
      {
	s += 2 * a[i];
	s2 += 2 * a[i];
	s3 += 2 * a[i];
      }
    }
  *s2p = s2;
  *s3p = s3;
  return s;
}

/* Same exclusive scan as foo (adds a[i] into the file-scope r, r2 and r3,
   storing the pre-addition values to b[i], b2[i] and b3[i]), but written
   as a combined `parallel for simd' with an `if (simd: 0)' clause.

   NOTE(review): the constant-false simd `if' presumably exercises the
   path where the simd part of the construct is disabled -- confirm
   against the OpenMP specification.

   The accumulators are not reset here; main zeroes r, r2 and r3 before
   the call.

   a, b, b2, b3: arrays indexed 0..1023; these parameters shadow the
   file-scope arrays of the same names.  */
__attribute__((noipa)) void
baz (int *a, int *b, unsigned short *b2, unsigned char *b3)
{
  #pragma omp parallel for simd reduction (inscan, +:r, r2, r3) if (simd: 0)
  for (int i = 0; i < 1024; i++)
    {
      {
	b[i] = r;
	b2[i] = r2;
	b3[i] = r3;
      }
      #pragma omp scan exclusive(r, r2, r3)
      {
	r += a[i];
	r2 += a[i];
	r3 += a[i];
      }
    }
}

/* Same exclusive scan as bar (adds 2 * a[i] into local s, s2 and s3,
   storing the pre-addition values to the file-scope b[i], b2[i] and
   b3[i]), but written as a combined `parallel for simd' with a
   `simdlen (1)' clause.

   NOTE(review): simdlen (1) presumably requests a single SIMD lane, i.e.
   the scan lowering without real vectorisation -- confirm against the
   OpenMP specification.

   s2p, s3p: receive the final values of s2 and s3.
   Returns the final value of s.  */
__attribute__((noipa)) int
qux (unsigned short *s2p, unsigned char *s3p)
{
  int s = 0;
  unsigned short s2 = 0;
  unsigned char s3 = 0;
  #pragma omp parallel for simd simdlen (1) reduction (inscan, +:s, s2, s3)
  for (int i = 0; i < 1024; i++)
    {
      { b[i] = s; b2[i] = s2; b3[i] = s3; }
      #pragma omp scan exclusive(s, s2, s3)
      { s += 2 * a[i]; s2 += 2 * a[i]; s3 += 2 * a[i]; }
    }
  *s2p = s2;
  *s3p = s3;
  return s;
}

/* Test driver.  Runs foo, bar, baz and qux in turn and, after each one,
   checks both the final accumulator values and every element of b, b2
   and b3 against a serially computed exclusive prefix sum.  Any mismatch
   calls abort; returns 0 on success.

   The narrow results are compared against the int value cast to
   unsigned short / unsigned char, i.e. the expected value after
   truncation to that width.  */
int
main ()
{
  int s = 0;
  /* s2 and s3 are first written by bar through the pointers passed to it.  */
  unsigned short s2;
  unsigned char s3;
  /* Input is a[i] = i; the result arrays start out filled with -1.  */
  for (int i = 0; i < 1024; ++i)
    {
      a[i] = i;
      b[i] = -1;
      b2[i] = -1;
      b3[i] = -1;
      /* Empty asm that claims to read and write i.
	 NOTE(review): presumably keeps this init loop from being
	 vectorised so it does not affect the count scanned for by the
	 dg-final directive -- confirm.  */
      asm ("" : "+g" (i));
    }
  /* foo has a worksharing loop but no parallel construct of its own, so
     it is invoked from a parallel region here.  */
  #pragma omp parallel
  foo (a, b, b2, b3);
  /* 1024 * 1023 / 2 is the sum of 0..1023.  */
  if (r != 1024 * 1023 / 2
      || r2 != (unsigned short) r
      || r3 != (unsigned char) r)
    abort ();
  /* s is still 0 from its initializer.  Each element must equal the sum
     of a[0..i-1]; once checked it is overwritten with a sentinel so the
     next phase cannot pass on stale data.  */
  for (int i = 0; i < 1024; ++i)
    {
      if (b[i] != s
	  || b2[i] != (unsigned short) s
	  || b3[i] != (unsigned char) s)
	abort ();
      else
	{
	  b[i] = 25;
	  b2[i] = 24;
	  b3[i] = 26;
	}
      s += i;
    }
  /* bar scans 2 * a[i], so the expected total is twice the sum above.  */
  if (bar (&s2, &s3) != 1024 * 1023)
    abort ();
  if (s2 != (unsigned short) (1024 * 1023)
      || s3 != (unsigned char) (1024 * 1023))
    abort ();
  s = 0;
  for (int i = 0; i < 1024; ++i)
    {
      if (b[i] != s
	  || b2[i] != (unsigned short) s
	  || b3[i] != (unsigned char) s)
	abort ();
      else
	{
	  b[i] = -1;
	  b2[i] = -1;
	  b3[i] = -1;
	}
      s += 2 * i;
    }
  /* baz accumulates into the same globals as foo, so reset them first.  */
  r = 0;
  r2 = 0;
  r3 = 0;
  baz (a, b, b2, b3);
  if (r != 1024 * 1023 / 2
      || r2 != (unsigned short) r
      || r3 != (unsigned char) r)
    abort ();
  s = 0;
  for (int i = 0; i < 1024; ++i)
    {
      if (b[i] != s
	  || b2[i] != (unsigned short) s
	  || b3[i] != (unsigned char) s)
	abort ();
      else
	{
	  b[i] = 25;
	  b2[i] = 24;
	  b3[i] = 26;
	}
      s += i;
    }
  /* Clear the values left by bar so a match below must come from qux.  */
  s2 = 0;
  s3 = 0;
  if (qux (&s2, &s3) != 1024 * 1023)
    abort ();
  if (s2 != (unsigned short) (1024 * 1023)
      || s3 != (unsigned char) (1024 * 1023))
    abort ();
  s = 0;
  /* Last phase: verify only; no sentinel refill is needed.  */
  for (int i = 0; i < 1024; ++i)
    {
      if (b[i] != s
	  || b2[i] != (unsigned short) s
	  || b3[i] != (unsigned char) s)
	abort ();
      s += 2 * i;
    }
  return 0;
}
omp-low.c (struct omp_context): Add for_simd_scan_phase member. * omp-low.c (struct omp_context): Add for_simd_scan_phase member. (maybe_lookup_ctx): Add forward declaration. (omp_find_scan): Likewise. Walk into body of simd if composited with worksharing loop. (scan_omp_simd_scan): New function. (scan_omp_1_stmt): Call it. (lower_rec_simd_input_clauses): Don't create rvar nor rvar2 if ctx->for_simd_scan_phase. (lower_rec_input_clauses): Do much less work for inscan reductions in ctx->for_simd_scan_phase is_simd regions. (lower_omp_scan): Set is_simd also on simd constructs composited with worksharing loop, unless ctx->for_simd_scan_phase. Never emit a sorry message. Don't change GIMPLE_OMP_SCAN stmts into nops and emit their body after in simd constructs composited with worksharing loop. (lower_omp_for_scan): Handle worksharing loop composited with simd. * c-c++-common/gomp/scan-4.c: Don't expect sorry message. * testsuite/libgomp.c/scan-11.c: New test. * testsuite/libgomp.c/scan-12.c: New test. * testsuite/libgomp.c/scan-13.c: New test. * testsuite/libgomp.c/scan-14.c: New test. * testsuite/libgomp.c/scan-15.c: New test. * testsuite/libgomp.c/scan-16.c: New test. * testsuite/libgomp.c/scan-17.c: New test. * testsuite/libgomp.c/scan-18.c: New test. * testsuite/libgomp.c++/scan-9.C: New test. * testsuite/libgomp.c++/scan-10.C: New test. * testsuite/libgomp.c++/scan-11.C: New test. * testsuite/libgomp.c++/scan-12.C: New test. * testsuite/libgomp.c++/scan-13.C: New test. * testsuite/libgomp.c++/scan-14.C: New test. * testsuite/libgomp.c++/scan-15.C: New test. * testsuite/libgomp.c++/scan-16.C: New test. From-SVN: r273157 2019-07-06 09:53:48 +02:00			`/* { dg-require-effective-target size32plus } */`
vectorizer: Fix up -fsimd-cost-model= handling > * testsuite/libgomp.c++/scan-10.C: Add option -fvect-cost-model=cheap. I don't think this is the right thing to do. This just means that at some point between 2013 when -fsimd-cost-model has been introduced and now -fsimd-cost-model= option at least partially stopped working properly. As documented, -fsimd-cost-model= overrides the -fvect-cost-model= setting for OpenMP simd loops (loop->force_vectorize is true) if specified differently from default. In tree-vectorizer.h we have: static inline bool unlimited_cost_model (loop_p loop) { if (loop != NULL && loop->force_vectorize && flag_simd_cost_model != VECT_COST_MODEL_DEFAULT) return flag_simd_cost_model == VECT_COST_MODEL_UNLIMITED; return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED); } and use it in various places, but we also just use flag_vect_cost_model in lots of places (and in one spot use flag_simd_cost_model, not sure if we are sure it is a force_vectorize loop or what). So, IMHO we should change the above inline function to loop_cost_model and let it return the cost model and then just reimplement unlimited_cost_model as return loop_cost_model (loop) == VECT_COST_MODEL_UNLIMITED; and then adjust the direct uses of the flag and revert these changes. 2021-10-12 Jakub Jelinek <jakub@redhat.com> gcc/ * tree-vectorizer.h (loop_cost_model): New function. (unlimited_cost_model): Use it. * tree-vect-loop.c (vect_analyze_loop_costing): Use loop_cost_model call instead of flag_vect_cost_model. * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Likewise. (vect_prune_runtime_alias_test_list): Likewise. Also use it instead of flag_simd_cost_model. gcc/testsuite/ * gcc.dg/gomp/simd-2.c: Remove option -fvect-cost-model=cheap. * gcc.dg/gomp/simd-3.c: Likewise. libgomp/ * testsuite/libgomp.c/scan-11.c: Remove option -fvect-cost-model=cheap. * testsuite/libgomp.c/scan-12.c: Likewise. * testsuite/libgomp.c/scan-13.c: Likewise. 
* testsuite/libgomp.c/scan-14.c: Likewise. * testsuite/libgomp.c/scan-15.c: Likewise. * testsuite/libgomp.c/scan-16.c: Likewise. * testsuite/libgomp.c/scan-17.c: Likewise. * testsuite/libgomp.c/scan-18.c: Likewise. * testsuite/libgomp.c/scan-19.c: Likewise. * testsuite/libgomp.c/scan-20.c: Likewise. * testsuite/libgomp.c/scan-21.c: Likewise. * testsuite/libgomp.c/scan-22.c: Likewise. * testsuite/libgomp.c++/scan-9.C: Likewise. * testsuite/libgomp.c++/scan-10.C: Likewise. * testsuite/libgomp.c++/scan-11.C: Likewise. * testsuite/libgomp.c++/scan-12.C: Likewise. * testsuite/libgomp.c++/scan-13.C: Likewise. * testsuite/libgomp.c++/scan-14.C: Likewise. * testsuite/libgomp.c++/scan-15.C: Likewise. * testsuite/libgomp.c++/scan-16.C: Likewise. 2021-10-12 09:28:10 +02:00			`/* { dg-additional-options "-O2 -fopenmp -fdump-tree-vect-details" } */`
re PR libgomp/91530 (Several libgomp./scan- tests FAIL without avx_runtime) PR libgomp/91530 * testsuite/libgomp.c/scan-11.c: Add -msse2 option for sse2_runtime targets. * testsuite/libgomp.c/scan-12.c: Likewise. * testsuite/libgomp.c/scan-13.c: Likewise. * testsuite/libgomp.c/scan-14.c: Likewise. * testsuite/libgomp.c/scan-15.c: Likewise. * testsuite/libgomp.c/scan-16.c: Likewise. * testsuite/libgomp.c/scan-17.c: Likewise. * testsuite/libgomp.c/scan-18.c: Likewise. * testsuite/libgomp.c/scan-19.c: Likewise. * testsuite/libgomp.c/scan-20.c: Likewise. * testsuite/libgomp.c++/scan-9.C: Likewise. * testsuite/libgomp.c++/scan-10.C: Likewise. * testsuite/libgomp.c++/scan-11.C: Likewise. * testsuite/libgomp.c++/scan-12.C: Likewise. * testsuite/libgomp.c++/scan-14.C: Likewise. * testsuite/libgomp.c++/scan-15.C: Likewise. * testsuite/libgomp.c++/scan-13.C: Likewise. Use sse2_runtime instead of i?86-- x86_64-- as target for scan-tree-dump-times. * testsuite/libgomp.c++/scan-16.C: Likewise. From-SVN: r274947 2019-08-27 12:45:55 +02:00			`/* { dg-additional-options "-msse2" { target sse2_runtime } } */`
omp-low.c (struct omp_context): Add for_simd_scan_phase member. * omp-low.c (struct omp_context): Add for_simd_scan_phase member. (maybe_lookup_ctx): Add forward declaration. (omp_find_scan): Likewise. Walk into body of simd if composited with worksharing loop. (scan_omp_simd_scan): New function. (scan_omp_1_stmt): Call it. (lower_rec_simd_input_clauses): Don't create rvar nor rvar2 if ctx->for_simd_scan_phase. (lower_rec_input_clauses): Do much less work for inscan reductions in ctx->for_simd_scan_phase is_simd regions. (lower_omp_scan): Set is_simd also on simd constructs composited with worksharing loop, unless ctx->for_simd_scan_phase. Never emit a sorry message. Don't change GIMPLE_OMP_SCAN stmts into nops and emit their body after in simd constructs composited with worksharing loop. (lower_omp_for_scan): Handle worksharing loop composited with simd. * c-c++-common/gomp/scan-4.c: Don't expect sorry message. * testsuite/libgomp.c/scan-11.c: New test. * testsuite/libgomp.c/scan-12.c: New test. * testsuite/libgomp.c/scan-13.c: New test. * testsuite/libgomp.c/scan-14.c: New test. * testsuite/libgomp.c/scan-15.c: New test. * testsuite/libgomp.c/scan-16.c: New test. * testsuite/libgomp.c/scan-17.c: New test. * testsuite/libgomp.c/scan-18.c: New test. * testsuite/libgomp.c++/scan-9.C: New test. * testsuite/libgomp.c++/scan-10.C: New test. * testsuite/libgomp.c++/scan-11.C: New test. * testsuite/libgomp.c++/scan-12.C: New test. * testsuite/libgomp.c++/scan-13.C: New test. * testsuite/libgomp.c++/scan-14.C: New test. * testsuite/libgomp.c++/scan-15.C: New test. * testsuite/libgomp.c++/scan-16.C: New test. From-SVN: r273157 2019-07-06 09:53:48 +02:00			`/* { dg-additional-options "-mavx" { target avx_runtime } } */`
			`/* { dg-final { scan-tree-dump-times "vectorized \[2-6] loops" 2 "vect" { target sse2_runtime } } } */`

			`extern void abort (void);`
			`int r, a[1024], b[1024];`
			`unsigned short r2, b2[1024];`
			`unsigned char r3, b3[1024];`

			`__attribute__((noipa)) void`
			`foo (int *a, int *b, unsigned short *b2, unsigned char *b3)`
			`{`
			`#pragma omp for simd reduction (inscan, +:r, r2, r3)`
			`for (int i = 0; i < 1024; i++)`
			`{`
			`{`
			`b[i] = r;`
			`b2[i] = r2;`
			`b3[i] = r3;`
			`}`
			`#pragma omp scan exclusive(r, r2, r3)`
			`{ r += a[i]; r2 += a[i]; r3 += a[i]; }`
			`}`
			`}`

			`__attribute__((noipa)) int`
			`bar (unsigned short *s2p, unsigned char *s3p)`
			`{`
			`int s = 0;`
			`unsigned short s2 = 0;`
			`unsigned char s3 = 0;`
			`#pragma omp parallel`
			`#pragma omp for simd reduction (inscan, +:s, s2, s3)`
			`for (int i = 0; i < 1024; i++)`
			`{`
			`{ b[i] = s; b2[i] = s2; b3[i] = s3; }`
			`#pragma omp scan exclusive(s, s2, s3)`
			`{`
			`s += 2 * a[i];`
			`s2 += 2 * a[i];`
			`s3 += 2 * a[i];`
			`}`
			`}`
			`*s2p = s2;`
			`*s3p = s3;`
			`return s;`
			`}`

			`__attribute__((noipa)) void`
			`baz (int *a, int *b, unsigned short *b2, unsigned char *b3)`
			`{`
			`#pragma omp parallel for simd reduction (inscan, +:r, r2, r3) if (simd: 0)`
			`for (int i = 0; i < 1024; i++)`
			`{`
			`{`
			`b[i] = r;`
			`b2[i] = r2;`
			`b3[i] = r3;`
			`}`
			`#pragma omp scan exclusive(r, r2, r3)`
			`{`
			`r += a[i];`
			`r2 += a[i];`
			`r3 += a[i];`
			`}`
			`}`
			`}`

			`__attribute__((noipa)) int`
			`qux (unsigned short *s2p, unsigned char *s3p)`
			`{`
			`int s = 0;`
			`unsigned short s2 = 0;`
			`unsigned char s3 = 0;`
			`#pragma omp parallel for simd simdlen (1) reduction (inscan, +:s, s2, s3)`
			`for (int i = 0; i < 1024; i++)`
			`{`
			`{ b[i] = s; b2[i] = s2; b3[i] = s3; }`
			`#pragma omp scan exclusive(s, s2, s3)`
			`{ s += 2 * a[i]; s2 += 2 * a[i]; s3 += 2 * a[i]; }`
			`}`
			`*s2p = s2;`
			`*s3p = s3;`
			`return s;`
			`}`

			`int`
			`main ()`
			`{`
			`int s = 0;`
			`unsigned short s2;`
			`unsigned char s3;`
			`for (int i = 0; i < 1024; ++i)`
			`{`
			`a[i] = i;`
			`b[i] = -1;`
			`b2[i] = -1;`
			`b3[i] = -1;`
			`asm ("" : "+g" (i));`
			`}`
			`#pragma omp parallel`
			`foo (a, b, b2, b3);`
			`if (r != 1024 * 1023 / 2`
			`\|\| r2 != (unsigned short) r`
			`\|\| r3 != (unsigned char) r)`
			`abort ();`
			`for (int i = 0; i < 1024; ++i)`
			`{`
			`if (b[i] != s`
			`\|\| b2[i] != (unsigned short) s`
			`\|\| b3[i] != (unsigned char) s)`
			`abort ();`
			`else`
			`{`
			`b[i] = 25;`
			`b2[i] = 24;`
			`b3[i] = 26;`
			`}`
			`s += i;`
			`}`
			`if (bar (&s2, &s3) != 1024 * 1023)`
			`abort ();`
			`if (s2 != (unsigned short) (1024 * 1023)`
			`\|\| s3 != (unsigned char) (1024 * 1023))`
			`abort ();`
			`s = 0;`
			`for (int i = 0; i < 1024; ++i)`
			`{`
			`if (b[i] != s`
			`\|\| b2[i] != (unsigned short) s`
			`\|\| b3[i] != (unsigned char) s)`
			`abort ();`
			`else`
			`{`
			`b[i] = -1;`
			`b2[i] = -1;`
			`b3[i] = -1;`
			`}`
			`s += 2 * i;`
			`}`
			`r = 0;`
			`r2 = 0;`
			`r3 = 0;`
			`baz (a, b, b2, b3);`
			`if (r != 1024 * 1023 / 2`
			`\|\| r2 != (unsigned short) r`
			`\|\| r3 != (unsigned char) r)`
			`abort ();`
			`s = 0;`
			`for (int i = 0; i < 1024; ++i)`
			`{`
			`if (b[i] != s`
			`\|\| b2[i] != (unsigned short) s`
			`\|\| b3[i] != (unsigned char) s)`
			`abort ();`
			`else`
			`{`
			`b[i] = 25;`
			`b2[i] = 24;`
			`b3[i] = 26;`
			`}`
			`s += i;`
			`}`
			`s2 = 0;`
			`s3 = 0;`
			`if (qux (&s2, &s3) != 1024 * 1023)`
			`abort ();`
			`if (s2 != (unsigned short) (1024 * 1023)`
			`\|\| s3 != (unsigned char) (1024 * 1023))`
			`abort ();`
			`s = 0;`
			`for (int i = 0; i < 1024; ++i)`
			`{`
			`if (b[i] != s`
			`\|\| b2[i] != (unsigned short) s`
			`\|\| b3[i] != (unsigned char) s)`
			`abort ();`
			`s += 2 * i;`
			`}`
			`return 0;`
			`}`