c2ebf4f10d
The following change adds support for non-rectangular simd loops. While working on that, I've noticed we actually don't vectorize collapsed simd loops at all, because the code that I thought would be vectorizable actually is not vectorized. While for constant lower/upper bounds and constant step of all but the outermost loop we could in theory vectorize by computing the separate iterators using vectorized division and modulo for each of them from the single iterator that increments by 1 from 0 to the total iteration count in the loop nest, I think that would be fairly expensive and the chances of the loop body being vectorizable would be low, e.g. because array indices are unlikely to be linear and would need scatters/gathers. This patch changes the generated code to vectorize only the innermost loop, which has a higher chance of being vectorized. Below is the list of tests and function names in which the patch resulted in vectorizing something that hasn't been vectorized before (ok, the first line is a new test). I've also found that the vectorizer will not vectorize loops with non-constant steps; I plan to do something about those incrementally on the omp-expand.c side (basically, compute the number of iterations before the loop and use a 0 to number_of_iterations step 1 IV as the main one). I have a problem with the composite simd vectorization though. The point is that each thread (or task etc.) is given only a range of consecutive iterations, so somewhere earlier it computes the total number of iterations and splits the work between the workers, and then the intent is to try to vectorize it. So, each thread is then given a begin ... end-1 range that it would handle. This means that from the single begin value I need to compute the individual iteration vars I should start at and then goto into the loop nest to begin iterating there (and actually compute how many iterations the innermost loop should do each time so that it stops before end).
Very roughly the IL I emit is something like: int t[100][100][100]; void foo (int a, int b, int c, int d, int e, int f, int g, int h, int u, int v, int w, int x) { int i, j, k; int cnt; if (x) { i = u; j = v; k = w; goto doit; } for (i = a; i < b; i += c) for (j = d; j < e; j += f) { k = g; doit: for (; k < h; k++) t[i][j][k] += i + j + k; } } Unfortunately, some pass then turns the innermost loop to have more than 2 basic blocks and it isn't vectorized because of that. Also, I have disabled (for now) SIMTization of collapsed simd loops, because for SIMT it would be using a single thread anyway and I didn't want to bother with checking SIMT on all places I've been changing. If SIMT support is added for some or all collapsed loops, that omp-low.c change needs to be reverted. Here is that list of what hasn't been vectorized before and is now: gcc/testsuite/gcc.dg/vect/vect-simd-17.c doit gcc/testsuite/gfortran.dg/gomp/openmp-simd-6.f90 bar libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-10.c f28_taskloop_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-10.c _Z24f28_taskloop_simd_normalv._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-11.c f25_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-11.c f26_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-11.c f27_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-11.c f28_tpf_simd_guided32._omp_fn.1 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-11.c f28_tpf_simd_runtime._omp_fn.1 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-11.c _Z17f25_t_simd_normaliiiiiii._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-11.c _Z17f26_t_simd_normaliiiixxi._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-11.c _Z17f27_t_simd_normalv._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-11.c 
_Z20f28_tpf_simd_runtimev._omp_fn.1 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-11.c _Z21f28_tpf_simd_guided32v._omp_fn.1 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-2.c f7_simd_normal libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-2.c f7_simd_normal libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-2.c f8_f_simd_guided32 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-2.c f8_f_simd_guided32 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-2.c f8_f_simd_runtime libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-2.c f8_f_simd_runtime libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-2.c f8_pf_simd_guided32._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-2.c f8_pf_simd_runtime._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-2.c _Z18f8_pf_simd_runtimev._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-2.c _Z19f8_pf_simd_guided32v._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-4.c f8_taskloop_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-4.c _Z23f8_taskloop_simd_normalv._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-5.c f7_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-5.c f8_tpf_simd_guided32._omp_fn.1 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-5.c f8_tpf_simd_runtime._omp_fn.1 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-5.c _Z16f7_t_simd_normalv._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-5.c _Z19f8_tpf_simd_runtimev._omp_fn.1 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-5.c _Z20f8_tpf_simd_guided32v._omp_fn.1 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-8.c f25_simd_normal libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-8.c f25_simd_normal libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-8.c f26_simd_normal 
libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-8.c f26_simd_normal libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-8.c f27_simd_normal libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-8.c f27_simd_normal libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-8.c f28_f_simd_guided32 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-8.c f28_f_simd_guided32 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-8.c f28_f_simd_runtime libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-8.c f28_f_simd_runtime libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-8.c f28_pf_simd_guided32._omp_fn.0 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/for-8.c f28_pf_simd_runtime._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-8.c _Z19f28_pf_simd_runtimev._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/for-8.c _Z20f28_pf_simd_guided32v._omp_fn.0 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/master-combined-1.c main._omp_fn.9 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/master-combined-1.c main._omp_fn.9 libgomp/testsuite/libgomp.c++/../libgomp.c-c++-common/simd-1.c f2 libgomp/testsuite/libgomp.c/../libgomp.c-c++-common/simd-1.c f2 libgomp/testsuite/libgomp.c/pr70680-2.c f1._omp_fn.0 libgomp/testsuite/libgomp.c/pr70680-2.c f2._omp_fn.0 libgomp/testsuite/libgomp.c/pr70680-2.c f3._omp_fn.0 libgomp/testsuite/libgomp.c/pr70680-2.c f4._omp_fn.0 libgomp/testsuite/libgomp.c/simd-8.c foo libgomp/testsuite/libgomp.c/simd-9.c bar libgomp/testsuite/libgomp.c/simd-9.c foo 2020-09-25 Jakub Jelinek <jakub@redhat.com> gcc/ * omp-low.c (scan_omp_1_stmt): Don't call scan_omp_simd for collapse > 1 loops as simt doesn't support collapsed loops yet. * omp-expand.c (expand_omp_for_init_counts, expand_omp_for_init_vars): Small tweaks to function comment. (expand_omp_simd): Rewritten collapse > 1 support to only attempt to vectorize the innermost loop and emit set of outer loops around it. 
For non-composite simd with collapse > 1 without broken loop don't even try to compute number of iterations first. Add support for non-rectangular simd loops. (expand_omp_for): Don't sorry_at on non-rectangular simd loops. gcc/testsuite/ * gcc.dg/vect/vect-simd-17.c: New test. libgomp/ * testsuite/libgomp.c/loop-25.c: New test.
297 lines
7.8 KiB
C
297 lines
7.8 KiB
C
/* { dg-do run } */
/* { dg-additional-options "-O2 -fopenmp -fdump-tree-vect-details" } */
/* { dg-additional-options "-mavx" { target avx_runtime } } */

/* Loop iterators (i, j) and the value written by the last iteration (x).
   They live at file scope and are named in the lastprivate clauses of the
   OpenMP loops in main.  */
int x, i, j;

/* Lower bounds, upper bounds and steps for the run-time variants of the
   loops in main:
     for (i = a; i < b; i += c)
       for (j = d * i + e; j < g + i * f; j += h)
   Declared volatile so every use is a real load; presumably this keeps the
   compiler from folding them into constants.  */
volatile int a, b, c, d, e, f, g, h;

/* Visit counters: main sets an element to 1 before a test, each loop pass
   increments it, and the verification step resets it to 0.  */
int k[11][101];

extern void abort (void);

int
|
|
main ()
|
|
{
|
|
int niters, err = 0;
|
|
for (i = 1; i <= 10; i++)
|
|
for (j = 1; j <= 10 * i; j++)
|
|
{
|
|
k[i][j] = 1;
|
|
asm volatile ("" : : : "memory");
|
|
}
|
|
a = 1; b = 11; c = 1; d = 0; e = 1; f = 10; g = 1; h = 1;
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = 1; i <= 10; i++)
|
|
for (j = 1; j <= 10 * i; j++)
|
|
{
|
|
err |= (i < 1);
|
|
err |= (i > 10);
|
|
err |= (j < 1);
|
|
err |= (j > 10 * i);
|
|
err |= (k[i][j] != 1);
|
|
k[i][j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 11 || j != 101 || x != 10340 || niters != 550 || err)
|
|
abort ();
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = a; i < b; i += c)
|
|
for (j = d * i + e; j < g + i * f; j += h)
|
|
{
|
|
err |= (i < 1);
|
|
err |= (i > 10);
|
|
err |= (j < 1);
|
|
err |= (j > 10 * i);
|
|
err |= (k[i][j] != 2);
|
|
k[i][j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 11 || j != 101 || x != 10340 || niters != 550 || err)
|
|
abort ();
|
|
for (i = 1; i <= 10; i++)
|
|
for (j = 1; j <= 10 * i; j++)
|
|
if (k[i][j] == 3)
|
|
k[i][j] = 0;
|
|
else
|
|
abort ();
|
|
for (i = 0; i < 11; i++)
|
|
for (j = 0; j < 101; j++)
|
|
if (k[i][j] != 0)
|
|
abort ();
|
|
for (i = 0; i < 10; i++)
|
|
for (j = 0; j < 10 * i; j++)
|
|
{
|
|
k[i][j] = 1;
|
|
asm volatile ("" : : : "memory");
|
|
}
|
|
a = 0; b = 10; c = 1; d = 0; e = 0; f = 10; g = 0; h = 1;
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = 0; i < 10; i++)
|
|
for (j = 0; j < 10 * i; j++)
|
|
{
|
|
err |= (i < 0);
|
|
err |= (i >= 10);
|
|
err |= (j < 0);
|
|
err |= (j >= 10 * i);
|
|
err |= (k[i][j] != 1);
|
|
k[i][j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 10 || j != 90 || x != 9305 || niters != 450 || err)
|
|
abort ();
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = a; i < b; i += c)
|
|
for (j = d * i + e; j < g + i * f; j += h)
|
|
{
|
|
err |= (i < 0);
|
|
err |= (i >= 10);
|
|
err |= (j < 0);
|
|
err |= (j >= 10 * i);
|
|
err |= (k[i][j] != 2);
|
|
k[i][j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 10 || j != 90 || x != 9305 || niters != 450 || err)
|
|
abort ();
|
|
for (i = 0; i < 10; i++)
|
|
for (j = 0; j < 10 * i; j++)
|
|
if (k[i][j] == 3)
|
|
k[i][j] = 0;
|
|
else
|
|
abort ();
|
|
for (i = 0; i < 11; i++)
|
|
for (j = 0; j < 101; j++)
|
|
if (k[i][j] != 0)
|
|
abort ();
|
|
for (i = 4; i < 10; i++)
|
|
for (j = -9 + 2 * i; j < i; j++)
|
|
{
|
|
k[i][j + 1] = 1;
|
|
asm volatile ("" : : : "memory");
|
|
}
|
|
a = 4; b = 10; c = 1; d = 2; e = -9; f = 1; g = 0; h = 1;
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = 4; i < 10; i++)
|
|
for (j = -9 + 2 * i; j < i; j++)
|
|
{
|
|
err |= (i < 4);
|
|
err |= (i >= 10);
|
|
err |= (j < -9 + 2 * i);
|
|
err |= (j >= i);
|
|
err |= (k[i][j + 1] != 1);
|
|
k[i][j + 1]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (/*i != 10 || j != 9 || */x != 8199 || niters != 15 || err)
|
|
abort ();
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = a; i < b; i += c)
|
|
for (j = d * i + e; j < g + i * f; j += h)
|
|
{
|
|
err |= (i < 4);
|
|
err |= (i >= 10);
|
|
err |= (j < -9 + 2 * i);
|
|
err |= (j >= i);
|
|
err |= (k[i][j + 1] != 2);
|
|
k[i][j + 1]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (/*i != 10 || j != 9 || */x != 8199 || niters != 15 || err)
|
|
abort ();
|
|
for (i = 4; i < 10; i++)
|
|
for (j = -9 + 2 * i; j < i; j++)
|
|
if (k[i][j + 1] == 3)
|
|
k[i][j + 1] = 0;
|
|
else
|
|
abort ();
|
|
for (i = 0; i < 11; i++)
|
|
for (j = 0; j < 101; j++)
|
|
if (k[i][j] != 0)
|
|
abort ();
|
|
for (i = 1; i < 10; i += 2)
|
|
for (j = 1; j < i + 1; j++)
|
|
{
|
|
k[i][j] = 1;
|
|
asm volatile ("" : : : "memory");
|
|
}
|
|
a = 1; b = 10; c = 2; d = 0; e = 1; f = 1; g = 1; h = 1;
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = 1; i < 10; i += 2)
|
|
for (j = 1; j < i + 1; j++)
|
|
{
|
|
err |= (i < 1);
|
|
err |= (i >= 10);
|
|
err |= (j < 1);
|
|
err |= (j >= i + 1);
|
|
err |= (k[i][j] != 1);
|
|
k[i][j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 11 || j != 10 || x != 9225 || niters != 25 || err)
|
|
abort ();
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = a; i < b; i += c)
|
|
for (j = d * i + e; j < g + i * f; j += h)
|
|
{
|
|
err |= (i < 1);
|
|
err |= (i >= 10);
|
|
err |= (j < 1);
|
|
err |= (j >= i + 1);
|
|
err |= (k[i][j] != 2);
|
|
k[i][j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 11 || j != 10 || x != 9225 || niters != 25 || err)
|
|
abort ();
|
|
for (i = 1; i < 10; i += 2)
|
|
for (j = 1; j < i + 1; j++)
|
|
if (k[i][j] == 3)
|
|
k[i][j] = 0;
|
|
else
|
|
abort ();
|
|
for (i = 0; i < 11; i++)
|
|
for (j = 0; j < 101; j++)
|
|
if (k[i][j] != 0)
|
|
abort ();
|
|
for (j = -11; j >= -41; j -= 15)
|
|
{
|
|
k[0][-j] = 1;
|
|
asm volatile ("" : : : "memory");
|
|
}
|
|
a = 4; b = 8; c = 12; d = -8; e = -9; f = -3; g = 6; h = 15;
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = 4; i < 8; i += 12)
|
|
for (j = -8 * i - 9; j < i * -3 + 6; j += 15)
|
|
{
|
|
err |= (i != 4);
|
|
err |= (j < -41);
|
|
err |= (j > -11);
|
|
err |= (k[0][-j] != 1);
|
|
k[0][-j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 16 || j != 4 || x != 5109 || niters != 3 || err)
|
|
abort ();
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = a; i < b; i += c)
|
|
for (j = d * i + e; j < g + i * f; j += h)
|
|
{
|
|
err |= (i != 4);
|
|
err |= (j < -41);
|
|
err |= (j > -11);
|
|
err |= (k[0][-j] != 2);
|
|
k[0][-j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (i != 16 || j != 4 || x != 5109 || niters != 3 || err)
|
|
abort ();
|
|
for (j = -11; j >= -41; j -= 15)
|
|
if (k[0][-j] == 3)
|
|
k[0][-j] = 0;
|
|
else
|
|
abort ();
|
|
for (j = -11; j >= -41; j--)
|
|
if (k[0][-j] != 0)
|
|
abort ();
|
|
for (j = -34; j <= -7; j++)
|
|
{
|
|
k[0][-j] = 1;
|
|
asm volatile ("" : : : "memory");
|
|
}
|
|
a = -13; b = 7; c = 12; d = 3; e = 5; f = 0; g = -6; h = 1;
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = -13; i < 7; i += 12)
|
|
for (j = 3 * i + 5; j < -6; j++)
|
|
{
|
|
err |= (i != -13);
|
|
err |= (j < -34);
|
|
err |= (j > -7);
|
|
err |= (k[0][-j] != 1);
|
|
k[0][-j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (/*i != 11 || j != 2 || */x != -12295 || niters != 28 || err)
|
|
abort ();
|
|
niters = 0; i = -100; j = -100; x = -100;
|
|
#pragma omp parallel for simd collapse(2) lastprivate (i, j, x) reduction(+:niters) reduction(|:err)
|
|
for (i = a; i < b; i += c)
|
|
for (j = d * i + e; j < g + i * f; j += h)
|
|
{
|
|
err |= (i != -13);
|
|
err |= (j < -34);
|
|
err |= (j > -7);
|
|
err |= (k[0][-j] != 2);
|
|
k[0][-j]++;
|
|
x = i * 1024 + (j & 1023);
|
|
niters++;
|
|
}
|
|
if (/*i != 11 || j != 2 || */x != -12295 || niters != 28 || err)
|
|
abort ();
|
|
for (j = -34; j <= -7; j++)
|
|
if (k[0][-j] == 3)
|
|
k[0][-j] = 0;
|
|
else
|
|
abort ();
|
|
return 0;
|
|
}
|