1546 lines
54 KiB
C++
1546 lines
54 KiB
C++
/* Decompose OpenACC 'kernels' constructs into parts, a sequence of compute
|
|
constructs
|
|
|
|
Copyright (C) 2020-2021 Free Software Foundation, Inc.
|
|
|
|
This file is part of GCC.
|
|
|
|
GCC is free software; you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free
|
|
Software Foundation; either version 3, or (at your option) any later
|
|
version.
|
|
|
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with GCC; see the file COPYING3. If not see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include "config.h"
|
|
#include "system.h"
|
|
#include "coretypes.h"
|
|
#include "backend.h"
|
|
#include "target.h"
|
|
#include "tree.h"
|
|
#include "langhooks.h"
|
|
#include "gimple.h"
|
|
#include "tree-pass.h"
|
|
#include "cgraph.h"
|
|
#include "fold-const.h"
|
|
#include "gimplify.h"
|
|
#include "gimple-iterator.h"
|
|
#include "gimple-walk.h"
|
|
#include "gomp-constants.h"
|
|
#include "omp-general.h"
|
|
#include "diagnostic-core.h"
|
|
|
|
|
|
/* This preprocessing pass is run immediately before lower_omp. It decomposes
|
|
OpenACC 'kernels' constructs into parts, a sequence of compute constructs.
|
|
|
|
The translation is as follows:
|
|
- The entire 'kernels' region is turned into a 'data' region with clauses
|
|
taken from the 'kernels' region. New 'create' clauses are added for all
|
|
variables declared at the top level in the kernels region.
|
|
- Any loop nests annotated with an OpenACC 'loop' directive are wrapped in
|
|
a new compute construct.
|
|
- 'loop' directives without an explicit 'independent' or 'seq' clause
|
|
get an 'auto' clause added; other clauses are preserved on the loop
|
|
or moved to the new surrounding compute construct, as applicable.
|
|
- Any sequences of other code (non-loops, non-OpenACC 'loop's) are wrapped
|
|
in new "gang-single" compute construct: 'worker'/'vector' parallelism is
|
|
preserved, but 'num_gangs (1)' is enforced.
|
|
- Both points above only apply at the topmost level in the region, that
|
|
is, the transformation does not introduce new compute constructs inside
|
|
nested statement bodies. In particular, this means that a
|
|
gang-parallelizable loop inside an 'if' statement is made "gang-single".
|
|
- In order to make the host wait only once for the whole region instead
|
|
of once per device kernel launch, the new compute constructs are
|
|
annotated 'async'. Unless the original 'kernels' construct already was
|
|
marked 'async', the entire region ends with a 'wait' directive. If the
|
|
original 'kernels' construct was marked 'async', the synthesized 'async'
|
|
clauses use the original 'kernels' construct's 'async' argument
|
|
(possibly implicit).
|
|
*/
|
|
|
|
|
|
/*TODO Things are conceptually wrong here: 'loop' clauses may be hidden behind
|
|
'device_type', so we have to defer a lot of processing until we're in the
|
|
offloading compilation. "Fortunately", GCC doesn't support the OpenACC
|
|
'device_type' clause yet, so we get away that. */
|
|
|
|
|
|
/* Helper function for decompose_kernels_region_body. If STMT contains a
|
|
"top-level" OMP_FOR statement, returns a pointer to that statement;
|
|
returns NULL otherwise.
|
|
|
|
A "top-level" OMP_FOR statement is one that is possibly accompanied by
|
|
small snippets of setup code. Specifically, this function accepts an
|
|
OMP_FOR possibly wrapped in a singleton bind and a singleton try
|
|
statement to allow for a local loop variable, but not an OMP_FOR
|
|
statement nested in any other constructs. Alternatively, it accepts a
|
|
non-singleton bind containing only assignments and then an OMP_FOR
|
|
statement at the very end. The former style can be generated by the C
|
|
frontend, the latter by the Fortran frontend. */
|
|
|
|
static gimple *
|
|
top_level_omp_for_in_stmt (gimple *stmt)
|
|
{
|
|
if (gimple_code (stmt) == GIMPLE_OMP_FOR)
|
|
return stmt;
|
|
|
|
if (gimple_code (stmt) == GIMPLE_BIND)
|
|
{
|
|
gimple_seq body = gimple_bind_body (as_a <gbind *> (stmt));
|
|
if (gimple_seq_singleton_p (body))
|
|
{
|
|
/* Accept an OMP_FOR statement, or a try statement containing only
|
|
a single OMP_FOR. */
|
|
gimple *maybe_for_or_try = gimple_seq_first_stmt (body);
|
|
if (gimple_code (maybe_for_or_try) == GIMPLE_OMP_FOR)
|
|
return maybe_for_or_try;
|
|
else if (gimple_code (maybe_for_or_try) == GIMPLE_TRY)
|
|
{
|
|
gimple_seq try_body = gimple_try_eval (maybe_for_or_try);
|
|
if (!gimple_seq_singleton_p (try_body))
|
|
return NULL;
|
|
gimple *maybe_omp_for_stmt = gimple_seq_first_stmt (try_body);
|
|
if (gimple_code (maybe_omp_for_stmt) == GIMPLE_OMP_FOR)
|
|
return maybe_omp_for_stmt;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
gimple_stmt_iterator gsi;
|
|
/* Accept only a block of optional assignments followed by an
|
|
OMP_FOR at the end. No other kinds of statements allowed. */
|
|
for (gsi = gsi_start (body); !gsi_end_p (gsi); gsi_next (&gsi))
|
|
{
|
|
gimple *body_stmt = gsi_stmt (gsi);
|
|
if (gimple_code (body_stmt) == GIMPLE_ASSIGN)
|
|
continue;
|
|
else if (gimple_code (body_stmt) == GIMPLE_OMP_FOR
|
|
&& gsi_one_before_end_p (gsi))
|
|
return body_stmt;
|
|
else
|
|
return NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Helper for adjust_region_code: evaluate the statement at GSI_P. */
|
|
|
|
static tree
|
|
adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
|
|
bool *handled_ops_p,
|
|
struct walk_stmt_info *wi)
|
|
{
|
|
int *region_code = (int *) wi->info;
|
|
|
|
gimple *stmt = gsi_stmt (*gsi_p);
|
|
switch (gimple_code (stmt))
|
|
{
|
|
case GIMPLE_OMP_FOR:
|
|
{
|
|
tree clauses = gimple_omp_for_clauses (stmt);
|
|
if (omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT))
|
|
{
|
|
/* Explicit 'independent' clause. */
|
|
/* Keep going; recurse into loop body. */
|
|
break;
|
|
}
|
|
else if (omp_find_clause (clauses, OMP_CLAUSE_SEQ))
|
|
{
|
|
/* Explicit 'seq' clause. */
|
|
/* We'll "parallelize" if at some level a loop construct has been
|
|
marked up by the user as unparallelizable ('seq' clause; we'll
|
|
respect that in the later processing). Given that the user has
|
|
explicitly marked it up, this loop construct cannot be
|
|
performance-critical, and in this case it's also fine to
|
|
"parallelize" instead of "gang-single", because any outer or
|
|
inner loops may still exploit the available parallelism. */
|
|
/* Keep going; recurse into loop body. */
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
/* Explicit or implicit 'auto' clause. */
|
|
/* The user would like this loop analyzed ('auto' clause) and
|
|
typically parallelized, but we don't have available yet the
|
|
compiler logic to analyze this, so can't parallelize it here, so
|
|
we'd very likely be running into a performance problem if we
|
|
were to execute this unparallelized, thus forward the whole loop
|
|
nest to 'parloops'. */
|
|
*region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
|
|
/* Terminate: final decision for this region. */
|
|
*handled_ops_p = true;
|
|
return integer_zero_node;
|
|
}
|
|
gcc_unreachable ();
|
|
}
|
|
|
|
case GIMPLE_COND:
|
|
case GIMPLE_GOTO:
|
|
case GIMPLE_SWITCH:
|
|
case GIMPLE_ASM:
|
|
case GIMPLE_TRANSACTION:
|
|
case GIMPLE_RETURN:
|
|
/* Statement that might constitute some looping/control flow pattern. */
|
|
/* The user would like this code analyzed (implicit inside a 'kernels'
|
|
region) and typically parallelized, but we don't have available yet
|
|
the compiler logic to analyze this, so can't parallelize it here, so
|
|
we'd very likely be running into a performance problem if we were to
|
|
execute this unparallelized, thus forward the whole thing to
|
|
'parloops'. */
|
|
*region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
|
|
/* Terminate: final decision for this region. */
|
|
*handled_ops_p = true;
|
|
return integer_zero_node;
|
|
|
|
default:
|
|
/* Keep going. */
|
|
break;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Adjust the REGION_CODE for the region in GS. */
|
|
|
|
static void
|
|
adjust_region_code (gimple_seq gs, int *region_code)
|
|
{
|
|
struct walk_stmt_info wi;
|
|
memset (&wi, 0, sizeof (wi));
|
|
wi.info = region_code;
|
|
walk_gimple_seq (gs, adjust_region_code_walk_stmt_fn, NULL, &wi);
|
|
}
|
|
|
|
/* Helper function for make_loops_gang_single for walking the tree. If the
|
|
statement indicated by GSI_P is an OpenACC for loop with a gang clause,
|
|
issue a warning and remove the clause. */
|
|
|
|
static tree
|
|
visit_loops_in_gang_single_region (gimple_stmt_iterator *gsi_p,
|
|
bool *handled_ops_p,
|
|
struct walk_stmt_info *)
|
|
{
|
|
*handled_ops_p = false;
|
|
|
|
gimple *stmt = gsi_stmt (*gsi_p);
|
|
switch (gimple_code (stmt))
|
|
{
|
|
case GIMPLE_OMP_FOR:
|
|
/*TODO Given the current 'adjust_region_code' algorithm, this is
|
|
actually... */
|
|
gcc_unreachable ();
|
|
|
|
{
|
|
tree clauses = gimple_omp_for_clauses (stmt);
|
|
tree prev_clause = NULL;
|
|
for (tree clause = clauses; clause; clause = OMP_CLAUSE_CHAIN (clause))
|
|
{
|
|
if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_GANG)
|
|
{
|
|
/* It makes no sense to have a 'gang' clause in a "gang-single"
|
|
region, so warn and remove it. */
|
|
warning_at (gimple_location (stmt), 0,
|
|
"conditionally executed loop in %<kernels%> region"
|
|
" will be executed by a single gang;"
|
|
" ignoring %<gang%> clause");
|
|
if (prev_clause != NULL)
|
|
OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause);
|
|
else
|
|
clauses = OMP_CLAUSE_CHAIN (clause);
|
|
|
|
break;
|
|
}
|
|
prev_clause = clause;
|
|
}
|
|
gimple_omp_for_set_clauses (stmt, clauses);
|
|
}
|
|
/* No need to recurse into nested statements; no loop nested inside
|
|
this loop can be gang-partitioned. */
|
|
sorry ("%<gang%> loop in %<gang-single%> region");
|
|
*handled_ops_p = true;
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Visit all nested OpenACC loops in the sequence indicated by GS. This
|
|
statement is expected to be inside a gang-single region. Issue a warning
|
|
for any loops inside it that have gang clauses and remove the clauses. */
|
|
|
|
static void
|
|
make_loops_gang_single (gimple_seq gs)
|
|
{
|
|
struct walk_stmt_info wi;
|
|
memset (&wi, 0, sizeof (wi));
|
|
walk_gimple_seq (gs, visit_loops_in_gang_single_region, NULL, &wi);
|
|
}
|
|
|
|
/* Construct a "gang-single" compute construct at LOC containing the STMTS.
|
|
Annotate with CLAUSES, which must not contain a 'num_gangs' clause, and an
|
|
additional 'num_gangs (1)' clause to force "gang-single" execution. */
|
|
|
|
static gimple *
|
|
make_region_seq (location_t loc, gimple_seq stmts,
|
|
tree num_gangs_clause,
|
|
tree num_workers_clause,
|
|
tree vector_length_clause,
|
|
tree clauses)
|
|
{
|
|
/* This correctly unshares the entire clause chain rooted here. */
|
|
clauses = unshare_expr (clauses);
|
|
|
|
dump_user_location_t loc_stmts_first = gimple_seq_first (stmts);
|
|
|
|
/* Figure out the region code for this region. */
|
|
/* Optimistic default: assume "setup code", no looping; thus not
|
|
performance-critical. */
|
|
int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
|
|
adjust_region_code (stmts, ®ion_code);
|
|
|
|
if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
|
|
{
|
|
if (dump_enabled_p ())
|
|
/*TODO MSG_MISSED_OPTIMIZATION? */
|
|
dump_printf_loc (MSG_NOTE, loc_stmts_first,
|
|
"beginning %<gang-single%> part"
|
|
" in OpenACC %<kernels%> region\n");
|
|
|
|
/* Synthesize a 'num_gangs (1)' clause. */
|
|
tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
|
|
OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
|
|
OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
|
|
clauses = gang_single_clause;
|
|
|
|
/* Remove and issue warnings about gang clauses on any OpenACC
|
|
loops nested inside this sequentially executed statement. */
|
|
make_loops_gang_single (stmts);
|
|
}
|
|
else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
|
|
{
|
|
if (dump_enabled_p ())
|
|
dump_printf_loc (MSG_NOTE, loc_stmts_first,
|
|
"beginning %<parloops%> part"
|
|
" in OpenACC %<kernels%> region\n");
|
|
|
|
/* As we're transforming a 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another
|
|
'GF_OMP_TARGET_KIND_OACC_KERNELS', this isn't doing any of the clauses
|
|
mangling that 'make_region_loop_nest' is doing. */
|
|
/* Re-assemble the clauses stripped off earlier. */
|
|
if (num_gangs_clause != NULL)
|
|
{
|
|
tree c = unshare_expr (num_gangs_clause);
|
|
OMP_CLAUSE_CHAIN (c) = clauses;
|
|
clauses = c;
|
|
}
|
|
if (num_workers_clause != NULL)
|
|
{
|
|
tree c = unshare_expr (num_workers_clause);
|
|
OMP_CLAUSE_CHAIN (c) = clauses;
|
|
clauses = c;
|
|
}
|
|
if (vector_length_clause != NULL)
|
|
{
|
|
tree c = unshare_expr (vector_length_clause);
|
|
OMP_CLAUSE_CHAIN (c) = clauses;
|
|
clauses = c;
|
|
}
|
|
}
|
|
else
|
|
gcc_unreachable ();
|
|
|
|
/* Build the gang-single region. */
|
|
gimple *single_region = gimple_build_omp_target (NULL, region_code, clauses);
|
|
gimple_set_location (single_region, loc);
|
|
gbind *single_body = gimple_build_bind (NULL, stmts, make_node (BLOCK));
|
|
gimple_omp_set_body (single_region, single_body);
|
|
|
|
return single_region;
|
|
}
|
|
|
|
/* Helper function for make_region_loop_nest. Adds a 'num_gangs'
|
|
('num_workers', 'vector_length') clause to the given CLAUSES, either the one
|
|
from the parent compute construct (PARENT_CLAUSE) or a new one based on the
|
|
loop's own LOOP_CLAUSE ('gang (num: N)' or similar for 'worker' or 'vector'
|
|
clauses) with the given CLAUSE_CODE. Does nothing if neither PARENT_CLAUSE
|
|
nor LOOP_CLAUSE exist. Returns the new clauses. */
|
|
|
|
static tree
|
|
add_parent_or_loop_num_clause (tree parent_clause, tree loop_clause,
|
|
omp_clause_code clause_code, tree clauses)
|
|
{
|
|
if (parent_clause != NULL)
|
|
{
|
|
tree num_clause = unshare_expr (parent_clause);
|
|
OMP_CLAUSE_CHAIN (num_clause) = clauses;
|
|
clauses = num_clause;
|
|
}
|
|
else if (loop_clause != NULL)
|
|
{
|
|
/* The kernels region does not have a 'num_gangs' clause, but the loop
|
|
itself had a 'gang (num: N)' clause. Honor it by adding a
|
|
'num_gangs (N)' clause on the compute construct. */
|
|
tree num = OMP_CLAUSE_OPERAND (loop_clause, 0);
|
|
tree new_num_clause
|
|
= build_omp_clause (OMP_CLAUSE_LOCATION (loop_clause), clause_code);
|
|
OMP_CLAUSE_OPERAND (new_num_clause, 0) = num;
|
|
OMP_CLAUSE_CHAIN (new_num_clause) = clauses;
|
|
clauses = new_num_clause;
|
|
}
|
|
return clauses;
|
|
}
|
|
|
|
/* Helper for make_region_loop_nest, looking for 'worker (num: N)' or 'vector
|
|
(length: N)' clauses in nested loops. Removes the argument, transferring it
|
|
to the enclosing compute construct (via WI->INFO). If arguments within the
|
|
same loop nest conflict, emits a warning.
|
|
|
|
This function also decides whether to add an 'auto' clause on each of these
|
|
nested loops. */
|
|
|
|
struct adjust_nested_loop_clauses_wi_info
|
|
{
|
|
tree *loop_gang_clause_ptr;
|
|
tree *loop_worker_clause_ptr;
|
|
tree *loop_vector_clause_ptr;
|
|
};
|
|
|
|
static tree
|
|
adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *,
|
|
struct walk_stmt_info *wi)
|
|
{
|
|
struct adjust_nested_loop_clauses_wi_info *wi_info
|
|
= (struct adjust_nested_loop_clauses_wi_info *) wi->info;
|
|
gimple *stmt = gsi_stmt (*gsi_p);
|
|
|
|
if (gimple_code (stmt) == GIMPLE_OMP_FOR)
|
|
{
|
|
bool add_auto_clause = true;
|
|
tree loop_clauses = gimple_omp_for_clauses (stmt);
|
|
tree loop_clause = loop_clauses;
|
|
for (; loop_clause; loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
|
|
{
|
|
tree *outer_clause_ptr = NULL;
|
|
switch (OMP_CLAUSE_CODE (loop_clause))
|
|
{
|
|
case OMP_CLAUSE_GANG:
|
|
outer_clause_ptr = wi_info->loop_gang_clause_ptr;
|
|
break;
|
|
case OMP_CLAUSE_WORKER:
|
|
outer_clause_ptr = wi_info->loop_worker_clause_ptr;
|
|
break;
|
|
case OMP_CLAUSE_VECTOR:
|
|
outer_clause_ptr = wi_info->loop_vector_clause_ptr;
|
|
break;
|
|
case OMP_CLAUSE_SEQ:
|
|
case OMP_CLAUSE_INDEPENDENT:
|
|
case OMP_CLAUSE_AUTO:
|
|
add_auto_clause = false;
|
|
default:
|
|
break;
|
|
}
|
|
if (outer_clause_ptr != NULL)
|
|
{
|
|
if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL
|
|
&& *outer_clause_ptr == NULL)
|
|
{
|
|
/* Transfer the clause to the enclosing compute construct and
|
|
remove the numerical argument from the 'loop'. */
|
|
*outer_clause_ptr = unshare_expr (loop_clause);
|
|
OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
|
|
}
|
|
else if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL &&
|
|
OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0) != NULL)
|
|
{
|
|
/* See if both of these are the same constant. If they
|
|
aren't, emit a warning. */
|
|
tree old_op = OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0);
|
|
tree new_op = OMP_CLAUSE_OPERAND (loop_clause, 0);
|
|
if (!(cst_and_fits_in_hwi (old_op) &&
|
|
cst_and_fits_in_hwi (new_op) &&
|
|
int_cst_value (old_op) == int_cst_value (new_op)))
|
|
{
|
|
const char *clause_name
|
|
= omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
|
|
error_at (gimple_location (stmt),
|
|
"cannot honor conflicting %qs clause",
|
|
clause_name);
|
|
inform (OMP_CLAUSE_LOCATION (*outer_clause_ptr),
|
|
"location of the previous clause"
|
|
" in the same loop nest");
|
|
}
|
|
OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
|
|
}
|
|
}
|
|
}
|
|
if (add_auto_clause)
|
|
{
|
|
tree auto_clause
|
|
= build_omp_clause (gimple_location (stmt), OMP_CLAUSE_AUTO);
|
|
OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
|
|
gimple_omp_for_set_clauses (stmt, auto_clause);
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Helper for make_region_loop_nest. Transform OpenACC 'kernels'/'loop'
|
|
construct clauses into OpenACC 'parallel'/'loop' construct ones. */
|
|
|
|
static tree
|
|
transform_kernels_loop_clauses (gimple *omp_for,
|
|
tree num_gangs_clause,
|
|
tree num_workers_clause,
|
|
tree vector_length_clause,
|
|
tree clauses)
|
|
{
|
|
/* If this loop in a kernels region does not have an explicit 'seq',
|
|
'independent', or 'auto' clause, we must give it an explicit 'auto'
|
|
clause.
|
|
We also check for 'gang (num: N)' clauses. These must not appear in
|
|
kernels regions that have their own 'num_gangs' clause. Otherwise, they
|
|
must be converted and put on the region; similarly for 'worker' and
|
|
'vector' clauses. */
|
|
bool add_auto_clause = true;
|
|
tree loop_gang_clause = NULL, loop_worker_clause = NULL,
|
|
loop_vector_clause = NULL;
|
|
tree loop_clauses = gimple_omp_for_clauses (omp_for);
|
|
for (tree loop_clause = loop_clauses;
|
|
loop_clause;
|
|
loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
|
|
{
|
|
bool found_num_clause = false;
|
|
tree *clause_ptr, clause_to_check;
|
|
switch (OMP_CLAUSE_CODE (loop_clause))
|
|
{
|
|
case OMP_CLAUSE_GANG:
|
|
found_num_clause = true;
|
|
clause_ptr = &loop_gang_clause;
|
|
clause_to_check = num_gangs_clause;
|
|
break;
|
|
case OMP_CLAUSE_WORKER:
|
|
found_num_clause = true;
|
|
clause_ptr = &loop_worker_clause;
|
|
clause_to_check = num_workers_clause;
|
|
break;
|
|
case OMP_CLAUSE_VECTOR:
|
|
found_num_clause = true;
|
|
clause_ptr = &loop_vector_clause;
|
|
clause_to_check = vector_length_clause;
|
|
break;
|
|
case OMP_CLAUSE_INDEPENDENT:
|
|
case OMP_CLAUSE_SEQ:
|
|
case OMP_CLAUSE_AUTO:
|
|
add_auto_clause = false;
|
|
default:
|
|
break;
|
|
}
|
|
if (found_num_clause && OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL)
|
|
{
|
|
if (clause_to_check)
|
|
{
|
|
const char *clause_name
|
|
= omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
|
|
const char *parent_clause_name
|
|
= omp_clause_code_name[OMP_CLAUSE_CODE (clause_to_check)];
|
|
error_at (OMP_CLAUSE_LOCATION (loop_clause),
|
|
"argument not permitted on %qs clause"
|
|
" in OpenACC %<kernels%> region with a %qs clause",
|
|
clause_name, parent_clause_name);
|
|
inform (OMP_CLAUSE_LOCATION (clause_to_check),
|
|
"location of OpenACC %<kernels%>");
|
|
}
|
|
/* Copy the 'gang (N)'/'worker (N)'/'vector (N)' clause to the
|
|
enclosing compute construct. */
|
|
*clause_ptr = unshare_expr (loop_clause);
|
|
OMP_CLAUSE_CHAIN (*clause_ptr) = NULL;
|
|
/* Leave a 'gang'/'worker'/'vector' clause on the 'loop', but without
|
|
argument. */
|
|
OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
|
|
}
|
|
}
|
|
if (add_auto_clause)
|
|
{
|
|
tree auto_clause = build_omp_clause (gimple_location (omp_for),
|
|
OMP_CLAUSE_AUTO);
|
|
OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
|
|
loop_clauses = auto_clause;
|
|
}
|
|
gimple_omp_for_set_clauses (omp_for, loop_clauses);
|
|
/* We must also recurse into the loop; it might contain nested loops having
|
|
their own 'worker (num: W)' or 'vector (length: V)' clauses. Turn these
|
|
into 'worker'/'vector' clauses on the compute construct. */
|
|
struct walk_stmt_info wi;
|
|
memset (&wi, 0, sizeof (wi));
|
|
struct adjust_nested_loop_clauses_wi_info wi_info;
|
|
wi_info.loop_gang_clause_ptr = &loop_gang_clause;
|
|
wi_info.loop_worker_clause_ptr = &loop_worker_clause;
|
|
wi_info.loop_vector_clause_ptr = &loop_vector_clause;
|
|
wi.info = &wi_info;
|
|
gimple *body = gimple_omp_body (omp_for);
|
|
walk_gimple_seq (body, adjust_nested_loop_clauses, NULL, &wi);
|
|
/* Check if there were conflicting numbers of workers or vector length. */
|
|
if (loop_gang_clause != NULL &&
|
|
OMP_CLAUSE_OPERAND (loop_gang_clause, 0) == NULL)
|
|
loop_gang_clause = NULL;
|
|
if (loop_worker_clause != NULL &&
|
|
OMP_CLAUSE_OPERAND (loop_worker_clause, 0) == NULL)
|
|
loop_worker_clause = NULL;
|
|
if (loop_vector_clause != NULL &&
|
|
OMP_CLAUSE_OPERAND (loop_vector_clause, 0) == NULL)
|
|
vector_length_clause = NULL;
|
|
|
|
/* If the kernels region had 'num_gangs', 'num_worker', 'vector_length'
|
|
clauses, add these to this new compute construct. */
|
|
clauses
|
|
= add_parent_or_loop_num_clause (num_gangs_clause, loop_gang_clause,
|
|
OMP_CLAUSE_NUM_GANGS, clauses);
|
|
clauses
|
|
= add_parent_or_loop_num_clause (num_workers_clause, loop_worker_clause,
|
|
OMP_CLAUSE_NUM_WORKERS, clauses);
|
|
clauses
|
|
= add_parent_or_loop_num_clause (vector_length_clause, loop_vector_clause,
|
|
OMP_CLAUSE_VECTOR_LENGTH, clauses);
|
|
|
|
return clauses;
|
|
}
|
|
|
|
/* Construct a possibly gang-parallel compute construct containing the STMT,
|
|
which must be identical to, or a bind containing, the loop OMP_FOR.
|
|
|
|
The NUM_GANGS_CLAUSE, NUM_WORKERS_CLAUSE, and VECTOR_LENGTH_CLAUSE are
|
|
optional clauses from the original kernels region and must not be contained
|
|
in the other CLAUSES. The newly created compute construct is annotated with
|
|
the optional NUM_GANGS_CLAUSE as well as the other CLAUSES. If there is no
|
|
NUM_GANGS_CLAUSE but the loop has a 'gang (num: N)' clause, that is
|
|
converted to a 'num_gangs (N)' clause on the new compute construct, and
|
|
similarly for 'worker' and 'vector' clauses.
|
|
|
|
The outermost loop gets an 'auto' clause unless there already is an
|
|
'seq'/'independent'/'auto' clause. Nested loops inside OMP_FOR are treated
|
|
similarly by the adjust_nested_loop_clauses function. */
|
|
|
|
static gimple *
|
|
make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
|
|
tree num_gangs_clause,
|
|
tree num_workers_clause,
|
|
tree vector_length_clause,
|
|
tree clauses)
|
|
{
|
|
/* This correctly unshares the entire clause chain rooted here. */
|
|
clauses = unshare_expr (clauses);
|
|
|
|
/* Figure out the region code for this region. */
|
|
/* Optimistic default: assume that the loop nest is parallelizable
|
|
(essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause,
|
|
and no un-annotated loops). */
|
|
int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
|
|
adjust_region_code (stmts, ®ion_code);
|
|
|
|
if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
|
|
{
|
|
if (dump_enabled_p ())
|
|
/* This is not MSG_OPTIMIZED_LOCATIONS, as we're just doing what the
|
|
user asked us to. */
|
|
dump_printf_loc (MSG_NOTE, omp_for,
|
|
"parallelized loop nest"
|
|
" in OpenACC %<kernels%> region\n");
|
|
|
|
clauses = transform_kernels_loop_clauses (omp_for,
|
|
num_gangs_clause,
|
|
num_workers_clause,
|
|
vector_length_clause,
|
|
clauses);
|
|
}
|
|
else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
|
|
{
|
|
if (dump_enabled_p ())
|
|
dump_printf_loc (MSG_NOTE, omp_for,
|
|
"forwarded loop nest"
|
|
" in OpenACC %<kernels%> region"
|
|
" to %<parloops%> for analysis\n");
|
|
|
|
/* We're transforming one 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another
|
|
'GF_OMP_TARGET_KIND_OACC_KERNELS', so don't have to
|
|
'transform_kernels_loop_clauses'. */
|
|
/* Re-assemble the clauses stripped off earlier. */
|
|
clauses
|
|
= add_parent_or_loop_num_clause (num_gangs_clause, NULL,
|
|
OMP_CLAUSE_NUM_GANGS, clauses);
|
|
clauses
|
|
= add_parent_or_loop_num_clause (num_workers_clause, NULL,
|
|
OMP_CLAUSE_NUM_WORKERS, clauses);
|
|
clauses
|
|
= add_parent_or_loop_num_clause (vector_length_clause, NULL,
|
|
OMP_CLAUSE_VECTOR_LENGTH, clauses);
|
|
}
|
|
else
|
|
gcc_unreachable ();
|
|
|
|
gimple *parallel_body_bind
|
|
= gimple_build_bind (NULL, stmts, make_node (BLOCK));
|
|
gimple *parallel_region
|
|
= gimple_build_omp_target (parallel_body_bind, region_code, clauses);
|
|
gimple_set_location (parallel_region, gimple_location (omp_for));
|
|
|
|
return parallel_region;
|
|
}
|
|
|
|
/* Eliminate any binds directly inside BIND by adding their statements to
|
|
BIND (i.e., modifying it in place), excluding binds that hold only an
|
|
OMP_FOR loop and associated setup/cleanup code. Recurse into binds but
|
|
not other statements. Return a chain of the local variables of eliminated
|
|
binds, i.e., the local variables found in nested binds. If
|
|
INCLUDE_TOPLEVEL_VARS is true, this also includes the variables belonging
|
|
to BIND itself. */
|
|
|
|
static tree
|
|
flatten_binds (gbind *bind, bool include_toplevel_vars = false)
|
|
{
|
|
tree vars = NULL, last_var = NULL;
|
|
|
|
if (include_toplevel_vars)
|
|
{
|
|
vars = gimple_bind_vars (bind);
|
|
last_var = vars;
|
|
}
|
|
|
|
gimple_seq new_body = NULL;
|
|
gimple_seq body_sequence = gimple_bind_body (bind);
|
|
gimple_stmt_iterator gsi, gsi_n;
|
|
for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n)
|
|
{
|
|
/* Advance the iterator here because otherwise it would be invalidated
|
|
by moving statements below. */
|
|
gsi_n = gsi;
|
|
gsi_next (&gsi_n);
|
|
|
|
gimple *stmt = gsi_stmt (gsi);
|
|
/* Flatten bind statements, except the ones that contain only an
|
|
OpenACC for loop. */
|
|
if (gimple_code (stmt) == GIMPLE_BIND
|
|
&& !top_level_omp_for_in_stmt (stmt))
|
|
{
|
|
gbind *inner_bind = as_a <gbind *> (stmt);
|
|
/* Flatten recursively, and collect all variables. */
|
|
tree inner_vars = flatten_binds (inner_bind, true);
|
|
gimple_seq inner_sequence = gimple_bind_body (inner_bind);
|
|
if (flag_checking)
|
|
{
|
|
for (gimple_stmt_iterator inner_gsi = gsi_start (inner_sequence);
|
|
!gsi_end_p (inner_gsi);
|
|
gsi_next (&inner_gsi))
|
|
{
|
|
gimple *inner_stmt = gsi_stmt (inner_gsi);
|
|
gcc_assert (gimple_code (inner_stmt) != GIMPLE_BIND
|
|
|| top_level_omp_for_in_stmt (inner_stmt));
|
|
}
|
|
}
|
|
gimple_seq_add_seq (&new_body, inner_sequence);
|
|
/* Find the last variable; we will append others to it. */
|
|
while (last_var != NULL && TREE_CHAIN (last_var) != NULL)
|
|
last_var = TREE_CHAIN (last_var);
|
|
if (last_var != NULL)
|
|
{
|
|
TREE_CHAIN (last_var) = inner_vars;
|
|
last_var = inner_vars;
|
|
}
|
|
else
|
|
{
|
|
vars = inner_vars;
|
|
last_var = vars;
|
|
}
|
|
}
|
|
else
|
|
gimple_seq_add_stmt (&new_body, stmt);
|
|
}
|
|
|
|
/* Put the possibly transformed body back into the bind. */
|
|
gimple_bind_set_body (bind, new_body);
|
|
return vars;
|
|
}
|
|
|
|
/* Helper function for places where we construct data regions. Wraps the BODY
|
|
inside a try-finally construct at LOC that calls __builtin_GOACC_data_end
|
|
in its cleanup block. Returns this try statement. */
|
|
|
|
static gimple *
|
|
make_data_region_try_statement (location_t loc, gimple *body)
|
|
{
|
|
tree data_end_fn = builtin_decl_explicit (BUILT_IN_GOACC_DATA_END);
|
|
gimple *call = gimple_build_call (data_end_fn, 0);
|
|
gimple_seq cleanup = NULL;
|
|
gimple_seq_add_stmt (&cleanup, call);
|
|
gimple *try_stmt = gimple_build_try (body, cleanup, GIMPLE_TRY_FINALLY);
|
|
gimple_set_location (body, loc);
|
|
return try_stmt;
|
|
}
|
|
|
|
/* If INNER_BIND_VARS holds variables, build an OpenACC data region with
|
|
location LOC containing BODY and having 'create (var)' clauses for each
|
|
variable. If INNER_CLEANUP is present, add a try-finally statement with
|
|
this cleanup code in the finally block. Return the new data region, or
|
|
the original BODY if no data region was needed. */
|
|
|
|
static gimple *
|
|
maybe_build_inner_data_region (location_t loc, gimple *body,
|
|
tree inner_bind_vars, gimple *inner_cleanup)
|
|
{
|
|
/* Is this an instantiation of a template? (In this case, we don't care what
|
|
the generic decl is - just whether the function decl has one.) */
|
|
bool generic_inst_p
|
|
= (lang_hooks.decls.get_generic_function_decl (current_function_decl)
|
|
!= NULL);
|
|
|
|
/* Build data 'create (var)' clauses for these local variables.
|
|
Below we will add these to a data region enclosing the entire body
|
|
of the decomposed kernels region. */
|
|
tree prev_mapped_var = NULL, next = NULL, artificial_vars = NULL,
|
|
inner_data_clauses = NULL;
|
|
for (tree v = inner_bind_vars; v; v = next)
|
|
{
|
|
next = TREE_CHAIN (v);
|
|
if (DECL_ARTIFICIAL (v)
|
|
|| TREE_CODE (v) == CONST_DECL
|
|
|| generic_inst_p)
|
|
{
|
|
/* If this is an artificial temporary, it need not be mapped. We
|
|
move its declaration into the bind inside the data region.
|
|
Also avoid mapping variables if we are inside a template
|
|
instantiation; the code does not contain all the copies to
|
|
temporaries that would make this legal. */
|
|
TREE_CHAIN (v) = artificial_vars;
|
|
artificial_vars = v;
|
|
if (prev_mapped_var != NULL)
|
|
TREE_CHAIN (prev_mapped_var) = next;
|
|
else
|
|
inner_bind_vars = next;
|
|
}
|
|
else
|
|
{
|
|
/* Otherwise, build the map clause. */
|
|
tree new_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
|
|
OMP_CLAUSE_SET_MAP_KIND (new_clause, GOMP_MAP_ALLOC);
|
|
OMP_CLAUSE_DECL (new_clause) = v;
|
|
OMP_CLAUSE_SIZE (new_clause) = DECL_SIZE_UNIT (v);
|
|
OMP_CLAUSE_CHAIN (new_clause) = inner_data_clauses;
|
|
inner_data_clauses = new_clause;
|
|
|
|
prev_mapped_var = v;
|
|
}
|
|
}
|
|
|
|
if (artificial_vars)
|
|
body = gimple_build_bind (artificial_vars, body, make_node (BLOCK));
|
|
|
|
/* If we determined above that there are variables that need to be created
|
|
on the device, construct a data region for them and wrap the body
|
|
inside that. */
|
|
if (inner_data_clauses != NULL)
|
|
{
|
|
gcc_assert (inner_bind_vars != NULL);
|
|
gimple *inner_data_region
|
|
= gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
|
|
inner_data_clauses);
|
|
gimple_set_location (inner_data_region, loc);
|
|
/* Make sure __builtin_GOACC_data_end is called at the end. */
|
|
gimple *try_stmt = make_data_region_try_statement (loc, body);
|
|
gimple_omp_set_body (inner_data_region, try_stmt);
|
|
gimple *bind_body;
|
|
if (inner_cleanup != NULL)
|
|
/* Clobber all the inner variables that need to be clobbered. */
|
|
bind_body = gimple_build_try (inner_data_region, inner_cleanup,
|
|
GIMPLE_TRY_FINALLY);
|
|
else
|
|
bind_body = inner_data_region;
|
|
body = gimple_build_bind (inner_bind_vars, bind_body, make_node (BLOCK));
|
|
}
|
|
|
|
return body;
|
|
}
|
|
|
|
/* Helper function of decompose_kernels_region_body. The statements in
|
|
REGION_BODY are expected to be decomposed parts; add an 'async' clause to
|
|
each. Also add a 'wait' directive at the end of the sequence. */
|
|
|
|
static void
|
|
add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
|
|
{
|
|
tree default_async_queue
|
|
= build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
|
|
for (gimple_stmt_iterator gsi = gsi_start (*region_body);
|
|
!gsi_end_p (gsi);
|
|
gsi_next (&gsi))
|
|
{
|
|
gimple *stmt = gsi_stmt (gsi);
|
|
tree target_clauses = gimple_omp_target_clauses (stmt);
|
|
tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC);
|
|
OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue;
|
|
OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses;
|
|
target_clauses = new_async_clause;
|
|
gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
|
|
target_clauses);
|
|
}
|
|
/* A '#pragma acc wait' is just a call 'GOACC_wait (acc_async_sync, 0)'. */
|
|
tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
|
|
tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
|
|
gimple *wait_call = gimple_build_call (wait_fn, 2,
|
|
sync_arg, integer_zero_node);
|
|
gimple_set_location (wait_call, loc);
|
|
gimple_seq_add_stmt (region_body, wait_call);
|
|
}
|
|
|
|
/* Auxiliary analysis of the body of a kernels region, to determine for each
|
|
OpenACC loop whether it is control-dependent (i.e., not necessarily
|
|
executed every time the kernels region is entered) or not.
|
|
We say that a loop is control-dependent if there is some cond, switch, or
|
|
goto statement that jumps over it, forwards or backwards. For example,
|
|
if the loop is controlled by an if statement, then a jump to the true
|
|
block, the false block, or from one of those blocks to the control flow
|
|
join point will necessarily jump over the loop.
|
|
This analysis implements an ad-hoc union-find data structure classifying
|
|
statements into "control-flow regions" as follows: Most statements are in
|
|
the same region as their predecessor, except that each OpenACC loop is in
|
|
a region of its own, and each OpenACC loop's successor starts a new
|
|
region. We then unite the regions of any statements linked by jumps,
|
|
placing any cond, switch, or goto statement in the same region as its
|
|
target label(s).
|
|
In the end, control dependence of OpenACC loops can be determined by
|
|
comparing their immediate predecessor and successor statements' regions.
|
|
A jump crosses the loop if and only if the predecessor and successor are
|
|
in the same region. (If there is no predecessor or successor, the loop
|
|
is executed unconditionally.)
|
|
The methods in this class identify statements by their index in the
|
|
kernels region's body. */
|
|
|
|
class control_flow_regions
|
|
{
|
|
public:
|
|
/* Initialize an instance and pre-compute the control-flow region
|
|
information for the statement sequence SEQ. */
|
|
control_flow_regions (gimple_seq seq);
|
|
|
|
/* Return true if the statement with the given index IDX in the analyzed
|
|
statement sequence is an unconditionally executed OpenACC loop. */
|
|
bool is_unconditional_oacc_for_loop (size_t idx);
|
|
|
|
private:
|
|
/* Find the region representative for the statement identified by index
|
|
STMT_IDX. */
|
|
size_t find_rep (size_t stmt_idx);
|
|
|
|
/* Union the regions containing the statements represented by
|
|
representatives A and B. */
|
|
void union_reps (size_t a, size_t b);
|
|
|
|
/* Helper for the constructor. Performs the actual computation of the
|
|
control-flow regions in the statement sequence SEQ. */
|
|
void compute_regions (gimple_seq seq);
|
|
|
|
/* The mapping from statement indices to region representatives. */
|
|
vec <size_t> representatives;
|
|
|
|
/* A cache mapping statement indices to a flag indicating whether the
|
|
statement is a top level OpenACC for loop. */
|
|
vec <bool> omp_for_loops;
|
|
};
|
|
|
|
control_flow_regions::control_flow_regions (gimple_seq seq)
|
|
{
|
|
representatives.create (1);
|
|
omp_for_loops.create (1);
|
|
compute_regions (seq);
|
|
}
|
|
|
|
bool
|
|
control_flow_regions::is_unconditional_oacc_for_loop (size_t idx)
|
|
{
|
|
if (idx == 0 || idx == representatives.length () - 1)
|
|
/* The first or last statement in the kernels region. This means that
|
|
there is no room before or after it for a jump or a label. Thus
|
|
there cannot be a jump across it, so it is unconditional. */
|
|
return true;
|
|
/* Otherwise, the loop is unconditional if the statements before and after
|
|
it are in different control flow regions. Scan forward and backward,
|
|
skipping over neighboring OpenACC for loops, to find these preceding
|
|
statements. */
|
|
size_t prev_index = idx - 1;
|
|
while (prev_index > 0 && omp_for_loops [prev_index] == true)
|
|
prev_index--;
|
|
/* If all preceding statements are also OpenACC loops, all of these are
|
|
unconditional. */
|
|
if (prev_index == 0)
|
|
return true;
|
|
size_t succ_index = idx + 1;
|
|
while (succ_index < omp_for_loops.length ()
|
|
&& omp_for_loops [succ_index] == true)
|
|
succ_index++;
|
|
/* If all following statements are also OpenACC loops, all of these are
|
|
unconditional. */
|
|
if (succ_index == omp_for_loops.length ())
|
|
return true;
|
|
return (find_rep (prev_index) != find_rep (succ_index));
|
|
}
|
|
|
|
size_t
|
|
control_flow_regions::find_rep (size_t stmt_idx)
|
|
{
|
|
size_t rep = stmt_idx, aux = stmt_idx;
|
|
/* Find the root representative of this statement. */
|
|
while (representatives[rep] != rep)
|
|
rep = representatives[rep];
|
|
/* Compress the path from the original statement to the representative. */
|
|
while (representatives[aux] != rep)
|
|
{
|
|
size_t tmp = representatives[aux];
|
|
representatives[aux] = rep;
|
|
aux = tmp;
|
|
}
|
|
return rep;
|
|
}
|
|
|
|
void
|
|
control_flow_regions::union_reps (size_t a, size_t b)
|
|
{
|
|
a = find_rep (a);
|
|
b = find_rep (b);
|
|
representatives[b] = a;
|
|
}
|
|
|
|
void
|
|
control_flow_regions::compute_regions (gimple_seq seq)
|
|
{
|
|
hash_map <gimple *, size_t> control_flow_reps;
|
|
hash_map <tree, size_t> label_reps;
|
|
size_t current_region = 0, idx = 0;
|
|
|
|
/* In a first pass, assign an initial region to each statement. Except in
|
|
the case of OpenACC loops, each statement simply gets the same region
|
|
representative as its predecessor. */
|
|
for (gimple_stmt_iterator gsi = gsi_start (seq);
|
|
!gsi_end_p (gsi);
|
|
gsi_next (&gsi))
|
|
{
|
|
gimple *stmt = gsi_stmt (gsi);
|
|
gimple *omp_for = top_level_omp_for_in_stmt (stmt);
|
|
omp_for_loops.safe_push (omp_for != NULL);
|
|
if (omp_for != NULL)
|
|
{
|
|
/* Assign a new region to this loop and to its successor. */
|
|
current_region = idx;
|
|
representatives.safe_push (current_region);
|
|
current_region++;
|
|
}
|
|
else
|
|
{
|
|
representatives.safe_push (current_region);
|
|
/* Remember any jumps and labels for the second pass below. */
|
|
if (gimple_code (stmt) == GIMPLE_COND
|
|
|| gimple_code (stmt) == GIMPLE_SWITCH
|
|
|| gimple_code (stmt) == GIMPLE_GOTO)
|
|
control_flow_reps.put (stmt, current_region);
|
|
else if (gimple_code (stmt) == GIMPLE_LABEL)
|
|
label_reps.put (gimple_label_label (as_a <glabel *> (stmt)),
|
|
current_region);
|
|
}
|
|
idx++;
|
|
}
|
|
gcc_assert (representatives.length () == omp_for_loops.length ());
|
|
|
|
/* Revisit all the control flow statements and union the region of each
|
|
cond, switch, or goto statement with the target labels' regions. */
|
|
for (hash_map <gimple *, size_t>::iterator it = control_flow_reps.begin ();
|
|
it != control_flow_reps.end ();
|
|
++it)
|
|
{
|
|
gimple *stmt = (*it).first;
|
|
size_t stmt_rep = (*it).second;
|
|
switch (gimple_code (stmt))
|
|
{
|
|
tree label;
|
|
unsigned int n;
|
|
|
|
case GIMPLE_COND:
|
|
label = gimple_cond_true_label (as_a <gcond *> (stmt));
|
|
union_reps (stmt_rep, *label_reps.get (label));
|
|
label = gimple_cond_false_label (as_a <gcond *> (stmt));
|
|
union_reps (stmt_rep, *label_reps.get (label));
|
|
break;
|
|
|
|
case GIMPLE_SWITCH:
|
|
n = gimple_switch_num_labels (as_a <gswitch *> (stmt));
|
|
for (unsigned int i = 0; i < n; i++)
|
|
{
|
|
tree switch_case
|
|
= gimple_switch_label (as_a <gswitch *> (stmt), i);
|
|
label = CASE_LABEL (switch_case);
|
|
union_reps (stmt_rep, *label_reps.get (label));
|
|
}
|
|
break;
|
|
|
|
case GIMPLE_GOTO:
|
|
label = gimple_goto_dest (stmt);
|
|
union_reps (stmt_rep, *label_reps.get (label));
|
|
break;
|
|
|
|
default:
|
|
gcc_unreachable ();
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Decompose the body of the KERNELS_REGION, which was originally annotated
|
|
with the KERNELS_CLAUSES, into a series of compute constructs. */
|
|
|
|
static gimple *
|
|
decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
|
|
{
|
|
location_t loc = gimple_location (kernels_region);
|
|
|
|
/* The kernels clauses will be propagated to the child clauses unmodified,
|
|
except that the 'num_gangs', 'num_workers', and 'vector_length' clauses
|
|
will only be added to loop regions. The other regions are "gang-single"
|
|
and get an explicit 'num_gangs (1)' clause. So separate out the
|
|
'num_gangs', 'num_workers', and 'vector_length' clauses here.
|
|
Also check for the presence of an 'async' clause but do not remove it from
|
|
the 'kernels' clauses. */
|
|
tree num_gangs_clause = NULL, num_workers_clause = NULL,
|
|
vector_length_clause = NULL;
|
|
tree async_clause = NULL;
|
|
tree prev_clause = NULL, next_clause = NULL;
|
|
tree parallel_clauses = kernels_clauses;
|
|
for (tree c = parallel_clauses; c; c = next_clause)
|
|
{
|
|
/* Preserve this here, as we might NULL it later. */
|
|
next_clause = OMP_CLAUSE_CHAIN (c);
|
|
|
|
if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS
|
|
|| OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS
|
|
|| OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
|
|
{
|
|
/* Cut this clause out of the chain. */
|
|
if (prev_clause != NULL)
|
|
OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (c);
|
|
else
|
|
kernels_clauses = OMP_CLAUSE_CHAIN (c);
|
|
OMP_CLAUSE_CHAIN (c) = NULL;
|
|
switch (OMP_CLAUSE_CODE (c))
|
|
{
|
|
case OMP_CLAUSE_NUM_GANGS:
|
|
num_gangs_clause = c;
|
|
break;
|
|
case OMP_CLAUSE_NUM_WORKERS:
|
|
num_workers_clause = c;
|
|
break;
|
|
case OMP_CLAUSE_VECTOR_LENGTH:
|
|
vector_length_clause = c;
|
|
break;
|
|
default:
|
|
gcc_unreachable ();
|
|
}
|
|
}
|
|
else
|
|
prev_clause = c;
|
|
if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
|
|
async_clause = c;
|
|
}
|
|
|
|
gimple *kernels_body = gimple_omp_body (kernels_region);
|
|
gbind *kernels_bind = as_a <gbind *> (kernels_body);
|
|
|
|
/* The body of the region may contain other nested binds declaring inner
|
|
local variables. Collapse all these binds into one to ensure that we
|
|
have a single sequence of statements to iterate over; also, collect all
|
|
inner variables. */
|
|
tree inner_bind_vars = flatten_binds (kernels_bind);
|
|
gimple_seq body_sequence = gimple_bind_body (kernels_bind);
|
|
|
|
/* All these inner variables will get allocated on the device (below, by
|
|
calling maybe_build_inner_data_region). Here we create 'present'
|
|
clauses for them and add these clauses to the list of clauses to be
|
|
attached to each inner compute construct. */
|
|
tree present_clauses = kernels_clauses;
|
|
for (tree var = inner_bind_vars; var; var = TREE_CHAIN (var))
|
|
{
|
|
if (!DECL_ARTIFICIAL (var) && TREE_CODE (var) != CONST_DECL)
|
|
{
|
|
tree present_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
|
|
OMP_CLAUSE_SET_MAP_KIND (present_clause, GOMP_MAP_FORCE_PRESENT);
|
|
OMP_CLAUSE_DECL (present_clause) = var;
|
|
OMP_CLAUSE_SIZE (present_clause) = DECL_SIZE_UNIT (var);
|
|
OMP_CLAUSE_CHAIN (present_clause) = present_clauses;
|
|
present_clauses = present_clause;
|
|
}
|
|
}
|
|
kernels_clauses = present_clauses;
|
|
|
|
/* In addition to nested binds, the "real" body of the region may be
|
|
nested inside a try-finally block. Find its cleanup block, which
|
|
contains code to clobber the local variables that must be clobbered. */
|
|
gimple *inner_cleanup = NULL;
|
|
if (body_sequence != NULL && gimple_code (body_sequence) == GIMPLE_TRY)
|
|
{
|
|
if (gimple_seq_singleton_p (body_sequence))
|
|
{
|
|
/* The try statement is the only thing inside the bind. */
|
|
inner_cleanup = gimple_try_cleanup (body_sequence);
|
|
body_sequence = gimple_try_eval (body_sequence);
|
|
}
|
|
else
|
|
{
|
|
/* The bind's body starts with a try statement, but it is followed
|
|
by other things. */
|
|
gimple_stmt_iterator gsi = gsi_start (body_sequence);
|
|
gimple *try_stmt = gsi_stmt (gsi);
|
|
inner_cleanup = gimple_try_cleanup (try_stmt);
|
|
gimple *try_body = gimple_try_eval (try_stmt);
|
|
|
|
gsi_remove (&gsi, false);
|
|
/* Now gsi indicates the sequence of statements after the try
|
|
statement in the bind. Append the statement in the try body and
|
|
the trailing statements from gsi. */
|
|
gsi_insert_seq_before (&gsi, try_body, GSI_CONTINUE_LINKING);
|
|
body_sequence = gsi_stmt (gsi);
|
|
}
|
|
}
|
|
|
|
/* This sequence will collect all the top-level statements in the body of
|
|
the data region we are about to construct. */
|
|
gimple_seq region_body = NULL;
|
|
/* This sequence will collect consecutive statements to be put into a
|
|
gang-single region. */
|
|
gimple_seq gang_single_seq = NULL;
|
|
/* Flag recording whether the gang_single_seq only contains copies to
|
|
local variables. These may be loop setup code that should not be
|
|
separated from the loop. */
|
|
bool only_simple_assignments = true;
|
|
|
|
/* Precompute the control flow region information to determine whether an
|
|
OpenACC loop is executed conditionally or unconditionally. */
|
|
control_flow_regions cf_regions (body_sequence);
|
|
|
|
/* Iterate over the statements in the kernels region's body. */
|
|
size_t idx = 0;
|
|
gimple_stmt_iterator gsi, gsi_n;
|
|
for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n, idx++)
|
|
{
|
|
/* Advance the iterator here because otherwise it would be invalidated
|
|
by moving statements below. */
|
|
gsi_n = gsi;
|
|
gsi_next (&gsi_n);
|
|
|
|
gimple *stmt = gsi_stmt (gsi);
|
|
gimple *omp_for = top_level_omp_for_in_stmt (stmt);
|
|
bool is_unconditional_oacc_for_loop = false;
|
|
if (omp_for != NULL)
|
|
is_unconditional_oacc_for_loop
|
|
= cf_regions.is_unconditional_oacc_for_loop (idx);
|
|
if (omp_for != NULL
|
|
&& is_unconditional_oacc_for_loop)
|
|
{
|
|
/* This is an OMP for statement, put it into a separate region.
|
|
But first, construct a gang-single region containing any
|
|
complex sequential statements we may have seen. */
|
|
if (gang_single_seq != NULL && !only_simple_assignments)
|
|
{
|
|
gimple *single_region
|
|
= make_region_seq (loc, gang_single_seq,
|
|
num_gangs_clause,
|
|
num_workers_clause,
|
|
vector_length_clause,
|
|
kernels_clauses);
|
|
gimple_seq_add_stmt (®ion_body, single_region);
|
|
}
|
|
else if (gang_single_seq != NULL && only_simple_assignments)
|
|
{
|
|
/* There is a sequence of sequential statements preceding this
|
|
loop, but they are all simple assignments. This is
|
|
probably setup code for the loop; in particular, Fortran DO
|
|
loops are preceded by code to copy the loop limit variable
|
|
to a temporary. Group this code together with the loop
|
|
itself. */
|
|
gimple_seq_add_stmt (&gang_single_seq, stmt);
|
|
stmt = gimple_build_bind (NULL, gang_single_seq,
|
|
make_node (BLOCK));
|
|
}
|
|
gang_single_seq = NULL;
|
|
only_simple_assignments = true;
|
|
|
|
gimple_seq parallel_seq = NULL;
|
|
gimple_seq_add_stmt (¶llel_seq, stmt);
|
|
gimple *parallel_region
|
|
= make_region_loop_nest (omp_for, parallel_seq,
|
|
num_gangs_clause,
|
|
num_workers_clause,
|
|
vector_length_clause,
|
|
kernels_clauses);
|
|
gimple_seq_add_stmt (®ion_body, parallel_region);
|
|
}
|
|
else
|
|
{
|
|
if (omp_for != NULL)
|
|
{
|
|
gcc_checking_assert (!is_unconditional_oacc_for_loop);
|
|
if (dump_enabled_p ())
|
|
dump_printf_loc (MSG_MISSED_OPTIMIZATION, omp_for,
|
|
"unparallelized loop nest"
|
|
" in OpenACC %<kernels%> region:"
|
|
" it's executed conditionally\n");
|
|
}
|
|
|
|
/* This is not an unconditional OMP for statement, so it will be
|
|
put into a gang-single region. */
|
|
gimple_seq_add_stmt (&gang_single_seq, stmt);
|
|
/* Is this a simple assignment? We call it simple if it is an
|
|
assignment to an artificial local variable. This captures
|
|
Fortran loop setup code computing loop bounds and offsets. */
|
|
bool is_simple_assignment
|
|
= (gimple_code (stmt) == GIMPLE_ASSIGN
|
|
&& TREE_CODE (gimple_assign_lhs (stmt)) == VAR_DECL
|
|
&& DECL_ARTIFICIAL (gimple_assign_lhs (stmt)));
|
|
if (!is_simple_assignment)
|
|
only_simple_assignments = false;
|
|
}
|
|
}
|
|
|
|
/* If we did not emit a new region, and are not going to emit one now
|
|
(that is, the original region was empty), prepare to emit a dummy so as
|
|
to preserve the original construct, which other processing (at least
|
|
test cases) depend on. */
|
|
if (region_body == NULL && gang_single_seq == NULL)
|
|
{
|
|
gimple *stmt = gimple_build_nop ();
|
|
gimple_set_location (stmt, loc);
|
|
gimple_seq_add_stmt (&gang_single_seq, stmt);
|
|
}
|
|
|
|
/* Gather up any remaining gang-single statements. */
|
|
if (gang_single_seq != NULL)
|
|
{
|
|
gimple *single_region
|
|
= make_region_seq (loc, gang_single_seq,
|
|
num_gangs_clause,
|
|
num_workers_clause,
|
|
vector_length_clause,
|
|
kernels_clauses);
|
|
gimple_seq_add_stmt (®ion_body, single_region);
|
|
}
|
|
|
|
/* We want to launch these kernels asynchronously. If the original
|
|
kernels region had an async clause, this is done automatically because
|
|
that async clause was copied to the individual regions we created.
|
|
Otherwise, add an async clause to each newly created region, as well as
|
|
a wait directive at the end. */
|
|
if (async_clause == NULL)
|
|
add_async_clauses_and_wait (loc, ®ion_body);
|
|
|
|
tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
|
|
gimple *body = gimple_build_bind (kernels_locals, region_body,
|
|
make_node (BLOCK));
|
|
|
|
/* If we found variables declared in nested scopes, build a data region to
|
|
map them to the device. */
|
|
body = maybe_build_inner_data_region (loc, body, inner_bind_vars,
|
|
inner_cleanup);
|
|
|
|
return body;
|
|
}
|
|
|
|
/* Decompose one OpenACC 'kernels' construct into an OpenACC 'data' construct
|
|
containing the original OpenACC 'kernels' construct's region cut up into a
|
|
sequence of compute constructs. */
|
|
|
|
static gimple *
|
|
omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
|
|
{
|
|
gcc_checking_assert (gimple_omp_target_kind (kernels_stmt)
|
|
== GF_OMP_TARGET_KIND_OACC_KERNELS);
|
|
location_t loc = gimple_location (kernels_stmt);
|
|
|
|
/* Collect the data clauses of the OpenACC 'kernels' directive and create a
|
|
new OpenACC 'data' construct with those clauses. */
|
|
tree kernels_clauses = gimple_omp_target_clauses (kernels_stmt);
|
|
tree data_clauses = NULL;
|
|
for (tree c = kernels_clauses; c; c = OMP_CLAUSE_CHAIN (c))
|
|
{
|
|
/* Certain clauses are copied to the enclosing OpenACC 'data'. Other
|
|
clauses remain on the OpenACC 'kernels'. */
|
|
if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP)
|
|
{
|
|
tree decl = OMP_CLAUSE_DECL (c);
|
|
HOST_WIDE_INT map_kind = OMP_CLAUSE_MAP_KIND (c);
|
|
switch (map_kind)
|
|
{
|
|
default:
|
|
if (map_kind == GOMP_MAP_ALLOC
|
|
&& integer_zerop (OMP_CLAUSE_SIZE (c)))
|
|
/* ??? This is an alloc clause for mapping a pointer whose
|
|
target is already mapped. We leave these on the inner
|
|
compute constructs because moving them to the outer data
|
|
region causes runtime errors. */
|
|
break;
|
|
|
|
/* For non-artificial variables, and for non-declaration
|
|
expressions like A[0:n], copy the clause to the data
|
|
region. */
|
|
if ((DECL_P (decl) && !DECL_ARTIFICIAL (decl))
|
|
|| !DECL_P (decl))
|
|
{
|
|
tree new_clause = build_omp_clause (OMP_CLAUSE_LOCATION (c),
|
|
OMP_CLAUSE_MAP);
|
|
OMP_CLAUSE_SET_MAP_KIND (new_clause, map_kind);
|
|
/* This must be unshared here to avoid "incorrect sharing
|
|
of tree nodes" errors from verify_gimple. */
|
|
OMP_CLAUSE_DECL (new_clause) = unshare_expr (decl);
|
|
OMP_CLAUSE_SIZE (new_clause) = OMP_CLAUSE_SIZE (c);
|
|
OMP_CLAUSE_CHAIN (new_clause) = data_clauses;
|
|
data_clauses = new_clause;
|
|
|
|
/* Now that this data is mapped, turn the data clause on the
|
|
inner OpenACC 'kernels' into a 'present' clause. */
|
|
OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_PRESENT);
|
|
}
|
|
break;
|
|
|
|
case GOMP_MAP_POINTER:
|
|
case GOMP_MAP_TO_PSET:
|
|
case GOMP_MAP_FORCE_TOFROM:
|
|
case GOMP_MAP_FIRSTPRIVATE_POINTER:
|
|
case GOMP_MAP_FIRSTPRIVATE_REFERENCE:
|
|
/* ??? Copying these map kinds leads to internal compiler
|
|
errors in later passes. */
|
|
break;
|
|
}
|
|
}
|
|
else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_IF)
|
|
{
|
|
/* If there is an 'if' clause, it must be duplicated to the
|
|
enclosing data region. Temporarily remove the if clause's
|
|
chain to avoid copying it. */
|
|
tree saved_chain = OMP_CLAUSE_CHAIN (c);
|
|
OMP_CLAUSE_CHAIN (c) = NULL;
|
|
tree new_if_clause = unshare_expr (c);
|
|
OMP_CLAUSE_CHAIN (c) = saved_chain;
|
|
OMP_CLAUSE_CHAIN (new_if_clause) = data_clauses;
|
|
data_clauses = new_if_clause;
|
|
}
|
|
}
|
|
/* Restore the original order of the clauses. */
|
|
data_clauses = nreverse (data_clauses);
|
|
|
|
gimple *data_region
|
|
= gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
|
|
data_clauses);
|
|
gimple_set_location (data_region, loc);
|
|
|
|
/* Transform the body of the kernels region into a sequence of compute
|
|
constructs. */
|
|
gimple *body = decompose_kernels_region_body (kernels_stmt,
|
|
kernels_clauses);
|
|
|
|
/* Put the transformed pieces together. The entire body of the region is
|
|
wrapped in a try-finally statement that calls __builtin_GOACC_data_end
|
|
for cleanup. */
|
|
gimple *try_stmt = make_data_region_try_statement (loc, body);
|
|
gimple_omp_set_body (data_region, try_stmt);
|
|
|
|
return data_region;
|
|
}
|
|
|
|
|
|
/* Decompose OpenACC 'kernels' constructs in the current function. */
|
|
|
|
static tree
|
|
omp_oacc_kernels_decompose_callback_stmt (gimple_stmt_iterator *gsi_p,
|
|
bool *handled_ops_p,
|
|
struct walk_stmt_info *)
|
|
{
|
|
gimple *stmt = gsi_stmt (*gsi_p);
|
|
|
|
if ((gimple_code (stmt) == GIMPLE_OMP_TARGET)
|
|
&& gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
|
|
{
|
|
gimple *stmt_new = omp_oacc_kernels_decompose_1 (stmt);
|
|
gsi_replace (gsi_p, stmt_new, false);
|
|
*handled_ops_p = true;
|
|
}
|
|
else
|
|
*handled_ops_p = false;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static unsigned int
|
|
omp_oacc_kernels_decompose (void)
|
|
{
|
|
gimple_seq body = gimple_body (current_function_decl);
|
|
|
|
struct walk_stmt_info wi;
|
|
memset (&wi, 0, sizeof (wi));
|
|
walk_gimple_seq_mod (&body, omp_oacc_kernels_decompose_callback_stmt, NULL,
|
|
&wi);
|
|
|
|
gimple_set_body (current_function_decl, body);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
namespace {
|
|
|
|
const pass_data pass_data_omp_oacc_kernels_decompose =
|
|
{
|
|
GIMPLE_PASS, /* type */
|
|
"omp_oacc_kernels_decompose", /* name */
|
|
OPTGROUP_OMP, /* optinfo_flags */
|
|
TV_NONE, /* tv_id */
|
|
PROP_gimple_any, /* properties_required */
|
|
0, /* properties_provided */
|
|
0, /* properties_destroyed */
|
|
0, /* todo_flags_start */
|
|
0, /* todo_flags_finish */
|
|
};
|
|
|
|
class pass_omp_oacc_kernels_decompose : public gimple_opt_pass
|
|
{
|
|
public:
|
|
pass_omp_oacc_kernels_decompose (gcc::context *ctxt)
|
|
: gimple_opt_pass (pass_data_omp_oacc_kernels_decompose, ctxt)
|
|
{}
|
|
|
|
/* opt_pass methods: */
|
|
virtual bool gate (function *)
|
|
{
|
|
return (flag_openacc
|
|
&& flag_openacc_kernels == OPENACC_KERNELS_DECOMPOSE);
|
|
}
|
|
virtual unsigned int execute (function *)
|
|
{
|
|
return omp_oacc_kernels_decompose ();
|
|
}
|
|
|
|
}; // class pass_omp_oacc_kernels_decompose
|
|
|
|
} // anon namespace
|
|
|
|
gimple_opt_pass *
|
|
make_pass_omp_oacc_kernels_decompose (gcc::context *ctxt)
|
|
{
|
|
return new pass_omp_oacc_kernels_decompose (ctxt);
|
|
}
|