4f5b9c803a
gcc/ChangeLog:
	* cfgloop.c (get_loop_location): Convert return type from
	location_t to dump_user_location_t, replacing INSN_LOCATION lookups
	by implicit construction from rtx_insn *, and using
	dump_user_location_t::from_function_decl for the fallback case.
	* cfgloop.h (get_loop_location): Convert return type from
	location_t to dump_user_location_t.
	* cgraphunit.c (walk_polymorphic_call_targets): Update call to
	dump_printf_loc to pass in a dump_location_t rather than a
	location_t, via the gimple stmt.
	* coverage.c (get_coverage_counts): Update calls to dump_printf_loc
	to pass in dump_location_t rather than a location_t.
	* doc/optinfo.texi (Dump types): Convert example of dump_printf_loc
	from taking "locus" to taking "insn".  Update description of the
	"_loc" calls to cover dump_location_t.
	* dumpfile.c: Include "backend.h", "gimple.h", "rtl.h", and
	"selftest.h".
	(dump_user_location_t::dump_user_location_t): New constructors, from
	gimple *stmt and rtx_insn *.
	(dump_user_location_t::from_function_decl): New function.
	(dump_loc): Make static.
	(dump_gimple_stmt_loc): Convert param "loc" from location_t to
	const dump_location_t &.
	(dump_generic_expr_loc): Delete.
	(dump_printf_loc): Convert param "loc" from location_t to
	const dump_location_t &.
	(selftest::test_impl_location): New function.
	(selftest::dumpfile_c_tests): New function.
	* dumpfile.h: Include "profile-count.h".
	(class dump_user_location_t): New class.
	(struct dump_impl_location_t): New struct.
	(class dump_location_t): New class.
	(dump_printf_loc): Convert 2nd param from source_location to
	const dump_location_t &.
	(dump_generic_expr_loc): Delete.
	(dump_gimple_stmt_loc): Convert 2nd param from source_location to
	const dump_location_t &.
	* gimple-fold.c (fold_gimple_assign): Update call to dump_printf_loc
	to pass in a dump_location_t rather than a location_t, via the
	gimple stmt.
	(gimple_fold_call): Likewise.
	* gimple-loop-interchange.cc
	(loop_cand::analyze_iloop_reduction_var): Update for change to
	check_reduction_path.
	(tree_loop_interchange::interchange): Update for change to
	find_loop_location.
	* graphite-isl-ast-to-gimple.c (scop_to_isl_ast): Update for change
	in return-type of find_loop_location.
	(graphite_regenerate_ast_isl): Likewise.
	* graphite-optimize-isl.c (optimize_isl): Likewise.
	* graphite.c (graphite_transform_loops): Likewise.
	* ipa-devirt.c (ipa_devirt): Update call to dump_printf_loc to pass
	in a dump_location_t rather than a location_t, via the gimple stmt.
	* ipa-prop.c (ipa_make_edge_direct_to_target): Likewise.
	* ipa.c (walk_polymorphic_call_targets): Likewise.
	* loop-unroll.c (report_unroll): Convert "locus" param from
	location_t to dump_location_t.
	(decide_unrolling): Update for change to get_loop_location's return
	type.
	* omp-grid.c (struct grid_prop): Convert field "target_loc" from
	location_t to dump_user_location_t.
	(grid_find_single_omp_among_assignments_1): Update calls to
	dump_printf_loc to pass in a dump_location_t rather than a
	location_t, via the gimple stmt.
	(grid_parallel_clauses_gridifiable): Convert "tloc" from location_t
	to dump_location_t.  Update calls to dump_printf_loc to pass in a
	dump_location_t rather than a location_t, via the gimple stmt.
	(grid_inner_loop_gridifiable_p): Likewise.
	(grid_dist_follows_simple_pattern): Likewise.
	(grid_gfor_follows_tiling_pattern): Likewise.
	(grid_target_follows_gridifiable_pattern): Likewise.
	(grid_attempt_target_gridification): Convert initialization of local
	"grid" from memset to zero-initialization; FIXME: does this require
	C++11?  Update call to dump_printf_loc to pass in an
	optinfo_location rather than a location_t, via the gimple stmt.
	* profile.c (read_profile_edge_counts): Update call to
	dump_printf_loc to pass in a dump_location_t rather than a
	location_t.
	(compute_branch_probabilities): Likewise.
	* selftest-run-tests.c (selftest::run_tests): Call dumpfile_c_tests.
	* selftest.h (dumpfile_c_tests): New decl.
	* tree-loop-distribution.c (pass_loop_distribution::execute): Update
	for change in return type of find_loop_location.
	* tree-parloops.c (parallelize_loops): Likewise.
	* tree-ssa-loop-ivcanon.c (try_unroll_loop_completely): Convert
	"locus" from location_t to dump_user_location_t.
	(canonicalize_loop_induction_variables): Likewise.
	* tree-ssa-loop-ivopts.c (tree_ssa_iv_optimize_loop): Update for
	change in return type of find_loop_location.
	* tree-ssa-loop-niter.c (number_of_iterations_exit): Update call to
	dump_printf_loc to pass in a dump_location_t rather than a
	location_t, via the stmt.
	* tree-ssa-sccvn.c (eliminate_dom_walker::before_dom_children):
	Likewise.
	* tree-vect-loop-manip.c (find_loop_location): Convert return type
	from source_location to dump_user_location_t.
	(vect_do_peeling): Update for above change.
	(vect_loop_versioning): Update for change in type of vect_location.
	* tree-vect-loop.c (check_reduction_path): Convert "loc" param from
	location_t to dump_user_location_t.
	(vect_estimate_min_profitable_iters): Update for change in type of
	vect_location.
	* tree-vect-slp.c (vect_print_slp_tree): Convert param "loc" from
	location_t to dump_location_t.
	(vect_slp_bb): Update for change in type of vect_location.
	* tree-vectorizer.c (vect_location): Convert from source_location to
	dump_user_location_t.
	(try_vectorize_loop_1): Update for change in vect_location's type.
	(vectorize_loops): Likewise.
	(increase_alignment): Likewise.
	* tree-vectorizer.h (vect_location): Convert from source_location to
	dump_user_location_t.
	(find_loop_location): Convert return type from source_location to
	dump_user_location_t.
	(check_reduction_path): Convert 1st param from location_t to
	dump_user_location_t.
	* value-prof.c (check_counter): Update call to dump_printf_loc to
	pass in a dump_user_location_t rather than a location_t; update
	call to error_at for change in type of "locus".
	(check_ic_target): Update call to dump_printf_loc to pass in a
	dump_user_location_t rather than a location_t, via the call_stmt.

From-SVN: r262149
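
As a rough illustration of the calling-convention change described above (an
editorial sketch, not part of the patch; the message strings and the fndecl
variable are made up): callers no longer need to extract a location_t by hand,
because dump_location_t and dump_user_location_t are implicitly constructible
from the statement itself.

    /* Before: pass an explicit location_t.  */
    dump_printf_loc (MSG_NOTE, gimple_location (stmt), "transformed loop\n");

    /* After: pass the gimple statement (or rtx_insn *) directly.  */
    dump_printf_loc (MSG_NOTE, stmt, "transformed loop\n");

    /* Fallback for contexts with no statement at hand; the patch adds
       dump_user_location_t::from_function_decl for this purpose.  */
    dump_user_location_t fallback
      = dump_user_location_t::from_function_decl (fndecl);
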
/* Lowering and expansion of OpenMP directives for HSA GPU agents.

   Copyright (C) 2013-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "cgraph.h"
#include "pretty-print.h"
#include "fold-const.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimple-walk.h"
#include "tree-inline.h"
#include "langhooks.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-grid.h"
#include "gimple-pretty-print.h"

/* Return the lastprivate predicate for a given gridified loop described by
   FD).  */

tree
omp_grid_lastprivate_predicate (struct omp_for_data *fd)
{
  /* When dealing with a gridified loop, we need to check up to three collapsed
     iteration variables but they are not actually captured in this fd.
     Fortunately, we can easily rely on HSA builtins to get this
     information.  */

  tree id, size;
  if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP
      && gimple_omp_for_grid_intra_group (fd->for_stmt))
    {
      id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMID);
      size = builtin_decl_explicit (BUILT_IN_HSA_CURRENTWORKGROUPSIZE);
    }
  else
    {
      id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMABSID);
      size = builtin_decl_explicit (BUILT_IN_HSA_GRIDSIZE);
    }
  tree cond = NULL;
  for (int dim = 0; dim < fd->collapse; dim++)
    {
      tree dim_tree = build_int_cstu (unsigned_type_node, dim);
      tree u1 = build_int_cstu (unsigned_type_node, 1);
      tree c2
	= build2 (EQ_EXPR, boolean_type_node,
		  build2 (PLUS_EXPR, unsigned_type_node,
			  build_call_expr (id, 1, dim_tree), u1),
		  build_call_expr (size, 1, dim_tree));
      if (cond)
	cond = build2 (TRUTH_AND_EXPR, boolean_type_node, cond, c2);
      else
	cond = c2;
    }
  return cond;
}
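
/* For example, with a collapse value of 2 and intra-group gridification, the
   predicate built by the function above amounts to

       workitem_id (0) + 1 == workgroup_size (0)
       && workitem_id (1) + 1 == workgroup_size (1)

   where workitem_id and workgroup_size stand for the builtins selected via
   BUILT_IN_HSA_WORKITEMID and BUILT_IN_HSA_CURRENTWORKGROUPSIZE, i.e. the
   predicate is true only for the last work-item in every collapsed dimension.
   (Illustrative sketch only; the names above are shorthand, not real
   declarations.)  */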

/* Structure describing the basic properties of the loop we are analyzing
   whether it can be gridified and when it is gridified.  */

struct grid_prop
{
  /* True when we are doing tiling gridification, i.e. when there is a distinct
     distribute loop over groups and a loop construct over work-items.  False
     when distribute and parallel for loops form a combined construct.  */
  bool tiling;
  /* Location of the target construct for optimization information
     messages.  */
  dump_user_location_t target_loc;
  /* The collapse clause of the involved loops.  Collapse value of all of them
     must be the same for gridification to take place.  */
  size_t collapse;
  /* Group sizes, if requested by the user or NULL if not requested.  */
  tree group_sizes[3];
};
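
/* Illustrative sketch (shorthand only, not taken from any testcase) of the
   two source shapes distinguished by the TILING flag above.  With TILING
   false, distribute and the work-sharing loop form one combined construct:

       #pragma omp target teams
       #pragma omp distribute parallel for
       for (i = 0; i < n; i++)
         ...

   With TILING true, they are separate constructs and the inner loop is
   expected to iterate over exactly one tile whose size is the step of the
   distribute loop:

       #pragma omp target teams
       #pragma omp distribute
       for (ii = 0; ii < n; ii += 64)
         #pragma omp parallel for
         for (i = ii; i < ii + 64; i++)
           ...
   */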

#define GRID_MISSED_MSG_PREFIX "Will not turn target construct into a " \
  "gridified HSA kernel because "

/* Return true if STMT is an assignment of a register-type into a local
   VAR_DECL.  If GRID is non-NULL, the assignment additionally must not be to
   any of the trees specifying group sizes there.  */

static bool
grid_safe_assignment_p (gimple *stmt, grid_prop *grid)
{
  gassign *assign = dyn_cast <gassign *> (stmt);
  if (!assign)
    return false;
  if (gimple_clobber_p (assign))
    return true;
  tree lhs = gimple_assign_lhs (assign);
  if (!VAR_P (lhs)
      || !is_gimple_reg_type (TREE_TYPE (lhs))
      || is_global_var (lhs))
    return false;
  if (grid)
    for (unsigned i = 0; i < grid->collapse; i++)
      if (lhs == grid->group_sizes[i])
	return false;
  return true;
}

/* Return true if all statements in SEQ are assignments to local register-type
   variables that do not hold group size information.  */

static bool
grid_seq_only_contains_local_assignments (gimple_seq seq, grid_prop *grid)
{
  if (!seq)
    return true;

  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
    if (!grid_safe_assignment_p (gsi_stmt (gsi), grid))
      return false;
  return true;
}

/* Scan statements in SEQ and call itself recursively on any bind.  GRID
   describes hitherto discovered properties of the loop that is evaluated for
   possible gridification.  If during whole search only assignments to
   register-type local variables (that do not overwrite group size information)
   and one single OMP statement is encountered, return true, otherwise return
   false.  RET is where we store any OMP statement encountered.  */

static bool
grid_find_single_omp_among_assignments_1 (gimple_seq seq, grid_prop *grid,
					  const char *name, gimple **ret)
{
  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (grid_safe_assignment_p (stmt, grid))
	continue;
      if (gbind *bind = dyn_cast <gbind *> (stmt))
	{
	  gimple_seq bind_body = gimple_bind_body (bind);
	  if (!grid_find_single_omp_among_assignments_1 (bind_body, grid, name,
							 ret))
	    return false;
	}
      else if (is_gimple_omp (stmt))
	{
	  if (*ret)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
				   GRID_MISSED_MSG_PREFIX "%s construct "
				   "contains multiple OpenMP constructs\n",
				   name);
		  dump_printf_loc (MSG_NOTE, *ret,
				   "The first OpenMP construct within "
				   "a parallel\n");
		  dump_printf_loc (MSG_NOTE, stmt,
				   "The second OpenMP construct within "
				   "a parallel\n");
		}
	      return false;
	    }
	  *ret = stmt;
	}
      else
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			       GRID_MISSED_MSG_PREFIX "%s construct contains "
			       "a complex statement\n", name);
	      dump_printf_loc (MSG_NOTE, stmt,
			       "This statement cannot be analyzed for "
			       "gridification\n");
	    }
	  return false;
	}
    }
  return true;
}

/* Scan statements in SEQ and make sure that it and any binds in it contain
   only assignments to local register-type variables (that do not overwrite
   group size information) and one OMP construct.  If so, return that
   construct, otherwise return NULL.  GRID describes hitherto discovered
   properties of the loop that is evaluated for possible gridification.  If
   dumping is enabled and function fails, use NAME to dump a note with the
   reason for failure.  */

static gimple *
grid_find_single_omp_among_assignments (gimple_seq seq, grid_prop *grid,
					const char *name)
{
  if (!seq)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			 GRID_MISSED_MSG_PREFIX "%s construct has empty body\n",
			 name);
      return NULL;
    }

  gimple *ret = NULL;
  if (grid_find_single_omp_among_assignments_1 (seq, grid, name, &ret))
    {
      if (!ret && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			 GRID_MISSED_MSG_PREFIX "%s construct does not contain"
			 " any other OpenMP construct\n", name);
      return ret;
    }
  else
    return NULL;
}

/* Walker function looking for statements there is no point gridifying (and for
   noreturn function calls which we cannot do).  Return non-NULL if such a
   function is found.  */

static tree
grid_find_ungridifiable_statement (gimple_stmt_iterator *gsi,
				   bool *handled_ops_p,
				   struct walk_stmt_info *wi)
{
  *handled_ops_p = false;
  gimple *stmt = gsi_stmt (*gsi);
  switch (gimple_code (stmt))
    {
    case GIMPLE_CALL:
      if (gimple_call_noreturn_p (as_a <gcall *> (stmt)))
	{
	  *handled_ops_p = true;
	  wi->info = stmt;
	  return error_mark_node;
	}
      break;

    /* We may reduce the following list if we find a way to implement the
       clauses, but now there is no point trying further.  */
    case GIMPLE_OMP_CRITICAL:
    case GIMPLE_OMP_TASKGROUP:
    case GIMPLE_OMP_TASK:
    case GIMPLE_OMP_SECTION:
    case GIMPLE_OMP_SECTIONS:
    case GIMPLE_OMP_SECTIONS_SWITCH:
    case GIMPLE_OMP_TARGET:
    case GIMPLE_OMP_ORDERED:
      *handled_ops_p = true;
      wi->info = stmt;
      return error_mark_node;
    default:
      break;
    }
  return NULL;
}

/* Examine clauses of omp parallel statement PAR and if any prevents
   gridification, issue a missed-optimization diagnostics and return false,
   otherwise return true.  GRID describes hitherto discovered properties of the
   loop that is evaluated for possible gridification.  */

static bool
grid_parallel_clauses_gridifiable (gomp_parallel *par,
				   dump_user_location_t tloc)
{
  tree clauses = gimple_omp_parallel_clauses (par);
  while (clauses)
    {
      switch (OMP_CLAUSE_CODE (clauses))
	{
	case OMP_CLAUSE_NUM_THREADS:
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			       GRID_MISSED_MSG_PREFIX "because there is "
			       "a num_threads clause of the parallel "
			       "construct\n");
	      dump_printf_loc (MSG_NOTE, par,
			       "Parallel construct has a num_threads clause\n");
	    }
	  return false;

	case OMP_CLAUSE_REDUCTION:
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			       GRID_MISSED_MSG_PREFIX "a reduction clause "
			       "is present\n ");
	      dump_printf_loc (MSG_NOTE, par,
			       "Parallel construct has a reduction clause\n");
	    }
	  return false;

	default:
	  break;
	}
      clauses = OMP_CLAUSE_CHAIN (clauses);
    }
  return true;
}

/* Examine clauses and the body of omp loop statement GFOR and if something
   prevents gridification, issue a missed-optimization diagnostics and return
   false, otherwise return true.  GRID describes hitherto discovered properties
   of the loop that is evaluated for possible gridification.  */

static bool
grid_inner_loop_gridifiable_p (gomp_for *gfor, grid_prop *grid)
{
  if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor),
						 grid))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			   GRID_MISSED_MSG_PREFIX "the inner loop "
			   "loop bounds computation contains a complex "
			   "statement\n");
	  dump_printf_loc (MSG_NOTE, gfor,
			   "Loop construct cannot be analyzed for "
			   "gridification\n");
	}
      return false;
    }

  tree clauses = gimple_omp_for_clauses (gfor);
  while (clauses)
    {
      switch (OMP_CLAUSE_CODE (clauses))
	{
	case OMP_CLAUSE_SCHEDULE:
	  if (OMP_CLAUSE_SCHEDULE_KIND (clauses) != OMP_CLAUSE_SCHEDULE_AUTO)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
				   GRID_MISSED_MSG_PREFIX "the inner loop "
				   "has a non-automatic schedule clause\n");
		  dump_printf_loc (MSG_NOTE, gfor,
				   "Loop construct has a non automatic "
				   "schedule clause\n");
		}
	      return false;
	    }
	  break;

	case OMP_CLAUSE_REDUCTION:
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			       GRID_MISSED_MSG_PREFIX "a reduction "
			       "clause is present\n ");
	      dump_printf_loc (MSG_NOTE, gfor,
			       "Loop construct has a reduction schedule "
			       "clause\n");
	    }
	  return false;

	default:
	  break;
	}
      clauses = OMP_CLAUSE_CHAIN (clauses);
    }
  struct walk_stmt_info wi;
  memset (&wi, 0, sizeof (wi));
  if (walk_gimple_seq (gimple_omp_body (gfor),
		       grid_find_ungridifiable_statement,
		       NULL, &wi))
    {
      gimple *bad = (gimple *) wi.info;
      if (dump_enabled_p ())
	{
	  if (is_gimple_call (bad))
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			     GRID_MISSED_MSG_PREFIX "the inner loop contains "
			     "call to a noreturn function\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			     GRID_MISSED_MSG_PREFIX "the inner loop contains "
			     "statement %s which cannot be transformed\n",
			     gimple_code_name[(int) gimple_code (bad)]);
	  dump_printf_loc (MSG_NOTE, bad,
			   "This statement cannot be analyzed for "
			   "gridification\n");
	}
      return false;
    }
  return true;
}

/* Given distribute omp construct represented by DIST, which in the original
   source forms a compound construct with a looping construct, return true if it
   can be turned into a gridified HSA kernel.  Otherwise return false.  GRID
   describes hitherto discovered properties of the loop that is evaluated for
   possible gridification.  */

static bool
grid_dist_follows_simple_pattern (gomp_for *dist, grid_prop *grid)
{
  dump_user_location_t tloc = grid->target_loc;
  gimple *stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist),
							  grid, "distribute");
  gomp_parallel *par;
  if (!stmt
      || !(par = dyn_cast <gomp_parallel *> (stmt))
      || !grid_parallel_clauses_gridifiable (par, tloc))
    return false;

  stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), grid,
						  "parallel");
  gomp_for *gfor;
  if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt)))
    return false;

  if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			 GRID_MISSED_MSG_PREFIX "the inner loop is not "
			 "a simple for loop\n");
      return false;
    }
  gcc_assert (gimple_omp_for_collapse (gfor) == grid->collapse);

  if (!grid_inner_loop_gridifiable_p (gfor, grid))
    return false;

  return true;
}

/* Given an omp loop statement GFOR, return true if it can participate in
   tiling gridification, i.e. in one where the distribute and parallel for
   loops do not form a compound statement.  GRID describes hitherto discovered
   properties of the loop that is evaluated for possible gridification.  */

static bool
grid_gfor_follows_tiling_pattern (gomp_for *gfor, grid_prop *grid)
{
  if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			   GRID_MISSED_MSG_PREFIX "an inner loop is not "
			   "a simple for loop\n");
	  dump_printf_loc (MSG_NOTE, gfor,
			   "This statement is not a simple for loop\n");
	}
      return false;
    }

  if (!grid_inner_loop_gridifiable_p (gfor, grid))
    return false;

  if (gimple_omp_for_collapse (gfor) != grid->collapse)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			   GRID_MISSED_MSG_PREFIX "an inner loop does not "
			   "have use the same collapse clause\n");
	  dump_printf_loc (MSG_NOTE, gfor,
			   "Loop construct uses a different collapse clause\n");
	}
      return false;
    }

  struct omp_for_data fd;
  struct omp_for_data_loop *loops
    = (struct omp_for_data_loop *)alloca (grid->collapse
					  * sizeof (struct omp_for_data_loop));
  omp_extract_for_data (gfor, &fd, loops);
  for (unsigned i = 0; i < grid->collapse; i++)
    {
      tree itype, type = TREE_TYPE (fd.loops[i].v);
      if (POINTER_TYPE_P (type))
	itype = signed_type_for (type);
      else
	itype = type;

      tree n1 = fold_convert (itype, fd.loops[i].n1);
      tree n2 = fold_convert (itype, fd.loops[i].n2);
      tree t = build_int_cst (itype,
			      (fd.loops[i].cond_code == LT_EXPR ? -1 : 1));
      t = fold_build2 (PLUS_EXPR, itype, fd.loops[i].step, t);
      t = fold_build2 (PLUS_EXPR, itype, t, n2);
      t = fold_build2 (MINUS_EXPR, itype, t, n1);
      if (TYPE_UNSIGNED (itype) && fd.loops[i].cond_code == GT_EXPR)
	t = fold_build2 (TRUNC_DIV_EXPR, itype,
			 fold_build1 (NEGATE_EXPR, itype, t),
			 fold_build1 (NEGATE_EXPR, itype, fd.loops[i].step));
      else
	t = fold_build2 (TRUNC_DIV_EXPR, itype, t, fd.loops[i].step);

      if (!operand_equal_p (grid->group_sizes[i], t, 0))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			       GRID_MISSED_MSG_PREFIX "the distribute and "
			       "an internal loop do not agree on tile size\n");
	      dump_printf_loc (MSG_NOTE, gfor,
			       "Loop construct does not seem to loop over "
			       "a tile size\n");
	    }
	  return false;
	}
    }
  return true;
}
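
/* A worked instance of the tile-size check above (illustrative numbers only):
   for an inner loop of the form "for (i = ii; i < ii + 64; i++)" the count
   computed into t is (step - 1 + n2 - n1) / step
   = (1 - 1 + (ii + 64) - ii) / 1 = 64, which must be structurally equal
   (operand_equal_p) to the distribute step recorded in
   grid->group_sizes[i].  */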

/* Facing a call to FNDECL in the body of a distribute construct, return true
   if we can handle it or false if it precludes gridification.  */

static bool
grid_call_permissible_in_distribute_p (tree fndecl)
{
  if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
    return true;

  const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
  if (strstr (name, "omp_") != name)
    return false;

  if ((strcmp (name, "omp_get_thread_num") == 0)
      || (strcmp (name, "omp_get_num_threads") == 0)
      || (strcmp (name, "omp_get_num_teams") == 0)
      || (strcmp (name, "omp_get_team_num") == 0)
      || (strcmp (name, "omp_get_level") == 0)
      || (strcmp (name, "omp_get_active_level") == 0)
      || (strcmp (name, "omp_in_parallel") == 0))
    return true;

  return false;
}

/* Facing a call satisfying grid_call_permissible_in_distribute_p in the body
   of a distribute construct that is pointed at by GSI, modify it as necessary
   for gridification.  If the statement itself got removed, return true.  */

static bool
grid_handle_call_in_distribute (gimple_stmt_iterator *gsi)
{
  gimple *stmt = gsi_stmt (*gsi);
  tree fndecl = gimple_call_fndecl (stmt);
  gcc_checking_assert (stmt);
  if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
    return false;

  const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
  if ((strcmp (name, "omp_get_thread_num") == 0)
      || (strcmp (name, "omp_get_level") == 0)
      || (strcmp (name, "omp_get_active_level") == 0)
      || (strcmp (name, "omp_in_parallel") == 0))
    {
      tree lhs = gimple_call_lhs (stmt);
      if (lhs)
	{
	  gassign *assign
	    = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
	  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
	}
      gsi_remove (gsi, true);
      return true;
    }

  /* The rest of the omp functions can stay as they are, HSA back-end will
     handle them correctly.  */
  gcc_checking_assert ((strcmp (name, "omp_get_num_threads") == 0)
		       || (strcmp (name, "omp_get_num_teams") == 0)
		       || (strcmp (name, "omp_get_team_num") == 0));
  return false;
}
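
/* For instance, a statement "x = omp_get_thread_num ();" appearing directly
   in the distribute part is rewritten by the function above into "x = 0;".
   In that part of a gridified kernel no parallel construct is active yet, so
   zero matches what the replaced OpenMP queries would return there.
   (Illustrative note; "x" is just a placeholder.)  */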

/* Given a sequence of statements within a distribute omp construct or a
   parallel construct, which in the original source does not form a compound
   construct with a looping construct, return true if it does not prevent us
   from turning it into a gridified HSA kernel.  Otherwise return false.  GRID
   describes hitherto discovered properties of the loop that is evaluated for
   possible gridification.  IN_PARALLEL must be true if seq is within a
   parallel construct and false if it is only within a distribute
   construct.  */

static bool
grid_dist_follows_tiling_pattern (gimple_seq seq, grid_prop *grid,
				  bool in_parallel)
{
  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (grid_safe_assignment_p (stmt, grid)
	  || gimple_code (stmt) == GIMPLE_GOTO
	  || gimple_code (stmt) == GIMPLE_LABEL
	  || gimple_code (stmt) == GIMPLE_COND)
	continue;
      else if (gbind *bind = dyn_cast <gbind *> (stmt))
	{
	  if (!grid_dist_follows_tiling_pattern (gimple_bind_body (bind),
						 grid, in_parallel))
	    return false;
	  continue;
	}
      else if (gtry *try_stmt = dyn_cast <gtry *> (stmt))
	{
	  if (gimple_try_kind (try_stmt) == GIMPLE_TRY_CATCH)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
				   GRID_MISSED_MSG_PREFIX "the distribute "
				   "construct contains a try..catch region\n");
		  dump_printf_loc (MSG_NOTE, try_stmt,
				   "This statement cannot be analyzed for "
				   "tiled gridification\n");
		}
	      return false;
	    }
	  if (!grid_dist_follows_tiling_pattern (gimple_try_eval (try_stmt),
						 grid, in_parallel))
	    return false;
	  if (!grid_dist_follows_tiling_pattern (gimple_try_cleanup (try_stmt),
						 grid, in_parallel))
	    return false;
	  continue;
	}
      else if (is_gimple_call (stmt))
	{
	  tree fndecl = gimple_call_fndecl (stmt);
	  if (fndecl && grid_call_permissible_in_distribute_p (fndecl))
	    continue;

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			       GRID_MISSED_MSG_PREFIX "the distribute "
			       "construct contains a call\n");
	      dump_printf_loc (MSG_NOTE, stmt,
			       "This statement cannot be analyzed for "
			       "tiled gridification\n");
	    }
	  return false;
	}
      else if (gomp_parallel *par = dyn_cast <gomp_parallel *> (stmt))
	{
	  if (in_parallel)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
				   GRID_MISSED_MSG_PREFIX "a parallel "
				   "construct contains another parallel "
				   "construct\n");
		  dump_printf_loc (MSG_NOTE, stmt,
				   "This parallel construct is nested in "
				   "another one\n");
		}
	      return false;
	    }
	  if (!grid_parallel_clauses_gridifiable (par, grid->target_loc)
	      || !grid_dist_follows_tiling_pattern (gimple_omp_body (par),
						    grid, true))
	    return false;
	}
      else if (gomp_for *gfor = dyn_cast <gomp_for *> (stmt))
	{
	  if (!in_parallel)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
				   GRID_MISSED_MSG_PREFIX "a loop "
				   "construct is not nested within a parallel "
				   "construct\n");
		  dump_printf_loc (MSG_NOTE, stmt,
				   "This loop construct is not nested in "
				   "a parallel construct\n");
		}
	      return false;
	    }
	  if (!grid_gfor_follows_tiling_pattern (gfor, grid))
	    return false;
	}
      else
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
			       GRID_MISSED_MSG_PREFIX "the distribute "
			       "construct contains a complex statement\n");
	      dump_printf_loc (MSG_NOTE, stmt,
			       "This statement cannot be analyzed for "
			       "tiled gridification\n");
	    }
	  return false;
	}
    }
  return true;
}

/* If TARGET follows a pattern that can be turned into a gridified HSA kernel,
   return true, otherwise return false.  In the case of success, also fill in
   GRID with information describing the kernel grid.  */

static bool
grid_target_follows_gridifiable_pattern (gomp_target *target, grid_prop *grid)
{
  if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION)
    return false;

  dump_user_location_t tloc = target;
  grid->target_loc = tloc;
  gimple *stmt
    = grid_find_single_omp_among_assignments (gimple_omp_body (target),
					      grid, "target");
  if (!stmt)
    return false;
  gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
  tree group_size = NULL;
  if (!teams)
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
		       GRID_MISSED_MSG_PREFIX "it does not have a sole teams "
		       "construct in it.\n");
      return false;
    }

  tree clauses = gimple_omp_teams_clauses (teams);
  while (clauses)
    {
      switch (OMP_CLAUSE_CODE (clauses))
	{
	case OMP_CLAUSE_NUM_TEAMS:
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			     GRID_MISSED_MSG_PREFIX "the teams construct "
			     "contains a num_teams clause\n ");
	  return false;

	case OMP_CLAUSE_REDUCTION:
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			     GRID_MISSED_MSG_PREFIX "a reduction "
			     "clause is present\n ");
	  return false;

	case OMP_CLAUSE_THREAD_LIMIT:
	  if (!integer_zerop (OMP_CLAUSE_OPERAND (clauses, 0)))
	    group_size = OMP_CLAUSE_OPERAND (clauses, 0);
	  break;

	default:
	  break;
	}
      clauses = OMP_CLAUSE_CHAIN (clauses);
    }

  stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), grid,
						  "teams");
  if (!stmt)
    return false;
  gomp_for *dist = dyn_cast <gomp_for *> (stmt);
  if (!dist)
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
		       GRID_MISSED_MSG_PREFIX "the teams construct does not "
		       "have a single distribute construct in it.\n");
      return false;
    }

  gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE);

  grid->collapse = gimple_omp_for_collapse (dist);
  if (grid->collapse > 3)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			 GRID_MISSED_MSG_PREFIX "the distribute construct "
			 "contains collapse clause with parameter greater "
			 "than 3\n");
      return false;
    }

  struct omp_for_data fd;
  struct omp_for_data_loop *dist_loops
    = (struct omp_for_data_loop *)alloca (grid->collapse
					  * sizeof (struct omp_for_data_loop));
  omp_extract_for_data (dist, &fd, dist_loops);
  if (fd.chunk_size)
    {
      if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			     GRID_MISSED_MSG_PREFIX "the teams "
			     "thread limit is different from distribute "
			     "schedule chunk\n");
	  return false;
	}
      group_size = fd.chunk_size;
    }
  if (group_size && grid->collapse > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			 GRID_MISSED_MSG_PREFIX "group size cannot be "
			 "set using thread_limit or schedule clauses "
			 "when also using a collapse clause greater than 1\n");
      return false;
    }

  if (gimple_omp_for_combined_p (dist))
    {
      grid->tiling = false;
      grid->group_sizes[0] = group_size;
      for (unsigned i = 1; i < grid->collapse; i++)
	grid->group_sizes[i] = NULL;
      return grid_dist_follows_simple_pattern (dist, grid);
    }
  else
    {
      grid->tiling = true;
      if (group_size)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
			     GRID_MISSED_MSG_PREFIX "group size cannot be set "
			     "using thread_limit or schedule clauses when "
			     "distribute and loop constructs do not form "
			     "one combined construct\n");
	  return false;
	}
      for (unsigned i = 0; i < grid->collapse; i++)
	{
	  if (fd.loops[i].cond_code == GT_EXPR)
	    grid->group_sizes[i] = fold_build1 (NEGATE_EXPR,
						TREE_TYPE (fd.loops[i].step),
						fd.loops[i].step);
	  else
	    grid->group_sizes[i] = fd.loops[i].step;
	}
      return grid_dist_follows_tiling_pattern (gimple_omp_body (dist), grid,
					       false);
    }
}

/* Operand walker, used to remap pre-body declarations according to a hash map
   provided in DATA.  */

static tree
grid_remap_prebody_decls (tree *tp, int *walk_subtrees, void *data)
{
  tree t = *tp;

  if (DECL_P (t) || TYPE_P (t))
    *walk_subtrees = 0;
  else
    *walk_subtrees = 1;

  if (VAR_P (t))
    {
      struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
      hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
      tree *repl = declmap->get (t);
      if (repl)
	*tp = *repl;
    }
  return NULL_TREE;
}

/* Identifiers of segments into which a particular variable should be placed
   when gridifying.  */

enum grid_var_segment {GRID_SEGMENT_PRIVATE, GRID_SEGMENT_GROUP,
		       GRID_SEGMENT_GLOBAL};

/* Mark VAR so that it is eventually placed into SEGMENT.  Place an artificial
   builtin call into SEQ that will make sure the variable is always considered
   address taken.  */

static void
grid_mark_variable_segment (tree var, enum grid_var_segment segment)
{
  /* Making variables non-addressable would require that we re-gimplify all
     their uses.  Fortunately, we do not have to do this because if they are
     not addressable, it means they are not used in atomic or parallel
     statements and so relaxed GPU consistency rules mean we can just keep them
     private.  */
  if (!TREE_ADDRESSABLE (var))
    return;

  switch (segment)
    {
    case GRID_SEGMENT_GROUP:
      DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_group_segment"),
					 NULL, DECL_ATTRIBUTES (var));
      break;
    case GRID_SEGMENT_GLOBAL:
      DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_global_segment"),
					 NULL, DECL_ATTRIBUTES (var));
      break;
    default:
      gcc_unreachable ();
    }

  if (!TREE_STATIC (var))
    {
      TREE_STATIC (var) = 1;
      varpool_node::finalize_decl (var);
    }

}

/* Copy leading register-type assignments to local variables in SRC to just
   before DST, creating temporaries, adjusting mapping of operands in WI and
   remapping operands as necessary.  Add any new temporaries to TGT_BIND.
   Return the first statement that does not conform to grid_safe_assignment_p
   or NULL.  If VAR_SEGMENT is not GRID_SEGMENT_PRIVATE, also mark all
   variables in traversed bind statements so that they are put into the
   appropriate segment.  */

static gimple *
grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst,
				     gbind *tgt_bind,
				     enum grid_var_segment var_segment,
				     struct walk_stmt_info *wi)
{
  hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (src); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      if (gbind *bind = dyn_cast <gbind *> (stmt))
	{
	  gimple *r = grid_copy_leading_local_assignments
	    (gimple_bind_body (bind), dst, tgt_bind, var_segment, wi);

	  if (var_segment != GRID_SEGMENT_PRIVATE)
	    for (tree var = gimple_bind_vars (bind);
		 var;
		 var = DECL_CHAIN (var))
	      grid_mark_variable_segment (var, var_segment);
	  if (r)
	    return r;
	  else
	    continue;
	}
      if (!grid_safe_assignment_p (stmt, NULL))
	return stmt;
      tree lhs = gimple_assign_lhs (as_a <gassign *> (stmt));
      tree repl = copy_var_decl (lhs, create_tmp_var_name (NULL),
				 TREE_TYPE (lhs));
      DECL_CONTEXT (repl) = current_function_decl;
      gimple_bind_append_vars (tgt_bind, repl);

      declmap->put (lhs, repl);
      gassign *copy = as_a <gassign *> (gimple_copy (stmt));
      walk_gimple_op (copy, grid_remap_prebody_decls, wi);
      gsi_insert_before (dst, copy, GSI_SAME_STMT);
    }
  return NULL;
}

/* Statement walker function to make adjustments to statements within the
   gridified kernel copy.  */

static tree
grid_process_grid_body (gimple_stmt_iterator *gsi, bool *handled_ops_p,
			struct walk_stmt_info *)
{
  *handled_ops_p = false;
  gimple *stmt = gsi_stmt (*gsi);
  if (gimple_code (stmt) == GIMPLE_OMP_FOR
      && (gimple_omp_for_kind (stmt) & GF_OMP_FOR_SIMD))
    {
      gomp_for *loop = as_a <gomp_for *> (stmt);
      tree clauses = gimple_omp_for_clauses (loop);
      tree cl = omp_find_clause (clauses, OMP_CLAUSE_SAFELEN);
      if (cl)
	OMP_CLAUSE_SAFELEN_EXPR (cl) = integer_one_node;
      else
	{
	  tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_SAFELEN);
	  OMP_CLAUSE_SAFELEN_EXPR (c) = integer_one_node;
	  OMP_CLAUSE_CHAIN (c) = clauses;
	  gimple_omp_for_set_clauses (loop, c);
	}
    }
  return NULL_TREE;
}

/* Given a PARLOOP that is a normal for looping construct but also a part of a
   combined construct with a simd loop, eliminate the simd loop.  */

static void
grid_eliminate_combined_simd_part (gomp_for *parloop)
{
  struct walk_stmt_info wi;

  memset (&wi, 0, sizeof (wi));
  wi.val_only = true;
  enum gf_mask msk = GF_OMP_FOR_SIMD;
  wi.info = (void *) &msk;
  walk_gimple_seq (gimple_omp_body (parloop), omp_find_combined_for, NULL, &wi);
  gimple *stmt = (gimple *) wi.info;
  /* We expect that the SIMD is the only statement in the parallel loop.  */
  gcc_assert (stmt
	      && gimple_code (stmt) == GIMPLE_OMP_FOR
	      && (gimple_omp_for_kind (stmt) == GF_OMP_FOR_SIMD)
	      && gimple_omp_for_combined_into_p (stmt)
	      && !gimple_omp_for_combined_p (stmt));
  gomp_for *simd = as_a <gomp_for *> (stmt);

  /* Copy over the iteration properties because the body refers to the index in
     the bottom-most loop.  */
  unsigned i, collapse = gimple_omp_for_collapse (parloop);
  gcc_checking_assert (collapse == gimple_omp_for_collapse (simd));
  for (i = 0; i < collapse; i++)
    {
      gimple_omp_for_set_index (parloop, i, gimple_omp_for_index (simd, i));
      gimple_omp_for_set_initial (parloop, i, gimple_omp_for_initial (simd, i));
      gimple_omp_for_set_final (parloop, i, gimple_omp_for_final (simd, i));
      gimple_omp_for_set_incr (parloop, i, gimple_omp_for_incr (simd, i));
    }

  tree *tgt = gimple_omp_for_clauses_ptr (parloop);
  while (*tgt)
    tgt = &OMP_CLAUSE_CHAIN (*tgt);

  /* Copy over all clauses, except for linear clauses, which are turned into
     private clauses, and all other simd-specific clauses, which are
     ignored.  */
  tree *pc = gimple_omp_for_clauses_ptr (simd);
  while (*pc)
    {
      tree c = *pc;
      switch (TREE_CODE (c))
	{
	case OMP_CLAUSE_LINEAR:
	  {
	    tree priv = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_PRIVATE);
	    OMP_CLAUSE_DECL (priv) = OMP_CLAUSE_DECL (c);
	    OMP_CLAUSE_CHAIN (priv) = NULL;
	    *tgt = priv;
	    tgt = &OMP_CLAUSE_CHAIN (priv);
	    pc = &OMP_CLAUSE_CHAIN (c);
	    break;
	  }

	case OMP_CLAUSE_SAFELEN:
	case OMP_CLAUSE_SIMDLEN:
	case OMP_CLAUSE_ALIGNED:
	  pc = &OMP_CLAUSE_CHAIN (c);
	  break;

	default:
	  *pc = OMP_CLAUSE_CHAIN (c);
	  OMP_CLAUSE_CHAIN (c) = NULL;
	  *tgt = c;
	  tgt = &OMP_CLAUSE_CHAIN (c);
	  break;
	}
    }

  /* Finally, throw away the simd and mark the parallel loop as not
     combined.  */
  gimple_omp_set_body (parloop, gimple_omp_body (simd));
  gimple_omp_for_set_combined_p (parloop, false);
}

/* Statement walker function marking all parallels as grid_phony and loops as
   grid ones representing threads of a particular thread group.  */

static tree
grid_mark_tiling_loops (gimple_stmt_iterator *gsi, bool *handled_ops_p,
			struct walk_stmt_info *wi_in)
{
  *handled_ops_p = false;
  if (gomp_for *loop = dyn_cast <gomp_for *> (gsi_stmt (*gsi)))
    {
      *handled_ops_p = true;
      gimple_omp_for_set_kind (loop, GF_OMP_FOR_KIND_GRID_LOOP);
      gimple_omp_for_set_grid_intra_group (loop, true);
      if (gimple_omp_for_combined_p (loop))
	grid_eliminate_combined_simd_part (loop);

      struct walk_stmt_info body_wi;
      memset (&body_wi, 0, sizeof (body_wi));
      walk_gimple_seq_mod (gimple_omp_body_ptr (loop),
			   grid_process_grid_body, NULL, &body_wi);

      gbind *bind = (gbind *) wi_in->info;
      tree c;
      for (c = gimple_omp_for_clauses (loop); c; c = OMP_CLAUSE_CHAIN (c))
	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE)
	  {
	    push_gimplify_context ();
	    tree ov = OMP_CLAUSE_DECL (c);
	    tree gv = copy_var_decl (ov, create_tmp_var_name (NULL),
				     TREE_TYPE (ov));

	    grid_mark_variable_segment (gv, GRID_SEGMENT_GROUP);
	    DECL_CONTEXT (gv) = current_function_decl;
	    gimple_bind_append_vars (bind, gv);
	    tree x = lang_hooks.decls.omp_clause_assign_op (c, gv, ov);
	    gimplify_and_add (x, &OMP_CLAUSE_LASTPRIVATE_GIMPLE_SEQ (c));
	    x = lang_hooks.decls.omp_clause_copy_ctor (c, ov, gv);
	    gimple_seq l = NULL;
	    gimplify_and_add (x, &l);
	    gsi_insert_seq_after (gsi, l, GSI_SAME_STMT);
	    pop_gimplify_context (bind);
	  }
    }
  return NULL_TREE;
}

/* Statement walker function marking all parallels as grid_phony and loops as
   grid ones representing threads of a particular thread group.  */

static tree
grid_mark_tiling_parallels_and_loops (gimple_stmt_iterator *gsi,
				      bool *handled_ops_p,
				      struct walk_stmt_info *wi_in)
{
  *handled_ops_p = false;
  wi_in->removed_stmt = false;
  gimple *stmt = gsi_stmt (*gsi);
  if (gbind *bind = dyn_cast <gbind *> (stmt))
    {
      for (tree var = gimple_bind_vars (bind); var; var = DECL_CHAIN (var))
	grid_mark_variable_segment (var, GRID_SEGMENT_GROUP);
    }
  else if (gomp_parallel *parallel = dyn_cast <gomp_parallel *> (stmt))
    {
      *handled_ops_p = true;
      gimple_omp_parallel_set_grid_phony (parallel, true);

      gbind *new_bind = gimple_build_bind (NULL, NULL, make_node (BLOCK));
      gimple_bind_set_body (new_bind, gimple_omp_body (parallel));
      gimple_seq s = NULL;
      gimple_seq_add_stmt (&s, new_bind);
      gimple_omp_set_body (parallel, s);

      struct walk_stmt_info wi_par;
      memset (&wi_par, 0, sizeof (wi_par));
      wi_par.info = new_bind;
      walk_gimple_seq_mod (gimple_bind_body_ptr (new_bind),
			   grid_mark_tiling_loops, NULL, &wi_par);
    }
  else if (is_a <gcall *> (stmt))
    wi_in->removed_stmt = grid_handle_call_in_distribute (gsi);
  return NULL_TREE;
}

/* Given freshly copied top level kernel SEQ, identify the individual OMP
   components, mark them as part of kernel, copy assignment leading to them
   just before DST, remapping them using WI and adding new temporaries to
   TGT_BIND, and return the loop that will be used for kernel dispatch.  */

static gomp_for *
grid_process_kernel_body_copy (grid_prop *grid, gimple_seq seq,
			       gimple_stmt_iterator *dst,
			       gbind *tgt_bind, struct walk_stmt_info *wi)
{
  gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind,
						      GRID_SEGMENT_GLOBAL, wi);
  gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
  gcc_assert (teams);
  gimple_omp_teams_set_grid_phony (teams, true);
  stmt = grid_copy_leading_local_assignments (gimple_omp_body (teams), dst,
					      tgt_bind, GRID_SEGMENT_GLOBAL,
					      wi);
  gcc_checking_assert (stmt);
  gomp_for *dist = dyn_cast <gomp_for *> (stmt);
  gcc_assert (dist);
  gimple_seq prebody = gimple_omp_for_pre_body (dist);
  if (prebody)
    grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
					 GRID_SEGMENT_GROUP, wi);

  if (grid->tiling)
    {
      gimple_omp_for_set_kind (dist, GF_OMP_FOR_KIND_GRID_LOOP);
      gimple_omp_for_set_grid_group_iter (dist, true);

      struct walk_stmt_info wi_tiled;
      memset (&wi_tiled, 0, sizeof (wi_tiled));
      walk_gimple_seq_mod (gimple_omp_body_ptr (dist),
			   grid_mark_tiling_parallels_and_loops, NULL,
			   &wi_tiled);
      return dist;
    }
  else
    {
      gimple_omp_for_set_grid_phony (dist, true);
      stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst,
						  tgt_bind,
						  GRID_SEGMENT_PRIVATE, wi);
      gcc_checking_assert (stmt);
      gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
      gimple_omp_parallel_set_grid_phony (parallel, true);
      stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel),
						  dst, tgt_bind,
						  GRID_SEGMENT_PRIVATE, wi);
      gomp_for *inner_loop = as_a <gomp_for *> (stmt);
      gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP);
      prebody = gimple_omp_for_pre_body (inner_loop);
      if (prebody)
	grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
					     GRID_SEGMENT_PRIVATE, wi);

      if (gimple_omp_for_combined_p (inner_loop))
	grid_eliminate_combined_simd_part (inner_loop);
      struct walk_stmt_info body_wi;
      memset (&body_wi, 0, sizeof (body_wi));
      walk_gimple_seq_mod (gimple_omp_body_ptr (inner_loop),
			   grid_process_grid_body, NULL, &body_wi);

      return inner_loop;
    }
}

/* If TARGET points to a GOMP_TARGET which follows a gridifiable pattern,
   create a GPU kernel for it.  GSI must point to the same statement, TGT_BIND
   is the bind into which temporaries inserted before TARGET should be
   added.  */

static void
grid_attempt_target_gridification (gomp_target *target,
				   gimple_stmt_iterator *gsi,
				   gbind *tgt_bind)
{
  /* removed group_size */
  grid_prop grid = {};
  if (!target || !grid_target_follows_gridifiable_pattern (target, &grid))
    return;

  location_t loc = gimple_location (target);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, target,
		     "Target construct will be turned into a gridified HSA "
		     "kernel\n");

  /* Copy target body to a GPUKERNEL construct:  */
  gimple_seq kernel_seq = copy_gimple_seq_and_replace_locals
    (gimple_omp_body (target));

  hash_map<tree, tree> *declmap = new hash_map<tree, tree>;
  struct walk_stmt_info wi;
  memset (&wi, 0, sizeof (struct walk_stmt_info));
  wi.info = declmap;

  /* Copy assignments in between OMP statements before target, mark OMP
     statements within copy appropriately.  */
  gomp_for *inner_loop = grid_process_kernel_body_copy (&grid, kernel_seq, gsi,
							tgt_bind, &wi);

  gbind *old_bind
    = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
  gbind *new_bind = as_a <gbind *> (gimple_seq_first (kernel_seq));
  tree new_block = gimple_bind_block (new_bind);
  tree enc_block = BLOCK_SUPERCONTEXT (gimple_bind_block (old_bind));
  BLOCK_CHAIN (new_block) = BLOCK_SUBBLOCKS (enc_block);
  BLOCK_SUBBLOCKS (enc_block) = new_block;
  BLOCK_SUPERCONTEXT (new_block) = enc_block;
  gimple *gpukernel = gimple_build_omp_grid_body (kernel_seq);
  gimple_seq_add_stmt
    (gimple_bind_body_ptr (as_a <gbind *> (gimple_omp_body (target))),
     gpukernel);

  for (size_t i = 0; i < grid.collapse; i++)
    walk_tree (&grid.group_sizes[i], grid_remap_prebody_decls, &wi, NULL);
  push_gimplify_context ();
  for (size_t i = 0; i < grid.collapse; i++)
    {
      tree itype, type = TREE_TYPE (gimple_omp_for_index (inner_loop, i));
      if (POINTER_TYPE_P (type))
	itype = signed_type_for (type);
      else
	itype = type;

      enum tree_code cond_code = gimple_omp_for_cond (inner_loop, i);
      tree n1 = unshare_expr (gimple_omp_for_initial (inner_loop, i));
      walk_tree (&n1, grid_remap_prebody_decls, &wi, NULL);
      tree n2 = unshare_expr (gimple_omp_for_final (inner_loop, i));
      walk_tree (&n2, grid_remap_prebody_decls, &wi, NULL);
      omp_adjust_for_condition (loc, &cond_code, &n2);
      n1 = fold_convert (itype, n1);
      n2 = fold_convert (itype, n2);

      tree cond = fold_build2 (cond_code, boolean_type_node, n1, n2);
      tree step
	= omp_get_for_step_from_incr (loc, gimple_omp_for_incr (inner_loop, i));

      tree t = build_int_cst (itype, (cond_code == LT_EXPR ? -1 : 1));
      t = fold_build2 (PLUS_EXPR, itype, step, t);
      t = fold_build2 (PLUS_EXPR, itype, t, n2);
      t = fold_build2 (MINUS_EXPR, itype, t, n1);
      if (TYPE_UNSIGNED (itype) && cond_code == GT_EXPR)
	t = fold_build2 (TRUNC_DIV_EXPR, itype,
			 fold_build1 (NEGATE_EXPR, itype, t),
			 fold_build1 (NEGATE_EXPR, itype, step));
      else
	t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
      t = fold_build3 (COND_EXPR, itype, cond, t, build_zero_cst (itype));
      if (grid.tiling)
	{
	  if (cond_code == GT_EXPR)
	    step = fold_build1 (NEGATE_EXPR, itype, step);
	  t = fold_build2 (MULT_EXPR, itype, t, step);
	}

      tree gs = fold_convert (uint32_type_node, t);
      gimple_seq tmpseq = NULL;
      gimplify_expr (&gs, &tmpseq, NULL, is_gimple_val, fb_rvalue);
      if (!gimple_seq_empty_p (tmpseq))
	gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);

      tree ws;
      if (grid.group_sizes[i])
	{
	  ws = fold_convert (uint32_type_node, grid.group_sizes[i]);
	  tmpseq = NULL;
	  gimplify_expr (&ws, &tmpseq, NULL, is_gimple_val, fb_rvalue);
	  if (!gimple_seq_empty_p (tmpseq))
	    gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);
	}
      else
	ws = build_zero_cst (uint32_type_node);

      tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__GRIDDIM_);
      OMP_CLAUSE__GRIDDIM__DIMENSION (c) = i;
      OMP_CLAUSE__GRIDDIM__SIZE (c) = gs;
      OMP_CLAUSE__GRIDDIM__GROUP (c) = ws;
      OMP_CLAUSE_CHAIN (c) = gimple_omp_target_clauses (target);
      gimple_omp_target_set_clauses (target, c);
    }
  pop_gimplify_context (tgt_bind);
  delete declmap;
  return;
}

/* Walker function doing all the work for create_target_kernels.  */

static tree
grid_gridify_all_targets_stmt (gimple_stmt_iterator *gsi,
			       bool *handled_ops_p,
			       struct walk_stmt_info *incoming)
{
  *handled_ops_p = false;

  gimple *stmt = gsi_stmt (*gsi);
  gomp_target *target = dyn_cast <gomp_target *> (stmt);
  if (target)
    {
      gbind *tgt_bind = (gbind *) incoming->info;
      gcc_checking_assert (tgt_bind);
      grid_attempt_target_gridification (target, gsi, tgt_bind);
      return NULL_TREE;
    }
  gbind *bind = dyn_cast <gbind *> (stmt);
  if (bind)
    {
      *handled_ops_p = true;
      struct walk_stmt_info wi;
      memset (&wi, 0, sizeof (wi));
      wi.info = bind;
      walk_gimple_seq_mod (gimple_bind_body_ptr (bind),
			   grid_gridify_all_targets_stmt, NULL, &wi);
    }
  return NULL_TREE;
}

/* Attempt to gridify all target constructs in BODY_P.  All such targets will
   have their bodies duplicated, with the new copy being put into a
   gimple_omp_grid_body statement.  All kernel-related constructs within the
   grid_body will be marked with phony flags or kernel kinds.  Moreover, some
   re-structuring is often needed, such as copying pre-bodies before the target
   construct so that kernel grid sizes can be computed.  */

void
omp_grid_gridify_all_targets (gimple_seq *body_p)
{
  struct walk_stmt_info wi;
  memset (&wi, 0, sizeof (wi));
  walk_gimple_seq_mod (body_p, grid_gridify_all_targets_stmt, NULL, &wi);
}