/* Loop unroll-and-jam.
   Copyright (C) 2017-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
|
|
#include "system.h"
|
|
#include "coretypes.h"
|
|
#include "params.h"
|
|
#include "tree-pass.h"
|
|
#include "backend.h"
|
|
#include "tree.h"
|
|
#include "gimple.h"
|
|
#include "ssa.h"
|
|
#include "fold-const.h"
|
|
#include "tree-cfg.h"
|
|
#include "tree-ssa.h"
|
|
#include "tree-ssa-loop-niter.h"
|
|
#include "tree-ssa-loop.h"
|
|
#include "tree-ssa-loop-manip.h"
|
|
#include "cfgloop.h"
|
|
#include "tree-scalar-evolution.h"
|
|
#include "gimple-iterator.h"
|
|
#include "cfghooks.h"
|
|
#include "tree-data-ref.h"
|
|
#include "tree-ssa-loop-ivopts.h"
|
|
#include "tree-vectorizer.h"
|
|
|
|
/* Unroll and Jam transformation

   This is a combination of two transformations, where the second
   is not always valid.  It's applicable if a loop nest has redundancies
   over the iterations of an outer loop while not having that with
   an inner loop.

   Given this nest:
       for (i) {
         for (j) {
           B(i,j)
         }
       }

   first unroll:
       for (i by 2) {
         for (j) {
           B(i,j)
         }
         for (j) {
           B(i+1,j)
         }
       }

   then fuse the two adjacent inner loops resulting from that:
       for (i by 2) {
         for (j) {
           B(i,j)
           B(i+1,j)
         }
       }

   As the order of evaluations of the body B changes this is valid
   only in certain situations: all distance vectors need to be forward.
   Additionally, if there are induction variables other than just
   a counting control IV (j above) we can also deal with some situations.

   The validity is checked by unroll_jam_possible_p, and the data-dep
   testing below.

   A trivial example where the fusion is wrong would be when
   B(i,j) == x[j-1] = x[j];
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
         }
         for (j) {
           x[j-1] = x[j];
         }
       } effect: move content to front by two elements
       -->
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
           x[j-1] = x[j];
         }
       } effect: move content to front by one element
*/

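/* For exposition, an illustrative example where the fusion is both valid
   and profitable, B(i,j) == p[j] += a[i][j]:
       for (i by 2) {
         for (j) {
           p[j] += a[i][j];
           p[j] += a[i+1][j];
         }
       }
   The only dependence carried by the outer loop is on p[j], with distance
   vector (1,0), i.e. forward, so the fusion is valid; and because both
   copies now access p[j] in the same inner iteration, later passes can keep
   p[j] in a register and remove one load/store pair.  */
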
/* Modify the loop tree for the fact that all code once belonging
   to the OLD loop or the outer loop of OLD now is inside LOOP.  */

static void
merge_loop_tree (struct loop *loop, struct loop *old)
{
  basic_block *bbs;
  int i, n;
  struct loop *subloop;
  edge e;
  edge_iterator ei;

  /* Find its nodes.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    {
      /* If the block was direct child of OLD loop it's now part
         of LOOP.  If it was outside OLD, then it moved into LOOP
         as well.  This avoids changing the loop father for BBs
         in inner loops of OLD.  */
      if (bbs[i]->loop_father == old
          || loop_depth (bbs[i]->loop_father) < loop_depth (old))
        {
          remove_bb_from_loops (bbs[i]);
          add_bb_to_loop (bbs[i], loop);
          continue;
        }

      /* If we find a direct subloop of OLD, move it to LOOP.  */
      subloop = bbs[i]->loop_father;
      if (loop_outer (subloop) == old && subloop->header == bbs[i])
        {
          flow_loop_tree_node_remove (subloop);
          flow_loop_tree_node_add (loop, subloop);
        }
    }

  /* Update the information about loop exit edges.  */
  for (i = 0; i < n; i++)
    {
      FOR_EACH_EDGE (e, ei, bbs[i]->succs)
        {
          rescan_loop_exit (e, false, false);
        }
    }

  loop->num_nodes = n;

  free (bbs);
}

/* BB is part of the outer loop of an unroll-and-jam situation.
   Check if any statements therein would prevent the transformation.  */

static bool
bb_prevents_fusion_p (basic_block bb)
{
  gimple_stmt_iterator gsi;
  /* BB is duplicated by outer unrolling and then all N-1 first copies
     move into the body of the fused inner loop.  If BB exits the outer loop
     the last copy still does so, and the first N-1 copies are cancelled
     by loop unrolling, so also after fusion it's the exit block.
     But there might be other reasons that prevent fusion:
       * stores or unknown side-effects prevent fusion
       * loads don't
       * computations into SSA names: these aren't problematic.  Their
         result will be unused on the exit edges of the first N-1 copies
         (those aren't taken after unrolling).  If they are used on the
         other edge (the one leading to the outer latch block) they are
         loop-carried (on the outer loop) and the Nth copy of BB will
         compute them again (i.e. the first N-1 copies will be dead).  */
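  /* For instance (illustration only): a store to memory or a call such as
     printf () in this block has a vdef or side effects and makes the loop
     below return true (preventing fusion), while pure arithmetic into SSA
     names does not.  */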
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *g = gsi_stmt (gsi);
      if (gimple_vdef (g) || gimple_has_side_effects (g))
        return true;
    }
  return false;
}

/* Given an inner loop LOOP (of some OUTER loop) determine if
   we can safely fuse copies of it (generated by outer unrolling).
   If so return true, otherwise return false.  */

static bool
unroll_jam_possible_p (struct loop *outer, struct loop *loop)
{
  basic_block *bbs;
  int i, n;
  struct tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     preserve.  */
  if (!empty_block_p (loop->latch))
    return false;

  if (!single_exit (loop))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* Prevent head-controlled inner loops, which we usually have.
     The guard block would need to be accepted
     (invariant condition either entering or skipping the loop),
     without also accepting arbitrary control flow.  When unswitching
     ran before us (as with -O3) this won't be a problem because its
     outer loop unswitching will have moved out the invariant condition.

     If we do accept the guard block we need to extend fuse_loops () to cope
     with this by threading through the (still invariant) copied condition
     between the two loop copies.  */
  if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  */
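  /* For instance (illustration only), a triangular nest where the inner
     bound depends on the outer IV, as in for (j = 0; j < i; j++), has a
     niter expression that varies with the outer loop and is rejected by
     the check below.  */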
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
                                  false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* If the inner loop produces any values that are used inside the
     outer loop (except the virtual op) then it can flow
     back (perhaps indirectly) into the inner loop.  This prevents
     fusion: without fusion the value at the last iteration is used,
     with fusion the value after the initial iteration is used.

     If all uses are outside the outer loop this doesn't prevent fusion;
     the value of the last iteration is still used (and the values from
     all intermediate iterations are dead).  */
  gphi_iterator psi;
  for (psi = gsi_start_phis (single_exit (loop)->dest);
       !gsi_end_p (psi); gsi_next (&psi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      tree op = gimple_phi_result (psi.phi ());
      if (virtual_operand_p (op))
        continue;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
        {
          gimple *use_stmt = USE_STMT (use_p);
          if (!is_gimple_debug (use_stmt)
              && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
            return false;
        }
    }

  /* And check blocks belonging to just the outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
      break;
  free (bbs);
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop-carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if it's over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
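  /* As an illustration: a PHI for a counting IV like j = j + 1 passes the
     simple_iv test below, whereas a reduction such as sum += a[i][j],
     whose header PHI is not an affine induction, makes us give up.  */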
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
        continue;
      if (!simple_iv (loop, loop, op, &iv, true))
        return false;
      /* The inductions must be regular, loop invariant step and initial
         value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
          || !expr_invariant_in_loop_p (outer, iv.base))
        return false;
      /* XXX With more effort we could also be able to deal with inductions
         where the initial value is loop variant but a simple IV in the
         outer loop.  The initial value for the second body would be
         the original initial value plus iv.base.step.  The next value
         for the fused loop would be the original next value of the first
         copy, _not_ the next value of the second body.  */
    }

  return true;
}

/* Fuse LOOP with all further neighbors.  The loops are expected to
   be in appropriate form.  */

static void
fuse_loops (struct loop *loop)
{
  struct loop *next = loop->next;

  while (next)
    {
      edge e;

      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      e = redirect_edge_and_branch (loop_latch_edge (next),
                                    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
         need adjustments to use the right values: either directly
         the value of the corresponding PHI in the first copy or
         the one leaving the first body which unrolling did for us.

         See also unroll_jam_possible_p() for further possibilities.  */
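      /* E.g. (illustration only) for a counting IV j both bodies iterate
         over the same j values, so the second header's single-argument PHI
         can simply reuse the result of the first header's PHI, which is
         what the add_phi_arg call below does.  */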
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      for (psi_first = gsi_start_phis (loop->header),
           psi_second = gsi_start_phis (next->header);
           !gsi_end_p (psi_first);
           gsi_next (&psi_first), gsi_next (&psi_second))
        {
          gphi *phi_first = psi_first.phi ();
          gphi *phi_second = psi_second.phi ();
          tree firstop = gimple_phi_result (phi_first);
          /* The virtual operand is correct already as it's
             always live at exit, hence has a LCSSA node and outer
             loop unrolling updated SSA form.  */
          if (virtual_operand_p (firstop))
            continue;

          /* Due to unroll_jam_possible_p() we know that this is
             an induction.  The second body goes over the same
             iteration space.  */
          add_phi_arg (phi_second, firstop, e,
                       gimple_location (phi_first));
        }
      gcc_assert (gsi_end_p (psi_second));

      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      struct loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}

/* Returns true if the distance in DDR can be determined and adjusts
   the unroll factor in *UNROLL to make unrolling valid for that distance.
   Otherwise return false.

   If this data dep can lead to a removed memory reference, increment
   *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
   for this to happen.  */

static bool
adjust_unroll_factor (struct data_dependence_relation *ddr,
                      unsigned *unroll, unsigned *profit_unroll,
                      unsigned *removed)
{
  bool ret = false;
  if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
    {
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
        return false;
      unsigned i;
      lambda_vector dist_v;
      FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
        {
          /* A distance (a,b) is at worst transformed into (a/N,b) by the
             unrolling (factor N), so the transformation is valid if
             a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
             factor needs to be limited so that the first condition holds.
             That may limit the factor down to zero in the worst case.  */
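          /* Worked illustration of the checks below: a distance of (0,1)
             or (2,0) does not limit the unroll factor, while (3,-1)
             reaches the final else and caps the factor at 3.  */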
          int dist = dist_v[0];
          if (dist < 0)
            gcc_unreachable ();
          else if ((unsigned)dist >= *unroll)
            ;
          else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
                   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
                       && dist > 0))
            ;
          else
            *unroll = dist;

          /* With a distance (a,0) it's always profitable to unroll-and-jam
             (by a+1), because one memory reference will go away.  With
             (a,b) and b != 0 that's less clear.  We will increase the
             number of streams without lowering the number of mem refs.
             So for now only handle the first situation.  */
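          /* E.g. (illustration only) a distance of (2,0) means the
             reference recurs two outer iterations later, so unrolling by
             2 + 1 = 3 brings both accesses into one fused body; hence
             dist + 1 below.  */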
          if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
              (*removed)++;
            }

          ret = true;
        }
    }
  return ret;
}

/* Main entry point for the unroll-and-jam transformation
   described above.  */

static unsigned int
tree_loop_unroll_and_jam (void)
{
  struct loop *loop;
  bool changed = false;

  gcc_assert (scev_initialized_p ());

  /* Go through all innermost loops.  */
  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
    {
      struct loop *outer = loop_outer (loop);

      if (loop_depth (loop) < 2
          || optimize_loop_nest_for_size_p (outer))
        continue;

      if (!unroll_jam_possible_p (outer, loop))
        continue;

      vec<data_reference_p> datarefs;
      vec<ddr_p> dependences;
      unsigned unroll_factor, profit_unroll, removed;
      struct tree_niter_desc desc;
      bool unroll = false;

      auto_vec<loop_p, 3> loop_nest;
      dependences.create (10);
      datarefs.create (10);
      if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
                                              &datarefs, &dependences))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "Cannot analyze data dependencies\n");
          free_data_refs (datarefs);
          free_dependence_relations (dependences);
          continue;
        }
      if (!datarefs.length ())
        continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
        dump_data_dependence_relations (dump_file, dependences);

      unroll_factor = (unsigned)-1;
      profit_unroll = 1;
      removed = 0;

      /* Check all dependencies.  */
      unsigned i;
      struct data_dependence_relation *ddr;
      FOR_EACH_VEC_ELT (dependences, i, ddr)
        {
          struct data_reference *dra, *drb;

          /* If the refs are independent there's nothing to do.  */
          if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
            continue;
          dra = DDR_A (ddr);
          drb = DDR_B (ddr);
          /* Nothing interesting for the self dependencies.  */
          if (dra == drb)
            continue;

          /* Now check the distance vector, for determining a sensible
             outer unroll factor, and for validity of merging the inner
             loop copies.  */
          if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
                                     &removed))
            {
              /* Couldn't get the distance vector.  For two reads that's
                 harmless (we assume we should unroll).  For at least
                 one write this means we can't check the dependence direction
                 and hence can't determine safety.  */

              if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
                {
                  unroll_factor = 0;
                  break;
                }
            }
        }

      /* We regard a user-specified minimum percentage of zero as a request
         to ignore all profitability concerns and always apply the
         transformation.  */
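      /* Illustration of the arithmetic below: if 2 of 8 analyzed data
         references can be removed, the removal rate is 2 * 100 / 8 = 25,
         which is compared against unroll-jam-min-percent.  */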
      if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
        profit_unroll = 2;
      else if (removed * 100 / datarefs.length ()
               < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
        profit_unroll = 1;
      if (unroll_factor > profit_unroll)
        unroll_factor = profit_unroll;
      if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
        unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
      unroll = (unroll_factor > 1
                && can_unroll_loop_p (outer, unroll_factor, &desc));

      if (unroll)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
                             find_loop_location (outer),
                             "applying unroll and jam with factor %d\n",
                             unroll_factor);
          initialize_original_copy_tables ();
          tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
                            &desc);
          free_original_copy_tables ();
          fuse_loops (outer->inner);
          changed = true;
        }

      loop_nest.release ();
      free_dependence_relations (dependences);
      free_data_refs (datarefs);
    }

  if (changed)
    {
      scev_reset ();
      free_dominance_info (CDI_DOMINATORS);
      return TODO_cleanup_cfg;
    }
  return 0;
}

/* Pass boilerplate */

namespace {

const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
  virtual unsigned int execute (function *);
};

unsigned int
pass_loop_jam::execute (function *fun)
{
  if (number_of_loops (fun) <= 1)
    return 0;

  return tree_loop_unroll_and_jam ();
}

}

gimple_opt_pass *
make_pass_loop_jam (gcc::context *ctxt)
{
  return new pass_loop_jam (ctxt);
}