Add unroll and jam pass
* gimple-loop-jam.c: New file. * Makefile.in (OBJS): Add gimple-loop-jam.o. * common.opt (funroll-and-jam): New option. * opts.c (default_options_table): Add unroll-and-jam at -O3. * params.def (PARAM_UNROLL_JAM_MIN_PERCENT): New param. (PARAM_UNROLL_JAM_MAX_UNROLL): Ditto. * passes.def: Add pass_loop_jam. * timevar.def (TV_LOOP_JAM): Add. * tree-pass.h (make_pass_loop_jam): Declare. * cfgloop.c (flow_loop_tree_node_add): Add AFTER argument. * cfgloop.h (flow_loop_tree_node_add): Adjust declaration. * cfgloopmanip.c (duplicate_loop): Add AFTER argument, adjust call to flow_loop_tree_node_add. (duplicate_subloops, copy_loops_to): Append to sibling list. * cfgloopmanip.h: (duplicate_loop): Adjust declaration. * doc/invoke.texi (-funroll-and-jam): Document new option. (unroll-jam-min-percent, unroll-jam-max-unroll): Document new params. testsuite/ * gcc.dg/unroll-and-jam.c: New test. From-SVN: r255467
This commit is contained in:
parent
5a40ae3c3a
commit
1cc521f1a8
|
@ -1,3 +1,25 @@
|
|||
2017-12-07 Michael Matz <matz@suse.de>
|
||||
|
||||
Add unroll and jam pass
|
||||
|
||||
* gimple-loop-jam.c: New file.
|
||||
* Makefile.in (OBJS): Add gimple-loop-jam.o.
|
||||
* common.opt (funroll-and-jam): New option.
|
||||
* opts.c (default_options_table): Add unroll-and-jam at -O3.
|
||||
* params.def (PARAM_UNROLL_JAM_MIN_PERCENT): New param.
|
||||
(PARAM_UNROLL_JAM_MAX_UNROLL): Ditto.
|
||||
* passes.def: Add pass_loop_jam.
|
||||
* timevar.def (TV_LOOP_JAM): Add.
|
||||
* tree-pass.h (make_pass_loop_jam): Declare.
|
||||
* cfgloop.c (flow_loop_tree_node_add): Add AT argument.
|
||||
* cfgloop.h (flow_loop_tree_node_add): Adjust declaration.
|
||||
* cfgloopmanip.c (duplicate_loop): Add AT argument, adjust call
|
||||
to flow_loop_tree_node_add.
|
||||
(duplicate_subloops, copy_loops_to): Append to sibling list.
|
||||
* cfgloopmanip.h: (duplicate_loop): Adjust declaration.
|
||||
* doc/invoke.texi (-funroll-and-jam): Document new option.
|
||||
(unroll-jam-min-percent, unroll-jam-max-unroll): Document new params.
|
||||
|
||||
2017-12-07 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR tree-optimization/83296
|
||||
|
|
|
@ -1302,6 +1302,7 @@ OBJS = \
|
|||
gimple-iterator.o \
|
||||
gimple-fold.o \
|
||||
gimple-laddress.o \
|
||||
gimple-loop-jam.o \
|
||||
gimple-low.o \
|
||||
gimple-pretty-print.o \
|
||||
gimple-ssa-backprop.o \
|
||||
|
|
|
@ -296,13 +296,25 @@ establish_preds (struct loop *loop, struct loop *father)
|
|||
|
||||
/* Add LOOP to the loop hierarchy tree where FATHER is father of the
|
||||
added loop. If LOOP has some children, take care of that their
|
||||
pred field will be initialized correctly. */
|
||||
pred field will be initialized correctly. If AFTER is non-null
|
||||
then it's expected it's a pointer into FATHERs inner sibling
|
||||
list and LOOP is added behind AFTER, otherwise it's added in front
|
||||
of FATHERs siblings. */
|
||||
|
||||
void
|
||||
flow_loop_tree_node_add (struct loop *father, struct loop *loop)
|
||||
flow_loop_tree_node_add (struct loop *father, struct loop *loop,
|
||||
struct loop *after)
|
||||
{
|
||||
loop->next = father->inner;
|
||||
father->inner = loop;
|
||||
if (after)
|
||||
{
|
||||
loop->next = after->next;
|
||||
after->next = loop;
|
||||
}
|
||||
else
|
||||
{
|
||||
loop->next = father->inner;
|
||||
father->inner = loop;
|
||||
}
|
||||
|
||||
establish_preds (loop, father);
|
||||
}
|
||||
|
|
|
@ -342,7 +342,8 @@ void rescan_loop_exit (edge, bool, bool);
|
|||
void sort_sibling_loops (function *);
|
||||
|
||||
/* Loop data structure manipulation/querying. */
|
||||
extern void flow_loop_tree_node_add (struct loop *, struct loop *);
|
||||
extern void flow_loop_tree_node_add (struct loop *, struct loop *,
|
||||
struct loop * = NULL);
|
||||
extern void flow_loop_tree_node_remove (struct loop *);
|
||||
extern bool flow_loop_nested_p (const struct loop *, const struct loop *);
|
||||
extern bool flow_bb_inside_loop_p (const struct loop *, const_basic_block);
|
||||
|
|
|
@ -1000,9 +1000,11 @@ copy_loop_info (struct loop *loop, struct loop *target)
|
|||
}
|
||||
|
||||
/* Copies copy of LOOP as subloop of TARGET loop, placing newly
|
||||
created loop into loops structure. */
|
||||
created loop into loops structure. If AFTER is non-null
|
||||
the new loop is added at AFTER->next, otherwise in front of TARGETs
|
||||
sibling list. */
|
||||
struct loop *
|
||||
duplicate_loop (struct loop *loop, struct loop *target)
|
||||
duplicate_loop (struct loop *loop, struct loop *target, struct loop *after)
|
||||
{
|
||||
struct loop *cloop;
|
||||
cloop = alloc_loop ();
|
||||
|
@ -1014,36 +1016,46 @@ duplicate_loop (struct loop *loop, struct loop *target)
|
|||
set_loop_copy (loop, cloop);
|
||||
|
||||
/* Add it to target. */
|
||||
flow_loop_tree_node_add (target, cloop);
|
||||
flow_loop_tree_node_add (target, cloop, after);
|
||||
|
||||
return cloop;
|
||||
}
|
||||
|
||||
/* Copies structure of subloops of LOOP into TARGET loop, placing
|
||||
newly created loops into loop tree. */
|
||||
newly created loops into loop tree at the end of TARGETs sibling
|
||||
list in the original order. */
|
||||
void
|
||||
duplicate_subloops (struct loop *loop, struct loop *target)
|
||||
{
|
||||
struct loop *aloop, *cloop;
|
||||
struct loop *aloop, *cloop, *tail;
|
||||
|
||||
for (tail = target->inner; tail && tail->next; tail = tail->next)
|
||||
;
|
||||
for (aloop = loop->inner; aloop; aloop = aloop->next)
|
||||
{
|
||||
cloop = duplicate_loop (aloop, target);
|
||||
cloop = duplicate_loop (aloop, target, tail);
|
||||
tail = cloop;
|
||||
gcc_assert(!tail->next);
|
||||
duplicate_subloops (aloop, cloop);
|
||||
}
|
||||
}
|
||||
|
||||
/* Copies structure of subloops of N loops, stored in array COPIED_LOOPS,
|
||||
into TARGET loop, placing newly created loops into loop tree. */
|
||||
into TARGET loop, placing newly created loops into loop tree adding
|
||||
them to TARGETs sibling list at the end in order. */
|
||||
static void
|
||||
copy_loops_to (struct loop **copied_loops, int n, struct loop *target)
|
||||
{
|
||||
struct loop *aloop;
|
||||
struct loop *aloop, *tail;
|
||||
int i;
|
||||
|
||||
for (tail = target->inner; tail && tail->next; tail = tail->next)
|
||||
;
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
aloop = duplicate_loop (copied_loops[i], target);
|
||||
aloop = duplicate_loop (copied_loops[i], target, tail);
|
||||
tail = aloop;
|
||||
gcc_assert(!tail->next);
|
||||
duplicate_subloops (copied_loops[i], aloop);
|
||||
}
|
||||
}
|
||||
|
@ -1072,14 +1084,15 @@ can_duplicate_loop_p (const struct loop *loop)
|
|||
}
|
||||
|
||||
/* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating
|
||||
loop structure and dominators. E's destination must be LOOP header for
|
||||
this to work, i.e. it must be entry or latch edge of this loop; these are
|
||||
unique, as the loops must have preheaders for this function to work
|
||||
correctly (in case E is latch, the function unrolls the loop, if E is entry
|
||||
edge, it peels the loop). Store edges created by copying ORIG edge from
|
||||
copies corresponding to set bits in WONT_EXIT bitmap (bit 0 corresponds to
|
||||
original LOOP body, the other copies are numbered in order given by control
|
||||
flow through them) into TO_REMOVE array. Returns false if duplication is
|
||||
loop structure and dominators (order of inner subloops is retained).
|
||||
E's destination must be LOOP header for this to work, i.e. it must be entry
|
||||
or latch edge of this loop; these are unique, as the loops must have
|
||||
preheaders for this function to work correctly (in case E is latch, the
|
||||
function unrolls the loop, if E is entry edge, it peels the loop). Store
|
||||
edges created by copying ORIG edge from copies corresponding to set bits in
|
||||
WONT_EXIT bitmap (bit 0 corresponds to original LOOP body, the other copies
|
||||
are numbered in order given by control flow through them) into TO_REMOVE
|
||||
array. Returns false if duplication is
|
||||
impossible. */
|
||||
|
||||
bool
|
||||
|
|
|
@ -47,7 +47,8 @@ extern struct loop *loopify (edge, edge,
|
|||
profile_probability, profile_probability);
|
||||
extern void unloop (struct loop *, bool *, bitmap);
|
||||
extern void copy_loop_info (struct loop *loop, struct loop *target);
|
||||
extern struct loop * duplicate_loop (struct loop *, struct loop *);
|
||||
extern struct loop * duplicate_loop (struct loop *, struct loop *,
|
||||
struct loop * = NULL);
|
||||
extern void duplicate_subloops (struct loop *, struct loop *);
|
||||
extern bool can_duplicate_loop_p (const struct loop *loop);
|
||||
extern bool duplicate_loop_to_header_edge (struct loop *, edge,
|
||||
|
|
|
@ -2695,6 +2695,10 @@ fsplit-loops
|
|||
Common Report Var(flag_split_loops) Optimization
|
||||
Perform loop splitting.
|
||||
|
||||
funroll-and-jam
|
||||
Common Report Var(flag_unroll_jam) Optimization
|
||||
Perform unroll-and-jam on loops.
|
||||
|
||||
funwind-tables
|
||||
Common Report Var(flag_unwind_tables) Optimization
|
||||
Just generate unwind tables for exception handling.
|
||||
|
|
|
@ -437,7 +437,7 @@ Objective-C and Objective-C++ Dialects}.
|
|||
-ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
|
||||
-ftree-switch-conversion -ftree-tail-merge @gol
|
||||
-ftree-ter -ftree-vectorize -ftree-vrp -funconstrained-commons @gol
|
||||
-funit-at-a-time -funroll-all-loops -funroll-loops @gol
|
||||
-funit-at-a-time -funroll-all-loops -funroll-loops -funroll-and-jam @gol
|
||||
-funsafe-math-optimizations -funswitch-loops @gol
|
||||
-fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
|
||||
-fweb -fwhole-program -fwpa -fuse-linker-plugin @gol
|
||||
|
@ -9771,6 +9771,12 @@ for one side of the iteration space and false for the other.
|
|||
Move branches with loop invariant conditions out of the loop, with duplicates
|
||||
of the loop on both branches (modified according to result of the condition).
|
||||
|
||||
@item -funroll-and-jam
|
||||
@opindex funroll-and-jam
|
||||
Apply unroll and jam transoformations on feasible loops. In a loop
|
||||
nest this unrolls the outer loop by some factor and fuses the resulting
|
||||
multiple inner loops.
|
||||
|
||||
@item -ffunction-sections
|
||||
@itemx -fdata-sections
|
||||
@opindex ffunction-sections
|
||||
|
@ -10838,6 +10844,14 @@ we may be able to devirtualize speculatively.
|
|||
@item max-vrp-switch-assertions
|
||||
The maximum number of assertions to add along the default edge of a switch
|
||||
statement during VRP. The default is 10.
|
||||
|
||||
@item unroll-jam-min-percent
|
||||
The minimum percentage of memory references that must be optimized
|
||||
away for the unroll-and-jam transformation to be considered profitable.
|
||||
|
||||
@item unroll-jam-max-unroll
|
||||
The maximum number of times the outer loop should be unrolled by
|
||||
the unroll-and-jam transformation.
|
||||
@end table
|
||||
@end table
|
||||
|
||||
|
|
|
@ -0,0 +1,569 @@
|
|||
/* Loop unroll-and-jam.
|
||||
Copyright (C) 2017 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
GCC is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU General Public License as published by the
|
||||
Free Software Foundation; either version 3, or (at your option) any
|
||||
later version.
|
||||
|
||||
GCC is distributed in the hope that it will be useful, but WITHOUT
|
||||
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with GCC; see the file COPYING3. If not see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "config.h"
|
||||
#include "system.h"
|
||||
#include "coretypes.h"
|
||||
#include "params.h"
|
||||
#include "tree-pass.h"
|
||||
#include "backend.h"
|
||||
#include "tree.h"
|
||||
#include "gimple.h"
|
||||
#include "ssa.h"
|
||||
#include "fold-const.h"
|
||||
#include "tree-cfg.h"
|
||||
#include "tree-ssa.h"
|
||||
#include "tree-ssa-loop-niter.h"
|
||||
#include "tree-ssa-loop.h"
|
||||
#include "tree-ssa-loop-manip.h"
|
||||
#include "cfgloop.h"
|
||||
#include "tree-scalar-evolution.h"
|
||||
#include "gimple-iterator.h"
|
||||
#include "cfghooks.h"
|
||||
#include "tree-data-ref.h"
|
||||
#include "tree-ssa-loop-ivopts.h"
|
||||
#include "tree-vectorizer.h"
|
||||
|
||||
/* Unroll and Jam transformation
|
||||
|
||||
This is a combination of two transformations, where the second
|
||||
is not always valid. It's applicable if a loop nest has redundancies
|
||||
over the iterations of an outer loop while not having that with
|
||||
an inner loop.
|
||||
|
||||
Given this nest:
|
||||
for (i) {
|
||||
for (j) {
|
||||
B(i,j)
|
||||
}
|
||||
}
|
||||
|
||||
first unroll:
|
||||
for (i by 2) {
|
||||
for (j) {
|
||||
B(i,j)
|
||||
}
|
||||
for (j) {
|
||||
B(i+1,j)
|
||||
}
|
||||
}
|
||||
|
||||
then fuse the two adjacent inner loops resulting from that:
|
||||
for (i by 2) {
|
||||
for (j) {
|
||||
B(i,j)
|
||||
B(i+1,j)
|
||||
}
|
||||
}
|
||||
|
||||
As the order of evaluations of the body B changes this is valid
|
||||
only in certain situations: all distance vectors need to be forward.
|
||||
Additionally if there are multiple induction variables than just
|
||||
a counting control IV (j above) we can also deal with some situations.
|
||||
|
||||
The validity is checked by unroll_jam_possible_p, and the data-dep
|
||||
testing below.
|
||||
|
||||
A trivial example where the fusion is wrong would be when
|
||||
B(i,j) == x[j-1] = x[j];
|
||||
for (i by 2) {
|
||||
for (j) {
|
||||
x[j-1] = x[j];
|
||||
}
|
||||
for (j) {
|
||||
x[j-1] = x[j];
|
||||
}
|
||||
} effect: move content to front by two elements
|
||||
-->
|
||||
for (i by 2) {
|
||||
for (j) {
|
||||
x[j-1] = x[j];
|
||||
x[j-1] = x[j];
|
||||
}
|
||||
} effect: move content to front by one element
|
||||
*/
|
||||
|
||||
/* Modify the loop tree for the fact that all code once belonging
|
||||
to the OLD loop or the outer loop of OLD now is inside LOOP. */
|
||||
|
||||
static void
|
||||
merge_loop_tree (struct loop *loop, struct loop *old)
|
||||
{
|
||||
basic_block *bbs;
|
||||
int i, n;
|
||||
struct loop *subloop;
|
||||
edge e;
|
||||
edge_iterator ei;
|
||||
|
||||
/* Find its nodes. */
|
||||
bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
|
||||
n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
/* If the block was direct child of OLD loop it's now part
|
||||
of LOOP. If it was outside OLD, then it moved into LOOP
|
||||
as well. This avoids changing the loop father for BBs
|
||||
in inner loops of OLD. */
|
||||
if (bbs[i]->loop_father == old
|
||||
|| loop_depth (bbs[i]->loop_father) < loop_depth (old))
|
||||
{
|
||||
remove_bb_from_loops (bbs[i]);
|
||||
add_bb_to_loop (bbs[i], loop);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* If we find a direct subloop of OLD, move it to LOOP. */
|
||||
subloop = bbs[i]->loop_father;
|
||||
if (loop_outer (subloop) == old && subloop->header == bbs[i])
|
||||
{
|
||||
flow_loop_tree_node_remove (subloop);
|
||||
flow_loop_tree_node_add (loop, subloop);
|
||||
}
|
||||
}
|
||||
|
||||
/* Update the information about loop exit edges. */
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
FOR_EACH_EDGE (e, ei, bbs[i]->succs)
|
||||
{
|
||||
rescan_loop_exit (e, false, false);
|
||||
}
|
||||
}
|
||||
|
||||
loop->num_nodes = n;
|
||||
|
||||
free (bbs);
|
||||
}
|
||||
|
||||
/* BB exits the outer loop of an unroll-and-jam situation.
|
||||
Check if any statements therein would prevent the transformation. */
|
||||
|
||||
static bool
|
||||
bb_prevents_fusion_p (basic_block bb)
|
||||
{
|
||||
gimple_stmt_iterator gsi;
|
||||
/* BB is duplicated by outer unrolling and then all N-1 first copies
|
||||
move into the body of the fused inner loop. The last copy remains
|
||||
the exit block of the outer loop and is still outside the inner loop
|
||||
also after fusion. We can't allow this for some effects of BB:
|
||||
* stores or unknown side-effects prevent fusion
|
||||
* loads don't
|
||||
* computations into SSA names: these aren't problematic. Their
|
||||
result will be unused on the exit edges of the first N-1 copies
|
||||
(those aren't taken after unrolling). If they are used on the
|
||||
other edge (the one leading to the outer latch block) they are
|
||||
loop-carried (on the outer loop) and the Nth copy of BB will
|
||||
compute them again (i.e. the first N-1 copies will be dead). */
|
||||
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
|
||||
{
|
||||
gimple *g = gsi_stmt (gsi);
|
||||
if (gimple_vdef (g) || gimple_has_side_effects (g))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Given an inner loop LOOP (of some OUTER loop) determine if
|
||||
we can safely fuse copies of it (generated by outer unrolling).
|
||||
If so return true, otherwise return false. */
|
||||
|
||||
static bool
|
||||
unroll_jam_possible_p (struct loop *outer, struct loop *loop)
|
||||
{
|
||||
basic_block *bbs;
|
||||
int i, n;
|
||||
struct tree_niter_desc niter;
|
||||
|
||||
/* When fusing the loops we skip the latch block
|
||||
of the first one, so it mustn't have any effects to
|
||||
preserve. */
|
||||
if (!empty_block_p (loop->latch))
|
||||
return false;
|
||||
|
||||
if (!single_exit (loop))
|
||||
return false;
|
||||
|
||||
/* We need a perfect nest. Quick check for adjacent inner loops. */
|
||||
if (outer->inner != loop || loop->next)
|
||||
return false;
|
||||
|
||||
/* The number of iterations of the inner loop must be loop invariant
|
||||
with respect to the outer loop. */
|
||||
if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
|
||||
false, true)
|
||||
|| niter.cmp == ERROR_MARK
|
||||
|| !integer_zerop (niter.may_be_zero)
|
||||
|| !expr_invariant_in_loop_p (outer, niter.niter))
|
||||
return false;
|
||||
|
||||
/* And check blocks belonging to just outer loop. */
|
||||
bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
|
||||
n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
if (bbs[i]->loop_father == outer
|
||||
&& bbs[i] != outer->latch && bbs[i] != outer->header
|
||||
&& (!loop_exits_from_bb_p (outer, bbs[i])
|
||||
|| bb_prevents_fusion_p (bbs[i])))
|
||||
break;
|
||||
/* XXX Note that the above disallows head-controlled inner loops,
|
||||
that we usually have. The guard block would need to be accepted
|
||||
(invariant condition either entering or skipping the loop),
|
||||
without also accepting arbitrary control flow. When unswitching
|
||||
ran before us (as with -O3) this won't be a problem because its
|
||||
outer loop unswitching will have moved out the invariant condition.
|
||||
|
||||
If we do that we need to extend fuse_loops() to cope with this
|
||||
by threading through the (still invariant) copied condition
|
||||
between the two loop copies. */
|
||||
}
|
||||
free (bbs);
|
||||
if (i != n)
|
||||
return false;
|
||||
|
||||
/* For now we can safely fuse copies of LOOP only if all
|
||||
loop carried variables are inductions (or the virtual op).
|
||||
|
||||
We could handle reductions as well (the initial value in the second
|
||||
body would be the after-iter value of the first body) if it's over
|
||||
an associative and commutative operation. We wouldn't
|
||||
be able to handle unknown cycles. */
|
||||
gphi_iterator psi;
|
||||
for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
|
||||
{
|
||||
affine_iv iv;
|
||||
tree op = gimple_phi_result (psi.phi ());
|
||||
|
||||
if (virtual_operand_p (op))
|
||||
continue;
|
||||
if (!simple_iv (loop, loop, op, &iv, true))
|
||||
return false;
|
||||
/* The inductions must be regular, loop invariant step and initial
|
||||
value. */
|
||||
if (!expr_invariant_in_loop_p (outer, iv.step)
|
||||
|| !expr_invariant_in_loop_p (outer, iv.base))
|
||||
return false;
|
||||
/* XXX With more effort we could also be able to deal with inductions
|
||||
where the initial value is loop variant but a simple IV in the
|
||||
outer loop. The initial value for the second body would be
|
||||
the original initial value plus iv.base.step. The next value
|
||||
for the fused loop would be the original next value of the first
|
||||
copy, _not_ the next value of the second body. */
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Fuse LOOP with all further neighbors. The loops are expected to
|
||||
be in appropriate form. */
|
||||
|
||||
static void
|
||||
fuse_loops (struct loop *loop)
|
||||
{
|
||||
struct loop *next = loop->next;
|
||||
|
||||
while (next)
|
||||
{
|
||||
edge e;
|
||||
|
||||
remove_branch (single_pred_edge (loop->latch));
|
||||
/* Make delete_basic_block not fiddle with the loop structure. */
|
||||
basic_block oldlatch = loop->latch;
|
||||
loop->latch = NULL;
|
||||
delete_basic_block (oldlatch);
|
||||
e = redirect_edge_and_branch (loop_latch_edge (next),
|
||||
loop->header);
|
||||
loop->latch = e->src;
|
||||
flush_pending_stmts (e);
|
||||
|
||||
gcc_assert (EDGE_COUNT (next->header->preds) == 1);
|
||||
|
||||
/* The PHI nodes of the second body (single-argument now)
|
||||
need adjustments to use the right values: either directly
|
||||
the value of the corresponding PHI in the first copy or
|
||||
the one leaving the first body which unrolling did for us.
|
||||
|
||||
See also unroll_jam_possible_p() for further possibilities. */
|
||||
gphi_iterator psi_first, psi_second;
|
||||
e = single_pred_edge (next->header);
|
||||
for (psi_first = gsi_start_phis (loop->header),
|
||||
psi_second = gsi_start_phis (next->header);
|
||||
!gsi_end_p (psi_first);
|
||||
gsi_next (&psi_first), gsi_next (&psi_second))
|
||||
{
|
||||
gphi *phi_first = psi_first.phi ();
|
||||
gphi *phi_second = psi_second.phi ();
|
||||
tree firstop = gimple_phi_result (phi_first);
|
||||
/* The virtual operand is correct already as it's
|
||||
always live at exit, hence has a LCSSA node and outer
|
||||
loop unrolling updated SSA form. */
|
||||
if (virtual_operand_p (firstop))
|
||||
continue;
|
||||
|
||||
/* Due to unroll_jam_possible_p() we know that this is
|
||||
an induction. The second body goes over the same
|
||||
iteration space. */
|
||||
add_phi_arg (phi_second, firstop, e,
|
||||
gimple_location (phi_first));
|
||||
}
|
||||
gcc_assert (gsi_end_p (psi_second));
|
||||
|
||||
merge_loop_tree (loop, next);
|
||||
gcc_assert (!next->num_nodes);
|
||||
struct loop *ln = next->next;
|
||||
delete_loop (next);
|
||||
next = ln;
|
||||
}
|
||||
rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
|
||||
}
|
||||
|
||||
/* Returns true if the distance in DDR can be determined and adjusts
|
||||
the unroll factor in *UNROLL to make unrolling valid for that distance.
|
||||
Otherwise return false.
|
||||
|
||||
If this data dep can lead to a removed memory reference, increment
|
||||
*REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
|
||||
for this to happen. */
|
||||
|
||||
static bool
|
||||
adjust_unroll_factor (struct data_dependence_relation *ddr,
|
||||
unsigned *unroll, unsigned *profit_unroll,
|
||||
unsigned *removed)
|
||||
{
|
||||
bool ret = false;
|
||||
if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
|
||||
{
|
||||
if (DDR_NUM_DIST_VECTS (ddr) == 0)
|
||||
return false;
|
||||
unsigned i;
|
||||
lambda_vector dist_v;
|
||||
FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
|
||||
{
|
||||
/* A distance (a,b) is at worst transformed into (a/N,b) by the
|
||||
unrolling (factor N), so the transformation is valid if
|
||||
a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
|
||||
factor needs to be limited so that the first condition holds.
|
||||
That may limit the factor down to zero in the worst case. */
|
||||
int dist = dist_v[0];
|
||||
if (dist < 0)
|
||||
gcc_unreachable ();
|
||||
else if ((unsigned)dist >= *unroll)
|
||||
;
|
||||
else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
|
||||
|| (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
|
||||
&& dist > 0))
|
||||
;
|
||||
else
|
||||
*unroll = dist;
|
||||
|
||||
/* With a distance (a,0) it's always profitable to unroll-and-jam
|
||||
(by a+1), because one memory reference will go away. With
|
||||
(a,b) and b != 0 that's less clear. We will increase the
|
||||
number of streams without lowering the number of mem refs.
|
||||
So for now only handle the first situation. */
|
||||
if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
|
||||
{
|
||||
*profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
|
||||
(*removed)++;
|
||||
}
|
||||
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Main entry point for the unroll-and-jam transformation
|
||||
described above. */
|
||||
|
||||
static unsigned int
|
||||
tree_loop_unroll_and_jam (void)
|
||||
{
|
||||
struct loop *loop;
|
||||
bool changed = false;
|
||||
|
||||
gcc_assert (scev_initialized_p ());
|
||||
|
||||
/* Go through all innermost loops. */
|
||||
FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
|
||||
{
|
||||
struct loop *outer = loop_outer (loop);
|
||||
|
||||
if (loop_depth (loop) < 2
|
||||
|| optimize_loop_nest_for_size_p (outer))
|
||||
continue;
|
||||
|
||||
if (!unroll_jam_possible_p (outer, loop))
|
||||
continue;
|
||||
|
||||
vec<data_reference_p> datarefs;
|
||||
vec<ddr_p> dependences;
|
||||
unsigned unroll_factor, profit_unroll, removed;
|
||||
struct tree_niter_desc desc;
|
||||
bool unroll = false;
|
||||
|
||||
auto_vec<loop_p, 3> loop_nest;
|
||||
dependences.create (10);
|
||||
datarefs.create (10);
|
||||
if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
|
||||
&datarefs, &dependences))
|
||||
{
|
||||
if (dump_file && (dump_flags & TDF_DETAILS))
|
||||
fprintf (dump_file, "Cannot analyze data dependencies\n");
|
||||
free_data_refs (datarefs);
|
||||
free_dependence_relations (dependences);
|
||||
return false;
|
||||
}
|
||||
if (!datarefs.length ())
|
||||
continue;
|
||||
|
||||
if (dump_file && (dump_flags & TDF_DETAILS))
|
||||
dump_data_dependence_relations (dump_file, dependences);
|
||||
|
||||
unroll_factor = (unsigned)-1;
|
||||
profit_unroll = 1;
|
||||
removed = 0;
|
||||
|
||||
/* Check all dependencies. */
|
||||
unsigned i;
|
||||
struct data_dependence_relation *ddr;
|
||||
FOR_EACH_VEC_ELT (dependences, i, ddr)
|
||||
{
|
||||
struct data_reference *dra, *drb;
|
||||
|
||||
/* If the refs are independend there's nothing to do. */
|
||||
if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
|
||||
continue;
|
||||
dra = DDR_A (ddr);
|
||||
drb = DDR_B (ddr);
|
||||
/* Nothing interesting for the self dependencies. */
|
||||
if (dra == drb)
|
||||
continue;
|
||||
|
||||
/* Now check the distance vector, for determining a sensible
|
||||
outer unroll factor, and for validity of merging the inner
|
||||
loop copies. */
|
||||
if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
|
||||
&removed))
|
||||
{
|
||||
/* Couldn't get the distance vector. For two reads that's
|
||||
harmless (we assume we should unroll). For at least
|
||||
one write this means we can't check the dependence direction
|
||||
and hence can't determine safety. */
|
||||
|
||||
if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
|
||||
{
|
||||
unroll_factor = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We regard a user-specified minimum percentage of zero as a request
|
||||
to ignore all profitability concerns and apply the transformation
|
||||
always. */
|
||||
if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
|
||||
profit_unroll = 2;
|
||||
else if (removed * 100 / datarefs.length ()
|
||||
< (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
|
||||
profit_unroll = 1;
|
||||
if (unroll_factor > profit_unroll)
|
||||
unroll_factor = profit_unroll;
|
||||
if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
|
||||
unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
|
||||
unroll = (unroll_factor > 1
|
||||
&& can_unroll_loop_p (outer, unroll_factor, &desc));
|
||||
|
||||
if (unroll)
|
||||
{
|
||||
if (dump_enabled_p ())
|
||||
dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
|
||||
find_loop_location (outer),
|
||||
"applying unroll and jam with factor %d\n",
|
||||
unroll_factor);
|
||||
initialize_original_copy_tables ();
|
||||
tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
|
||||
&desc);
|
||||
free_original_copy_tables ();
|
||||
fuse_loops (outer->inner);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
loop_nest.release ();
|
||||
free_dependence_relations (dependences);
|
||||
free_data_refs (datarefs);
|
||||
}
|
||||
|
||||
if (changed)
|
||||
{
|
||||
scev_reset ();
|
||||
free_dominance_info (CDI_DOMINATORS);
|
||||
return TODO_cleanup_cfg;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Pass boilerplate */
|
||||
|
||||
namespace {
|
||||
|
||||
const pass_data pass_data_loop_jam =
|
||||
{
|
||||
GIMPLE_PASS, /* type */
|
||||
"unrolljam", /* name */
|
||||
OPTGROUP_LOOP, /* optinfo_flags */
|
||||
TV_LOOP_JAM, /* tv_id */
|
||||
PROP_cfg, /* properties_required */
|
||||
0, /* properties_provided */
|
||||
0, /* properties_destroyed */
|
||||
0, /* todo_flags_start */
|
||||
0, /* todo_flags_finish */
|
||||
};
|
||||
|
||||
class pass_loop_jam : public gimple_opt_pass
|
||||
{
|
||||
public:
|
||||
pass_loop_jam (gcc::context *ctxt)
|
||||
: gimple_opt_pass (pass_data_loop_jam, ctxt)
|
||||
{}
|
||||
|
||||
/* opt_pass methods: */
|
||||
virtual bool gate (function *) { return flag_unroll_jam != 0; }
|
||||
virtual unsigned int execute (function *);
|
||||
|
||||
};
|
||||
|
||||
unsigned int
|
||||
pass_loop_jam::execute (function *fun)
|
||||
{
|
||||
if (number_of_loops (fun) <= 1)
|
||||
return 0;
|
||||
|
||||
return tree_loop_unroll_and_jam ();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
gimple_opt_pass *
|
||||
make_pass_loop_jam (gcc::context *ctxt)
|
||||
{
|
||||
return new pass_loop_jam (ctxt);
|
||||
}
|
|
@ -535,6 +535,7 @@ static const struct default_options default_options_table[] =
|
|||
{ OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 },
|
||||
{ OPT_LEVELS_3_PLUS, OPT_fsplit_loops, NULL, 1 },
|
||||
{ OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 },
|
||||
{ OPT_LEVELS_3_PLUS, OPT_funroll_and_jam, NULL, 1 },
|
||||
{ OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 },
|
||||
{ OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 },
|
||||
{ OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 },
|
||||
|
|
|
@ -1293,6 +1293,16 @@ DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK,
|
|||
"Enable loop epilogue vectorization using smaller vector size.",
|
||||
0, 0, 1)
|
||||
|
||||
DEFPARAM(PARAM_UNROLL_JAM_MIN_PERCENT,
|
||||
"unroll-jam-min-percent",
|
||||
"Minimum percentage of memrefs that must go away for unroll-and-jam to be considered profitable.",
|
||||
1, 0, 100)
|
||||
|
||||
DEFPARAM(PARAM_UNROLL_JAM_MAX_UNROLL,
|
||||
"unroll-jam-max-unroll",
|
||||
"Maximum unroll factor for the unroll-and-jam transformation.",
|
||||
4, 0, 0)
|
||||
|
||||
/*
|
||||
|
||||
Local variables:
|
||||
|
|
|
@ -273,6 +273,7 @@ along with GCC; see the file COPYING3. If not see
|
|||
NEXT_PASS (pass_tree_unswitch);
|
||||
NEXT_PASS (pass_scev_cprop);
|
||||
NEXT_PASS (pass_loop_split);
|
||||
NEXT_PASS (pass_loop_jam);
|
||||
/* All unswitching, final value replacement and splitting can expose
|
||||
empty loops. Remove them now. */
|
||||
NEXT_PASS (pass_cd_dce);
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
2017-12-07 Michael Matz <matz@suse.de>
|
||||
|
||||
* gcc.dg/unroll-and-jam.c: New test.
|
||||
|
||||
2017-12-07 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR tree-optimization/83296
|
||||
|
|
|
@ -0,0 +1,111 @@
|
|||
/* { dg-do run } */
|
||||
/* { dg-options "-O3 -funroll-and-jam --param unroll-jam-min-percent=0 -fdump-tree-unrolljam-details" } */
|
||||
/* { dg-require-effective-target int32plus } */
|
||||
|
||||
#include <stdio.h>
|
||||
extern unsigned int a[];
|
||||
extern unsigned int b[];
|
||||
extern unsigned int aa[][1024];
|
||||
unsigned int checksum;
|
||||
void checkaa(void)
|
||||
{
|
||||
unsigned sum = 1;
|
||||
unsigned long i, j;
|
||||
for (i = 0; i < 1024; i++) {
|
||||
for (j = 0; j < 16; j++) {
|
||||
sum += aa[j][i]*31+47;
|
||||
}
|
||||
}
|
||||
checksum = checksum * 27 + sum;
|
||||
//printf(" %d\n", sum);
|
||||
}
|
||||
|
||||
void checkb(void)
|
||||
{
|
||||
unsigned sum = 1;
|
||||
unsigned long i, j;
|
||||
for (i = 0; i < 1024; i++) {
|
||||
sum += b[i]*31+47;
|
||||
}
|
||||
checksum = checksum * 27 + sum;
|
||||
//printf(" %d\n", sum);
|
||||
}
|
||||
|
||||
#define TEST(name, body, test) \
|
||||
static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
|
||||
{ \
|
||||
unsigned long i, j; \
|
||||
for (i = 1; i < m; i++) { \
|
||||
for (j = 1; j < n; j++) { \
|
||||
body; \
|
||||
} \
|
||||
} \
|
||||
test; \
|
||||
} \
|
||||
static void __attribute__((noinline,noclone,optimize("O1"))) name ## noopt (unsigned long n, unsigned long m) \
|
||||
{ \
|
||||
unsigned long i, j; \
|
||||
for (i = 1; i < m; i++) { \
|
||||
for (j = 1; j < n; j++) { \
|
||||
body; \
|
||||
} \
|
||||
} \
|
||||
test; \
|
||||
}
|
||||
TEST(foo1, aa[i+1][j+1]=aa[i][j] * aa[i][j] / 2, checkaa()) //ok, -1,-1
|
||||
TEST(foo2, aa[i][j+1]=3*aa[i+1][j], checkaa()) //notok, 1,-1
|
||||
TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, checkaa()) //notok, -1,1
|
||||
TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1
|
||||
TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
|
||||
TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
|
||||
TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
|
||||
TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
|
||||
TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
|
||||
|
||||
/* foo8 should work as well, but currently doesn't because the distance
|
||||
vectors we compute are too pessimistic. We compute
|
||||
(0,1), (1,1) and (1,-1)
|
||||
and the last one causes us to lose. */
|
||||
TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
|
||||
|
||||
unsigned int a[1024];
|
||||
unsigned int b[1024];
|
||||
unsigned int aa[16][1024];
|
||||
void init(void)
|
||||
{
|
||||
unsigned long i,j;
|
||||
for (i = 0; i < 1024; i++) {
|
||||
for (j = 0; j < 16; j++) {
|
||||
aa[j][i] = ((j+1)*2+i+1) % 17;
|
||||
}
|
||||
a[i] = ((i+1)*31) % 19;
|
||||
b[i] = ((i+1)*47) % 23;
|
||||
}
|
||||
checksum = 1;
|
||||
}
|
||||
|
||||
#define RUN(name) \
|
||||
printf(" %s\n", #name); \
|
||||
init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
|
||||
init();for(i=0;i<4;i++)name(32,8); \
|
||||
printf("%sok %s\n", checka != checksum ? "NOT " : "", #name);
|
||||
|
||||
int main()
|
||||
{
|
||||
int i;
|
||||
unsigned checka;
|
||||
RUN(foo1);
|
||||
RUN(foo2);
|
||||
RUN(foo3);
|
||||
RUN(foo4);
|
||||
RUN(foo5);
|
||||
RUN(foo6);
|
||||
RUN(foo7);
|
||||
RUN(foo8);
|
||||
RUN(foo9);
|
||||
RUN(foo10);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Five loops should be unroll-jammed (actually six, but see above). */
|
||||
/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */
|
|
@ -188,6 +188,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "tree canonical iv")
|
|||
DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop")
|
||||
DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching")
|
||||
DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting")
|
||||
DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam")
|
||||
DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
|
||||
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
|
||||
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
|
||||
|
|
|
@ -370,6 +370,7 @@ extern gimple_opt_pass *make_pass_tree_loop_init (gcc::context *ctxt);
|
|||
extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_loop_jam (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_scev_cprop (gcc::context *ctxt);
|
||||
|
|
Loading…
Reference in New Issue