tree-loop-distribution.c: Adjust the general comment.

* tree-loop-distribution.c: Adjust the general comment.
	(NUM_PARTITION_THRESHOLD): New macro.
	(ssa_name_has_uses_outside_loop_p): Support loop nest distribution.
	(classify_partition): Skip builtin pattern of loop nest's inner loop.
	(merge_dep_scc_partitions): New parameter ignore_alias_p and use it
	in call to build_partition_graph.
	(finalize_partitions): New parameter.  Make loop distribution more
	conservative by fusing more partitions.
	(distribute_loop): Don't do runtime alias check in case of loop nest
	distribution.
	(find_seed_stmts_for_distribution): New function.
	(prepare_perfect_loop_nest): New function.
	(pass_loop_distribution::execute): Refactor code finding seed stmts
	and loop nest into above functions.  Support loop nest distribution.
	Adjust dump information accordingly.

	gcc/testsuite
	* gcc.dg/tree-ssa/ldist-7.c: Adjust test string.
	* gcc.dg/tree-ssa/ldist-16.c: Ditto.
	* gcc.dg/tree-ssa/ldist-25.c: Ditto.
	* gcc.dg/tree-ssa/ldist-33.c: New test.

From-SVN: r253679
This commit is contained in:
Bin Cheng 2017-10-12 14:29:45 +00:00 committed by Bin Cheng
parent 6dc29d3adc
commit 163aa51b70
7 changed files with 230 additions and 105 deletions

View File

@ -1,3 +1,21 @@
2017-10-12 Bin Cheng <bin.cheng@arm.com>
* tree-loop-distribution.c: Adjust the general comment.
(NUM_PARTITION_THRESHOLD): New macro.
(ssa_name_has_uses_outside_loop_p): Support loop nest distribution.
(classify_partition): Skip builtin pattern of loop nest's inner loop.
(merge_dep_scc_partitions): New parameter ignore_alias_p and use it
in call to build_partition_graph.
(finalize_partitions): New parameter. Make loop distribution more
conservative by fusing more partitions.
(distribute_loop): Don't do runtime alias check in case of loop nest
distribution.
(find_seed_stmts_for_distribution): New function.
(prepare_perfect_loop_nest): New function.
(pass_loop_distribution::execute): Refactor code finding seed stmts
and loop nest into above functions. Support loop nest distribution.
Adjust dump information accordingly.
2017-10-12 Bin Cheng <bin.cheng@arm.com>
* tree-loop-distribution.c (break_alias_scc_partitions): Add comment

View File

@ -1,3 +1,10 @@
2017-10-12 Bin Cheng <bin.cheng@arm.com>
* gcc.dg/tree-ssa/ldist-7.c: Adjust test string.
* gcc.dg/tree-ssa/ldist-16.c: Ditto.
* gcc.dg/tree-ssa/ldist-25.c: Ditto.
* gcc.dg/tree-ssa/ldist-33.c: New test.
2017-10-12 Richard Biener <rguenther@suse.de>
PR tree-optimization/69728

View File

@ -16,5 +16,5 @@ void foo (int n)
/* We should not apply loop distribution and not generate a memset (0). */
/* { dg-final { scan-tree-dump "Loop 1 is the same" "ldist" } } */
/* { dg-final { scan-tree-dump "Loop 1 not distributed" "ldist" } } */
/* { dg-final { scan-tree-dump-times "generated memset zero" 0 "ldist" } } */

View File

@ -22,4 +22,4 @@ foo (void)
}
}
/* { dg-final { scan-tree-dump "Loop . is the same" "ldist" } } */
/* { dg-final { scan-tree-dump "Loop . not distributed" "ldist" } } */

View File

@ -0,0 +1,21 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */
#define N (1024)
double a[N][N], b[N][N], c[N][N];
void
foo (void)
{
unsigned i, j, k;
for (i = 0; i < N; ++i)
for (j = 0; j < N; ++j)
{
c[i][j] = 0.0;
for (k = 0; k < N; ++k)
c[i][j] += a[i][k] * b[k][j];
}
}
/* { dg-final { scan-tree-dump "Loop nest . distributed: split to 1 loops and 1 library" "ldist" } } */

View File

@ -28,4 +28,4 @@ int loop1 (int k)
return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2];
}
/* { dg-final { scan-tree-dump-times "distributed" 0 "ldist" } } */
/* { dg-final { scan-tree-dump-times "distributed: " 0 "ldist" } } */

View File

@ -83,8 +83,8 @@ along with GCC; see the file COPYING3. If not see
loops and recover to the original one.
TODO:
1) We only distribute innermost loops now. This pass should handle loop
nests in the future.
1) We only distribute innermost two-level loop nest now. We should
extend it for arbitrary loop nests in the future.
2) We only fuse partitions in SCC now. A better fusion algorithm is
desired to minimize loop overhead, maximize parallelism and maximize
data reuse. */
@ -118,6 +118,11 @@ along with GCC; see the file COPYING3. If not see
#define MAX_DATAREFS_NUM \
((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
/* Threshold controlling number of distributed partitions. Given it may
be unnecessary if a memory stream cost model is invented in the future,
we define it as a temporary macro, rather than a parameter. */
#define NUM_PARTITION_THRESHOLD (4)
/* Hashtable helpers. */
struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
@ -714,9 +719,11 @@ ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
{
gimple *use_stmt = USE_STMT (use_p);
if (!is_gimple_debug (use_stmt)
&& loop != loop_containing_stmt (use_stmt))
if (is_gimple_debug (USE_STMT (use_p)))
continue;
basic_block use_bb = gimple_bb (USE_STMT (use_p));
if (!flow_bb_inside_loop_p (loop, use_bb))
return true;
}
@ -1414,6 +1421,22 @@ classify_partition (loop_p loop, struct graph *rdg, partition *partition,
if (!single_store)
return;
/* TODO: We don't support memset/memcpy distribution for loop nest yet. */
if (loop->inner)
{
basic_block bb = gimple_bb (DR_STMT (single_store));
if (bb->loop_father != loop)
return;
if (single_load)
{
bb = gimple_bb (DR_STMT (single_load));
if (bb->loop_father != loop)
return;
}
}
nb_iter = number_of_latch_executions (loop);
gcc_assert (nb_iter && nb_iter != chrec_dont_know);
if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
@ -1965,16 +1988,18 @@ sort_partitions_by_post_order (struct graph *pg,
}
/* Given reduced dependence graph RDG merge strong connected components
of PARTITIONS. In this function, data dependence caused by possible
alias between references is ignored, as if it doesn't exist at all. */
of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by
possible alias between references is ignored, as if it doesn't exist
at all; otherwise all depdendences are considered. */
static void
merge_dep_scc_partitions (struct graph *rdg,
vec<struct partition *> *partitions)
vec<struct partition *> *partitions,
bool ignore_alias_p)
{
struct partition *partition1, *partition2;
struct pg_vdata *data;
graph *pg = build_partition_graph (rdg, partitions, true);
graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
int i, j, num_sccs = graphds_scc (pg, NULL);
/* Strong connected compoenent means dependence cycle, we cannot distribute
@ -2340,38 +2365,49 @@ version_for_distribution_p (vec<struct partition *> *partitions,
return (alias_ddrs->length () > 0);
}
/* Fuse all partitions if necessary before finalizing distribution. */
/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
ALIAS_DDRS contains ddrs which need runtime alias check. */
static void
finalize_partitions (vec<struct partition *> *partitions,
finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
vec<ddr_p> *alias_ddrs)
{
unsigned i;
struct partition *a, *partition;
struct partition *partition, *a;
if (partitions->length () == 1
|| alias_ddrs->length () > 0)
return;
a = (*partitions)[0];
if (a->kind != PKIND_NORMAL)
return;
for (i = 1; partitions->iterate (i, &partition); ++i)
unsigned num_builtin = 0, num_normal = 0;
bool same_type_p = true;
enum partition_type type = ((*partitions)[0])->type;
for (i = 0; partitions->iterate (i, &partition); ++i)
{
/* Don't fuse if partition has different type or it is a builtin. */
if (partition->type != a->type
|| partition->kind != PKIND_NORMAL)
return;
same_type_p &= (type == partition->type);
if (partition->kind != PKIND_NORMAL)
num_builtin++;
else
num_normal++;
}
/* Fuse all partitions. */
for (i = 1; partitions->iterate (i, &partition); ++i)
/* Don't distribute current loop into too many loops given we don't have
memory stream cost model. Be even more conservative in case of loop
nest distribution. */
if ((same_type_p && num_builtin == 0)
|| (loop->inner != NULL
&& i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
|| (loop->inner == NULL
&& i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
{
partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
partition_free (partition);
a = (*partitions)[0];
for (i = 1; partitions->iterate (i, &partition); ++i)
{
partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
partition_free (partition);
}
partitions->truncate (1);
}
partitions->truncate (1);
}
/* Distributes the code from LOOP in such a way that producer statements
@ -2524,16 +2560,23 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
i--;
}
/* Build the partition dependency graph. */
/* Build the partition dependency graph and fuse partitions in strong
connected component. */
if (partitions.length () > 1)
{
merge_dep_scc_partitions (rdg, &partitions);
alias_ddrs.truncate (0);
if (partitions.length () > 1)
break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
/* Don't support loop nest distribution under runtime alias check
since it's not likely to enable many vectorization opportunities. */
if (loop->inner)
merge_dep_scc_partitions (rdg, &partitions, false);
else
{
merge_dep_scc_partitions (rdg, &partitions, true);
if (partitions.length () > 1)
break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
}
}
finalize_partitions (&partitions, &alias_ddrs);
finalize_partitions (loop, &partitions, &alias_ddrs);
nbp = partitions.length ();
if (nbp == 0
@ -2614,6 +2657,86 @@ public:
}; // class pass_loop_distribution
/* Given LOOP, this function records seed statements for distribution in
WORK_LIST. Return false if there is nothing for distribution. */
static bool
find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
{
basic_block *bbs = get_loop_body_in_dom_order (loop);
/* Initialize the worklist with stmts we seed the partitions with. */
for (unsigned i = 0; i < loop->num_nodes; ++i)
{
for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
!gsi_end_p (gsi); gsi_next (&gsi))
{
gphi *phi = gsi.phi ();
if (virtual_operand_p (gimple_phi_result (phi)))
continue;
/* Distribute stmts which have defs that are used outside of
the loop. */
if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
continue;
work_list->safe_push (phi);
}
for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
!gsi_end_p (gsi); gsi_next (&gsi))
{
gimple *stmt = gsi_stmt (gsi);
/* If there is a stmt with side-effects bail out - we
cannot and should not distribute this loop. */
if (gimple_has_side_effects (stmt))
{
free (bbs);
return false;
}
/* Distribute stmts which have defs that are used outside of
the loop. */
if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
;
/* Otherwise only distribute stores for now. */
else if (!gimple_vdef (stmt))
continue;
work_list->safe_push (stmt);
}
}
free (bbs);
return work_list->length () > 0;
}
/* Given innermost LOOP, return the outermost enclosing loop that forms a
perfect loop nest. */
static struct loop *
prepare_perfect_loop_nest (struct loop *loop)
{
struct loop *outer = loop_outer (loop);
tree niters = number_of_latch_executions (loop);
/* TODO: We only support the innermost 2-level loop nest distribution
because of compilation time issue for now. This should be relaxed
in the future. */
while (loop->inner == NULL
&& loop_outer (outer)
&& outer->inner == loop && loop->next == NULL
&& single_exit (outer)
&& optimize_loop_for_speed_p (outer)
&& !chrec_contains_symbols_defined_in_loop (niters, outer->num)
&& (niters = number_of_latch_executions (outer)) != NULL_TREE
&& niters != chrec_dont_know)
{
loop = outer;
outer = loop_outer (loop);
}
return loop;
}
unsigned int
pass_loop_distribution::execute (function *fun)
{
@ -2656,18 +2779,9 @@ pass_loop_distribution::execute (function *fun)
walking to innermost loops. */
FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
{
auto_vec<gimple *> work_list;
basic_block *bbs;
int num = loop->num;
unsigned int i;
/* If the loop doesn't have a single exit we will fail anyway,
so do that early. */
if (!single_exit (loop))
continue;
/* Only optimize hot loops. */
if (!optimize_loop_for_speed_p (loop))
/* Don't distribute multiple exit edges loop, or cold loop. */
if (!single_exit (loop)
|| !optimize_loop_for_speed_p (loop))
continue;
/* Don't distribute loop if niters is unknown. */
@ -2675,56 +2789,16 @@ pass_loop_distribution::execute (function *fun)
if (niters == NULL_TREE || niters == chrec_dont_know)
continue;
/* Initialize the worklist with stmts we seed the partitions with. */
bbs = get_loop_body_in_dom_order (loop);
for (i = 0; i < loop->num_nodes; ++i)
/* Get the perfect loop nest for distribution. */
loop = prepare_perfect_loop_nest (loop);
for (; loop; loop = loop->inner)
{
for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
!gsi_end_p (gsi);
gsi_next (&gsi))
{
gphi *phi = gsi.phi ();
if (virtual_operand_p (gimple_phi_result (phi)))
continue;
/* Distribute stmts which have defs that are used outside of
the loop. */
if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
continue;
work_list.safe_push (phi);
}
for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
!gsi_end_p (gsi);
gsi_next (&gsi))
{
gimple *stmt = gsi_stmt (gsi);
auto_vec<gimple *> work_list;
if (!find_seed_stmts_for_distribution (loop, &work_list))
break;
/* If there is a stmt with side-effects bail out - we
cannot and should not distribute this loop. */
if (gimple_has_side_effects (stmt))
{
work_list.truncate (0);
goto out;
}
/* Distribute stmts which have defs that are used outside of
the loop. */
if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
;
/* Otherwise only distribute stores for now. */
else if (!gimple_vdef (stmt))
continue;
work_list.safe_push (stmt);
}
}
out:
free (bbs);
int nb_generated_loops = 0;
int nb_generated_calls = 0;
location_t loc = find_loop_location (loop);
if (work_list.length () > 0)
{
const char *str = loop->inner ? " nest" : "";
location_t loc = find_loop_location (loop);
if (!cd)
{
calculate_dominance_info (CDI_DOMINATORS);
@ -2732,24 +2806,29 @@ out:
cd = new control_dependences ();
free_dominance_info (CDI_POST_DOMINATORS);
}
bool destroy_p;
int nb_generated_loops, nb_generated_calls;
nb_generated_loops = distribute_loop (loop, work_list, cd,
&nb_generated_calls,
&destroy_p);
if (destroy_p)
loops_to_be_destroyed.safe_push (loop);
}
if (nb_generated_loops + nb_generated_calls > 0)
{
changed = true;
dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
loc, "Loop %d distributed: split to %d loops "
"and %d library calls.\n",
num, nb_generated_loops, nb_generated_calls);
if (nb_generated_loops + nb_generated_calls > 0)
{
changed = true;
dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
loc, "Loop%s %d distributed: split to %d loops "
"and %d library calls.\n", str, loop->num,
nb_generated_loops, nb_generated_calls);
break;
}
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
}
else if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Loop %d is the same.\n", num);
}
if (cd)