From 163aa51b706a50c86b639d7827803550706bef78 Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Thu, 12 Oct 2017 14:29:45 +0000 Subject: [PATCH] tree-loop-distribution.c: Adjust the general comment. * tree-loop-distribution.c: Adjust the general comment. (NUM_PARTITION_THRESHOLD): New macro. (ssa_name_has_uses_outside_loop_p): Support loop nest distribution. (classify_partition): Skip builtin pattern of loop nest's inner loop. (merge_dep_scc_partitions): New parameter ignore_alias_p and use it in call to build_partition_graph. (finalize_partitions): New parameter. Make loop distribution more conservative by fusing more partitions. (distribute_loop): Don't do runtime alias check in case of loop nest distribution. (find_seed_stmts_for_distribution): New function. (prepare_perfect_loop_nest): New function. (pass_loop_distribution::execute): Refactor code finding seed stmts and loop nest into above functions. Support loop nest distribution. Adjust dump information accordingly. gcc/testsuite * gcc.dg/tree-ssa/ldist-7.c: Adjust test string. * gcc.dg/tree-ssa/ldist-16.c: Ditto. * gcc.dg/tree-ssa/ldist-25.c: Ditto. * gcc.dg/tree-ssa/ldist-33.c: New test. From-SVN: r253679 --- gcc/ChangeLog | 18 ++ gcc/testsuite/ChangeLog | 7 + gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c | 21 ++ gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c | 2 +- gcc/tree-loop-distribution.c | 283 +++++++++++++++-------- 7 files changed, 230 insertions(+), 105 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d0eb1670720..65eb2023b15 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,21 @@ +2017-10-12 Bin Cheng + + * tree-loop-distribution.c: Adjust the general comment. + (NUM_PARTITION_THRESHOLD): New macro. + (ssa_name_has_uses_outside_loop_p): Support loop nest distribution. + (classify_partition): Skip builtin pattern of loop nest's inner loop. + (merge_dep_scc_partitions): New parameter ignore_alias_p and use it + in call to build_partition_graph. + (finalize_partitions): New parameter. Make loop distribution more + conservative by fusing more partitions. + (distribute_loop): Don't do runtime alias check in case of loop nest + distribution. + (find_seed_stmts_for_distribution): New function. + (prepare_perfect_loop_nest): New function. + (pass_loop_distribution::execute): Refactor code finding seed stmts + and loop nest into above functions. Support loop nest distribution. + Adjust dump information accordingly. + 2017-10-12 Bin Cheng * tree-loop-distribution.c (break_alias_scc_partitions): Add comment diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 29095e75a51..ee792a77081 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2017-10-12 Bin Cheng + + * gcc.dg/tree-ssa/ldist-7.c: Adjust test string. + * gcc.dg/tree-ssa/ldist-16.c: Ditto. + * gcc.dg/tree-ssa/ldist-25.c: Ditto. + * gcc.dg/tree-ssa/ldist-33.c: New test. + 2017-10-12 Richard Biener PR tree-optimization/69728 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c index f43b64ead62..f4f3a44903c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c @@ -16,5 +16,5 @@ void foo (int n) /* We should not apply loop distribution and not generate a memset (0). */ -/* { dg-final { scan-tree-dump "Loop 1 is the same" "ldist" } } */ +/* { dg-final { scan-tree-dump "Loop 1 not distributed" "ldist" } } */ /* { dg-final { scan-tree-dump-times "generated memset zero" 0 "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c index 699bf38ab03..c0b95fc38ec 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c @@ -22,4 +22,4 @@ foo (void) } } -/* { dg-final { scan-tree-dump "Loop . is the same" "ldist" } } */ +/* { dg-final { scan-tree-dump "Loop . not distributed" "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c new file mode 100644 index 00000000000..24d27fde9da --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */ + +#define N (1024) +double a[N][N], b[N][N], c[N][N]; + +void +foo (void) +{ + unsigned i, j, k; + + for (i = 0; i < N; ++i) + for (j = 0; j < N; ++j) + { + c[i][j] = 0.0; + for (k = 0; k < N; ++k) + c[i][j] += a[i][k] * b[k][j]; + } +} + +/* { dg-final { scan-tree-dump "Loop nest . distributed: split to 1 loops and 1 library" "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c index f31d051984a..2eb1f74d4ab 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c @@ -28,4 +28,4 @@ int loop1 (int k) return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2]; } -/* { dg-final { scan-tree-dump-times "distributed" 0 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "distributed: " 0 "ldist" } } */ diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index 8e8ac7ed531..49ca9ee8cef 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -83,8 +83,8 @@ along with GCC; see the file COPYING3. If not see loops and recover to the original one. TODO: - 1) We only distribute innermost loops now. This pass should handle loop - nests in the future. + 1) We only distribute innermost two-level loop nest now. We should + extend it for arbitrary loop nests in the future. 2) We only fuse partitions in SCC now. A better fusion algorithm is desired to minimize loop overhead, maximize parallelism and maximize data reuse. */ @@ -118,6 +118,11 @@ along with GCC; see the file COPYING3. If not see #define MAX_DATAREFS_NUM \ ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)) +/* Threshold controlling number of distributed partitions. Given it may + be unnecessary if a memory stream cost model is invented in the future, + we define it as a temporary macro, rather than a parameter. */ +#define NUM_PARTITION_THRESHOLD (4) + /* Hashtable helpers. */ struct ddr_hasher : nofree_ptr_hash @@ -714,9 +719,11 @@ ssa_name_has_uses_outside_loop_p (tree def, loop_p loop) FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def) { - gimple *use_stmt = USE_STMT (use_p); - if (!is_gimple_debug (use_stmt) - && loop != loop_containing_stmt (use_stmt)) + if (is_gimple_debug (USE_STMT (use_p))) + continue; + + basic_block use_bb = gimple_bb (USE_STMT (use_p)); + if (!flow_bb_inside_loop_p (loop, use_bb)) return true; } @@ -1414,6 +1421,22 @@ classify_partition (loop_p loop, struct graph *rdg, partition *partition, if (!single_store) return; + /* TODO: We don't support memset/memcpy distribution for loop nest yet. */ + if (loop->inner) + { + basic_block bb = gimple_bb (DR_STMT (single_store)); + + if (bb->loop_father != loop) + return; + + if (single_load) + { + bb = gimple_bb (DR_STMT (single_load)); + if (bb->loop_father != loop) + return; + } + } + nb_iter = number_of_latch_executions (loop); gcc_assert (nb_iter && nb_iter != chrec_dont_know); if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, @@ -1965,16 +1988,18 @@ sort_partitions_by_post_order (struct graph *pg, } /* Given reduced dependence graph RDG merge strong connected components - of PARTITIONS. In this function, data dependence caused by possible - alias between references is ignored, as if it doesn't exist at all. */ + of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by + possible alias between references is ignored, as if it doesn't exist + at all; otherwise all depdendences are considered. */ static void merge_dep_scc_partitions (struct graph *rdg, - vec *partitions) + vec *partitions, + bool ignore_alias_p) { struct partition *partition1, *partition2; struct pg_vdata *data; - graph *pg = build_partition_graph (rdg, partitions, true); + graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p); int i, j, num_sccs = graphds_scc (pg, NULL); /* Strong connected compoenent means dependence cycle, we cannot distribute @@ -2340,38 +2365,49 @@ version_for_distribution_p (vec *partitions, return (alias_ddrs->length () > 0); } -/* Fuse all partitions if necessary before finalizing distribution. */ +/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. + ALIAS_DDRS contains ddrs which need runtime alias check. */ static void -finalize_partitions (vec *partitions, +finalize_partitions (struct loop *loop, vec *partitions, vec *alias_ddrs) { unsigned i; - struct partition *a, *partition; + struct partition *partition, *a; if (partitions->length () == 1 || alias_ddrs->length () > 0) return; - a = (*partitions)[0]; - if (a->kind != PKIND_NORMAL) - return; - - for (i = 1; partitions->iterate (i, &partition); ++i) + unsigned num_builtin = 0, num_normal = 0; + bool same_type_p = true; + enum partition_type type = ((*partitions)[0])->type; + for (i = 0; partitions->iterate (i, &partition); ++i) { - /* Don't fuse if partition has different type or it is a builtin. */ - if (partition->type != a->type - || partition->kind != PKIND_NORMAL) - return; + same_type_p &= (type == partition->type); + if (partition->kind != PKIND_NORMAL) + num_builtin++; + else + num_normal++; } - /* Fuse all partitions. */ - for (i = 1; partitions->iterate (i, &partition); ++i) + /* Don't distribute current loop into too many loops given we don't have + memory stream cost model. Be even more conservative in case of loop + nest distribution. */ + if ((same_type_p && num_builtin == 0) + || (loop->inner != NULL + && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) + || (loop->inner == NULL + && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) { - partition_merge_into (NULL, a, partition, FUSE_FINALIZE); - partition_free (partition); + a = (*partitions)[0]; + for (i = 1; partitions->iterate (i, &partition); ++i) + { + partition_merge_into (NULL, a, partition, FUSE_FINALIZE); + partition_free (partition); + } + partitions->truncate (1); } - partitions->truncate (1); } /* Distributes the code from LOOP in such a way that producer statements @@ -2524,16 +2560,23 @@ distribute_loop (struct loop *loop, vec stmts, i--; } - /* Build the partition dependency graph. */ + /* Build the partition dependency graph and fuse partitions in strong + connected component. */ if (partitions.length () > 1) { - merge_dep_scc_partitions (rdg, &partitions); - alias_ddrs.truncate (0); - if (partitions.length () > 1) - break_alias_scc_partitions (rdg, &partitions, &alias_ddrs); + /* Don't support loop nest distribution under runtime alias check + since it's not likely to enable many vectorization opportunities. */ + if (loop->inner) + merge_dep_scc_partitions (rdg, &partitions, false); + else + { + merge_dep_scc_partitions (rdg, &partitions, true); + if (partitions.length () > 1) + break_alias_scc_partitions (rdg, &partitions, &alias_ddrs); + } } - finalize_partitions (&partitions, &alias_ddrs); + finalize_partitions (loop, &partitions, &alias_ddrs); nbp = partitions.length (); if (nbp == 0 @@ -2614,6 +2657,86 @@ public: }; // class pass_loop_distribution + +/* Given LOOP, this function records seed statements for distribution in + WORK_LIST. Return false if there is nothing for distribution. */ + +static bool +find_seed_stmts_for_distribution (struct loop *loop, vec *work_list) +{ + basic_block *bbs = get_loop_body_in_dom_order (loop); + + /* Initialize the worklist with stmts we seed the partitions with. */ + for (unsigned i = 0; i < loop->num_nodes; ++i) + { + for (gphi_iterator gsi = gsi_start_phis (bbs[i]); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + continue; + /* Distribute stmts which have defs that are used outside of + the loop. */ + if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) + continue; + work_list->safe_push (phi); + } + for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + /* If there is a stmt with side-effects bail out - we + cannot and should not distribute this loop. */ + if (gimple_has_side_effects (stmt)) + { + free (bbs); + return false; + } + + /* Distribute stmts which have defs that are used outside of + the loop. */ + if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) + ; + /* Otherwise only distribute stores for now. */ + else if (!gimple_vdef (stmt)) + continue; + + work_list->safe_push (stmt); + } + } + free (bbs); + return work_list->length () > 0; +} + +/* Given innermost LOOP, return the outermost enclosing loop that forms a + perfect loop nest. */ + +static struct loop * +prepare_perfect_loop_nest (struct loop *loop) +{ + struct loop *outer = loop_outer (loop); + tree niters = number_of_latch_executions (loop); + + /* TODO: We only support the innermost 2-level loop nest distribution + because of compilation time issue for now. This should be relaxed + in the future. */ + while (loop->inner == NULL + && loop_outer (outer) + && outer->inner == loop && loop->next == NULL + && single_exit (outer) + && optimize_loop_for_speed_p (outer) + && !chrec_contains_symbols_defined_in_loop (niters, outer->num) + && (niters = number_of_latch_executions (outer)) != NULL_TREE + && niters != chrec_dont_know) + { + loop = outer; + outer = loop_outer (loop); + } + + return loop; +} + unsigned int pass_loop_distribution::execute (function *fun) { @@ -2656,18 +2779,9 @@ pass_loop_distribution::execute (function *fun) walking to innermost loops. */ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) { - auto_vec work_list; - basic_block *bbs; - int num = loop->num; - unsigned int i; - - /* If the loop doesn't have a single exit we will fail anyway, - so do that early. */ - if (!single_exit (loop)) - continue; - - /* Only optimize hot loops. */ - if (!optimize_loop_for_speed_p (loop)) + /* Don't distribute multiple exit edges loop, or cold loop. */ + if (!single_exit (loop) + || !optimize_loop_for_speed_p (loop)) continue; /* Don't distribute loop if niters is unknown. */ @@ -2675,56 +2789,16 @@ pass_loop_distribution::execute (function *fun) if (niters == NULL_TREE || niters == chrec_dont_know) continue; - /* Initialize the worklist with stmts we seed the partitions with. */ - bbs = get_loop_body_in_dom_order (loop); - for (i = 0; i < loop->num_nodes; ++i) + /* Get the perfect loop nest for distribution. */ + loop = prepare_perfect_loop_nest (loop); + for (; loop; loop = loop->inner) { - for (gphi_iterator gsi = gsi_start_phis (bbs[i]); - !gsi_end_p (gsi); - gsi_next (&gsi)) - { - gphi *phi = gsi.phi (); - if (virtual_operand_p (gimple_phi_result (phi))) - continue; - /* Distribute stmts which have defs that are used outside of - the loop. */ - if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) - continue; - work_list.safe_push (phi); - } - for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); - !gsi_end_p (gsi); - gsi_next (&gsi)) - { - gimple *stmt = gsi_stmt (gsi); + auto_vec work_list; + if (!find_seed_stmts_for_distribution (loop, &work_list)) + break; - /* If there is a stmt with side-effects bail out - we - cannot and should not distribute this loop. */ - if (gimple_has_side_effects (stmt)) - { - work_list.truncate (0); - goto out; - } - - /* Distribute stmts which have defs that are used outside of - the loop. */ - if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) - ; - /* Otherwise only distribute stores for now. */ - else if (!gimple_vdef (stmt)) - continue; - - work_list.safe_push (stmt); - } - } -out: - free (bbs); - - int nb_generated_loops = 0; - int nb_generated_calls = 0; - location_t loc = find_loop_location (loop); - if (work_list.length () > 0) - { + const char *str = loop->inner ? " nest" : ""; + location_t loc = find_loop_location (loop); if (!cd) { calculate_dominance_info (CDI_DOMINATORS); @@ -2732,24 +2806,29 @@ out: cd = new control_dependences (); free_dominance_info (CDI_POST_DOMINATORS); } + bool destroy_p; + int nb_generated_loops, nb_generated_calls; nb_generated_loops = distribute_loop (loop, work_list, cd, &nb_generated_calls, &destroy_p); if (destroy_p) loops_to_be_destroyed.safe_push (loop); - } - if (nb_generated_loops + nb_generated_calls > 0) - { - changed = true; - dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, - loc, "Loop %d distributed: split to %d loops " - "and %d library calls.\n", - num, nb_generated_loops, nb_generated_calls); + if (nb_generated_loops + nb_generated_calls > 0) + { + changed = true; + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, + loc, "Loop%s %d distributed: split to %d loops " + "and %d library calls.\n", str, loop->num, + nb_generated_loops, nb_generated_calls); + + break; + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num); } - else if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "Loop %d is the same.\n", num); } if (cd)