re PR tree-optimization/82604 (SPEC CPU2006 410.bwaves ~50% performance regression with trunk@253679 when ftree-parallelize-loops is used)

	PR tree-optimization/82604
	* tree-loop-distribution.c (enum partition_kind): New enum item
	PKIND_PARTIAL_MEMSET.
	(partition_builtin_p): Support above new enum item.
	(generate_code_for_partition): Ditto.
	(compute_access_range): Differentiate cases where equality can be
	proven for all loops, only the innermost loop, or no loops.
	(classify_builtin_st, classify_builtin_ldst): Adjust calls to above
	function.  Set PKIND_PARTIAL_MEMSET for partitions appropriately.
	(finalize_partitions, distribute_loop): Don't fuse partitions of
	PKIND_PARTIAL_MEMSET kind when distributing a 3-level loop nest.
	(prepare_perfect_loop_nest): Distribute a 3-level loop nest only if
	parloop is enabled.

From-SVN: r256990
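For context, a minimal C reduction of the pattern this patch targets (illustrative only; the actual regression is in 410.bwaves, which is Fortran, and all names below are invented):

#define N 100
static double q[N][N][2 * N];	/* zeroing covers only half of the last dim */
static double r[N][N][N];

void
step (void)
{
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
      for (int k = 0; k < N; k++)
	{
	  q[i][j][k] = 0.0;		/* zeroing stmt: partial memset */
	  r[i][j][k] = 0.5 * r[i][j][k];	/* unrelated compute */
	}
}

Distributing the nest splits the zeroing statement into its own partition; because it is contiguous only in the innermost loop it can never become a single memset, but keeping it as a separate loop nest lets parloops parallelize both resulting nests.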
Author: Bin Cheng <bin.cheng@arm.com>
Date:   2018-01-23 16:47:03 +0000
commit 5955438a2e (parent cde9b2f815)
2 changed files with 87 additions and 29 deletions

--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2018-01-23  Bin Cheng  <bin.cheng@arm.com>
+
+	PR tree-optimization/82604
+	* tree-loop-distribution.c (enum partition_kind): New enum item
+	PKIND_PARTIAL_MEMSET.
+	(partition_builtin_p): Support above new enum item.
+	(generate_code_for_partition): Ditto.
+	(compute_access_range): Differentiate cases where equality can be
+	proven for all loops, only the innermost loop, or no loops.
+	(classify_builtin_st, classify_builtin_ldst): Adjust calls to above
+	function.  Set PKIND_PARTIAL_MEMSET for partitions appropriately.
+	(finalize_partitions, distribute_loop): Don't fuse partitions of
+	PKIND_PARTIAL_MEMSET kind when distributing a 3-level loop nest.
+	(prepare_perfect_loop_nest): Distribute a 3-level loop nest only if
+	parloop is enabled.
+
 2018-01-23  Martin Liska  <mliska@suse.cz>
 
 	* predict.def (PRED_INDIR_CALL): Set probability to PROB_EVEN in

--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -584,7 +584,19 @@ build_rdg (struct loop *loop, control_dependences *cd)
 
 /* Kind of distributed loop.  */
 enum partition_kind {
-  PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
+  PKIND_NORMAL,
+  /* Partial memset stands for a partition that can be distributed into a
+     loop of memset calls, rather than a single memset call.  It's handled
+     just like a normal partition, i.e., distributed as a separate loop; no
+     memset call is generated.
+
+     Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
+     loop nest as deeply as possible.  As a result, parloop achieves better
+     parallelization by parallelizing a deeper loop nest.  This hack should
+     be unnecessary and removed once distributed memset can be understood
+     and analyzed in data reference analysis.  See PR82604 for more.  */
+  PKIND_PARTIAL_MEMSET,
+  PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
 };
 
 /* Type of distributed loop.  */
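To make the new kind concrete, a hypothetical store (invented example) that would be classified PKIND_PARTIAL_MEMSET: the write is contiguous within each row, but the written rows are not adjacent, so at best it could become a loop of memset calls; per the comment above it is simply left as a distributed loop.

double a[100][200];

void
zero_half_rows (void)
{
  for (int i = 0; i < 100; i++)
    for (int j = 0; j < 100; j++)	/* writes only half of each row */
      a[i][j] = 0.0;
}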
@@ -659,7 +671,7 @@ partition_free (partition *partition)
 static bool
 partition_builtin_p (partition *partition)
 {
-  return partition->kind != PKIND_NORMAL;
+  return partition->kind > PKIND_PARTIAL_MEMSET;
 }
 
 /* Returns true if the partition contains a reduction.  */
@@ -1127,6 +1139,7 @@ generate_code_for_partition (struct loop *loop,
   switch (partition->kind)
     {
     case PKIND_NORMAL:
+    case PKIND_PARTIAL_MEMSET:
       /* Reductions all have to be in the last partition.  */
       gcc_assert (!partition_reduction_p (partition)
		  || !copy_p);
@@ -1399,17 +1412,22 @@ find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
    loops from inner to outer to see if the loop's step equals the access size
-   at each level of loop.  Return true if yes; record access base and size
-   in BASE and SIZE; save the loop's step at each level of loop in STEPS if
-   it is not null.  For example:
+   at each level of loop.  Return 2 if we can prove this for all levels of
+   loops; record access base and size in BASE and SIZE; save the loop's step
+   at each level of loop in STEPS if it is not null.  For example:
 
      int arr[100][100][100];
     for (i = 0; i < 100; i++)        ;steps[2] = 40000
       for (j = 100; j > 0; j--)      ;steps[1] = -400
	 for (k = 0; k < 100; k++)    ;steps[0] = 4
-	   arr[i][j - 1][k] = 0;      ;base = &arr, size = 4000000.  */
+	   arr[i][j - 1][k] = 0;      ;base = &arr, size = 4000000
 
-static bool
+   Return 1 if we can prove the equality for the innermost loop, but not
+   for all levels of loops.  In this case, no information is recorded.
+
+   Return 0 if the equality can't be proven at any level of loops.  */
+
+static int
 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
		       tree *size, vec<tree> *steps = NULL)
 {
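The return-value contract can be illustrated with three stores (a sketch; assumes 4-byte int):

int a[100][100], b[100][200];

void
f (void)
{
  for (int i = 0; i < 100; i++)
    for (int j = 0; j < 100; j++)
      a[i][j] = 0;	/* step == size at both levels: returns 2 */

  for (int i = 0; i < 100; i++)
    for (int j = 0; j < 100; j++)
      b[i][j] = 0;	/* contiguous only in the j loop: returns 1 */

  for (int i = 0; i < 100; i++)
    for (int j = 0; j < 100; j++)
      a[j][i] = 0;	/* innermost step is 400 bytes, not 4: returns 0 */
}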
@@ -1419,24 +1437,25 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
   tree ref = DR_REF (dr);
   tree access_base = build_fold_addr_expr (ref);
   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+  int res = 0;
 
   do {
       tree scev_fn = analyze_scalar_evolution (loop, access_base);
       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
-	return false;
+	return res;
 
       access_base = CHREC_LEFT (scev_fn);
       if (tree_contains_chrecs (access_base, NULL))
-	return false;
+	return res;
 
       tree scev_step = CHREC_RIGHT (scev_fn);
       /* Only support constant steps.  */
       if (TREE_CODE (scev_step) != INTEGER_CST)
-	return false;
+	return res;
 
       enum ev_direction access_dir = scev_direction (scev_fn);
       if (access_dir == EV_DIR_UNKNOWN)
-	return false;
+	return res;
 
       if (steps != NULL)
	steps->safe_push (scev_step);
@@ -1449,7 +1468,11 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
       /* At each level of loop, scev step must equal the access size.  In other
	 words, DR must access consecutive memory between loop iterations.  */
       if (!operand_equal_p (scev_step, access_size, 0))
-	return false;
+	return res;
+
+      /* Access stride can be computed for data reference at least for the
+	 innermost loop.  */
+      res = 1;
 
       /* Compute DR's execution times in loop.  */
       tree niters = number_of_latch_executions (loop);
@@ -1471,7 +1494,8 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
 
   *base = access_base;
   *size = access_size;
-  return true;
+  /* Access stride can be computed for data reference at each level of loop.  */
+  return 2;
 }
 
 /* Allocate and return builtin struct.  Record information like DST_DR,
@@ -1510,8 +1534,14 @@ classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
     return;
 
-  if (!compute_access_range (loop, dr, &base, &size))
+  int res = compute_access_range (loop, dr, &base, &size);
+  if (res == 0)
     return;
+  if (res == 1)
+    {
+      partition->kind = PKIND_PARTIAL_MEMSET;
+      return;
+    }
 
   poly_uint64 base_offset;
   unsigned HOST_WIDE_INT const_base_offset;
@@ -1537,11 +1567,16 @@ classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
   tree base, size, src_base, src_size;
   auto_vec<tree> dst_steps, src_steps;
 
-  /* Compute access range of both load and store.  They must have the same
-     access size.  */
-  if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps)
-      || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps)
-      || !operand_equal_p (size, src_size, 0))
+  /* Compute access range of both load and store.  */
+  int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
+  if (res != 2)
+    return;
+  res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
+  if (res != 2)
+    return;
+
+  /* They must have the same access size.  */
+  if (!operand_equal_p (size, src_size, 0))
     return;
 
   /* Load and store in loop nest must access memory in the same way, i.e.,
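In other words, memcpy/memmove classification now requires a hard 2 from both references; an invented example that still qualifies:

int dst[100][100], src[100][100];

void
copy (void)
{
  for (int i = 0; i < 100; i++)
    for (int j = 0; j < 100; j++)
      dst[i][j] = src[i][j];	/* both refs fully contiguous: one memcpy */
}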
@@ -2623,22 +2658,27 @@ finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
       || alias_ddrs->length () > 0)
     return;
 
-  unsigned num_builtin = 0, num_normal = 0;
+  unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
   bool same_type_p = true;
   enum partition_type type = ((*partitions)[0])->type;
   for (i = 0; partitions->iterate (i, &partition); ++i)
     {
       same_type_p &= (type == partition->type);
-      if (partition->kind != PKIND_NORMAL)
-	num_builtin++;
-      else
-	num_normal++;
+      if (partition_builtin_p (partition))
+	{
+	  num_builtin++;
+	  continue;
+	}
+      num_normal++;
+      if (partition->kind == PKIND_PARTIAL_MEMSET)
+	num_partial_memset++;
     }
 
   /* Don't distribute current loop into too many loops given we don't have
      memory stream cost model.  Be even more conservative in case of loop
      nest distribution.  */
-  if ((same_type_p && num_builtin == 0)
+  if ((same_type_p && num_builtin == 0
+       && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
       || (loop->inner != NULL
	   && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
       || (loop->inner == NULL
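The new sub-condition exempts exactly one shape from this fuse-everything fallback: a loop nest split into two normal partitions, one of them a partial memset. Sketched on the invented q/r/N example near the top of this page, the distribution that is now kept reads:

/* Partial-memset partition: stays a plain loop nest, no memset emitted.  */
for (i = 0; i < N; i++)
  for (j = 0; j < N; j++)
    for (k = 0; k < N; k++)
      q[i][j][k] = 0.0;

/* Normal compute partition, parallelizable on its own.  */
for (i = 0; i < N; i++)
  for (j = 0; j < N; j++)
    for (k = 0; k < N; k++)
      r[i][j][k] = 0.5 * r[i][j][k];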
@@ -2786,7 +2826,7 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
   for (i = 0; partitions.iterate (i, &into); ++i)
     {
       bool changed = false;
-      if (partition_builtin_p (into))
+      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
	continue;
       for (int j = i + 1;
	    partitions.iterate (j, &partition); ++j)
@@ -2966,10 +3006,12 @@ prepare_perfect_loop_nest (struct loop *loop)
   struct loop *outer = loop_outer (loop);
   tree niters = number_of_latch_executions (loop);
 
-  /* TODO: We only support the innermost 2-level loop nest distribution
+  /* TODO: We only support the innermost 3-level loop nest distribution
      because of compilation time issue for now.  This should be relaxed
-     in the future.  */
-  while (loop->inner == NULL
+     in the future.  Note we only allow 3-level loop nest distribution
+     when parallelizing loops.  */
+  while ((loop->inner == NULL
+	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
	 && loop_outer (outer)
	 && outer->inner == loop && loop->next == NULL
	 && single_exit (outer)
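With flag_tree_parallelize_loops > 1 (i.e. -ftree-parallelize-loops=2 or higher on the command line) the walk above may now step out two loop levels instead of one, so a perfect 3-level nest like the following invented example is considered for distribution as a whole; without parloops the old 2-level limit still applies:

double a[64][64][64];

void
zero_all (void)
{
  for (int i = 0; i < 64; i++)		/* third level: only with parloops */
    for (int j = 0; j < 64; j++)
      for (int k = 0; k < 64; k++)
	a[i][j][k] = 0.0;
}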