re PR tree-optimization/82604 (SPEC CPU2006 410.bwaves ~50% performance regression with trunk@253679 when ftree-parallelize-loops is used)
PR tree-optimization/82604 * tree-loop-distribution.c (enum partition_kind): New enum item PKIND_PARTIAL_MEMSET. (partition_builtin_p): Support above new enum item. (generate_code_for_partition): Ditto. (compute_access_range): Differentiate cases that equality can be proven at all loops, the innermost loops or no loops. (classify_builtin_st, classify_builtin_ldst): Adjust call to above function. Set PKIND_PARTIAL_MEMSET for partition appropriately. (finalize_partitions, distribute_loop): Don't fuse partition of PKIND_PARTIAL_MEMSET kind when distributing 3-level loop nest. (prepare_perfect_loop_nest): Distribute 3-level loop nest only if parloop is enabled. From-SVN: r256990
This commit is contained in:
parent
cde9b2f815
commit
5955438a2e
@ -1,3 +1,19 @@
|
||||
2018-01-23 Bin Cheng <bin.cheng@arm.com>
|
||||
|
||||
PR tree-optimization/82604
|
||||
* tree-loop-distribution.c (enum partition_kind): New enum item
|
||||
PKIND_PARTIAL_MEMSET.
|
||||
(partition_builtin_p): Support above new enum item.
|
||||
(generate_code_for_partition): Ditto.
|
||||
(compute_access_range): Differentiate cases that equality can be
|
||||
proven at all loops, the innermost loops or no loops.
|
||||
(classify_builtin_st, classify_builtin_ldst): Adjust call to above
|
||||
function. Set PKIND_PARTIAL_MEMSET for partition appropriately.
|
||||
(finalize_partitions, distribute_loop): Don't fuse partition of
|
||||
PKIND_PARTIAL_MEMSET kind when distributing 3-level loop nest.
|
||||
(prepare_perfect_loop_nest): Distribute 3-level loop nest only if
|
||||
parloop is enabled.
|
||||
|
||||
2018-01-23 Martin Liska <mliska@suse.cz>
|
||||
|
||||
* predict.def (PRED_INDIR_CALL): Set probability to PROB_EVEN in
|
||||
|
@ -584,7 +584,19 @@ build_rdg (struct loop *loop, control_dependences *cd)
|
||||
|
||||
/* Kind of distributed loop. */
|
||||
enum partition_kind {
|
||||
PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
|
||||
PKIND_NORMAL,
|
||||
/* Partial memset stands for a paritition can be distributed into a loop
|
||||
of memset calls, rather than a single memset call. It's handled just
|
||||
like a normal parition, i.e, distributed as separate loop, no memset
|
||||
call is generated.
|
||||
|
||||
Note: This is a hacking fix trying to distribute ZERO-ing stmt in a
|
||||
loop nest as deep as possible. As a result, parloop achieves better
|
||||
parallelization by parallelizing deeper loop nest. This hack should
|
||||
be unnecessary and removed once distributed memset can be understood
|
||||
and analyzed in data reference analysis. See PR82604 for more. */
|
||||
PKIND_PARTIAL_MEMSET,
|
||||
PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
|
||||
};
|
||||
|
||||
/* Type of distributed loop. */
|
||||
@ -659,7 +671,7 @@ partition_free (partition *partition)
|
||||
static bool
|
||||
partition_builtin_p (partition *partition)
|
||||
{
|
||||
return partition->kind != PKIND_NORMAL;
|
||||
return partition->kind > PKIND_PARTIAL_MEMSET;
|
||||
}
|
||||
|
||||
/* Returns true if the partition contains a reduction. */
|
||||
@ -1127,6 +1139,7 @@ generate_code_for_partition (struct loop *loop,
|
||||
switch (partition->kind)
|
||||
{
|
||||
case PKIND_NORMAL:
|
||||
case PKIND_PARTIAL_MEMSET:
|
||||
/* Reductions all have to be in the last partition. */
|
||||
gcc_assert (!partition_reduction_p (partition)
|
||||
|| !copy_p);
|
||||
@ -1399,17 +1412,22 @@ find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
|
||||
|
||||
/* Given data reference DR in LOOP_NEST, this function checks the enclosing
|
||||
loops from inner to outer to see if loop's step equals to access size at
|
||||
each level of loop. Return true if yes; record access base and size in
|
||||
BASE and SIZE; save loop's step at each level of loop in STEPS if it is
|
||||
not null. For example:
|
||||
each level of loop. Return 2 if we can prove this at all level loops;
|
||||
record access base and size in BASE and SIZE; save loop's step at each
|
||||
level of loop in STEPS if it is not null. For example:
|
||||
|
||||
int arr[100][100][100];
|
||||
for (i = 0; i < 100; i++) ;steps[2] = 40000
|
||||
for (j = 100; j > 0; j--) ;steps[1] = -400
|
||||
for (k = 0; k < 100; k++) ;steps[0] = 4
|
||||
arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000. */
|
||||
arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
|
||||
|
||||
static bool
|
||||
Return 1 if we can prove the equality at the innermost loop, but not all
|
||||
level loops. In this case, no information is recorded.
|
||||
|
||||
Return 0 if no equality can be proven at any level loops. */
|
||||
|
||||
static int
|
||||
compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
|
||||
tree *size, vec<tree> *steps = NULL)
|
||||
{
|
||||
@ -1419,24 +1437,25 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
|
||||
tree ref = DR_REF (dr);
|
||||
tree access_base = build_fold_addr_expr (ref);
|
||||
tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
|
||||
int res = 0;
|
||||
|
||||
do {
|
||||
tree scev_fn = analyze_scalar_evolution (loop, access_base);
|
||||
if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
|
||||
return false;
|
||||
return res;
|
||||
|
||||
access_base = CHREC_LEFT (scev_fn);
|
||||
if (tree_contains_chrecs (access_base, NULL))
|
||||
return false;
|
||||
return res;
|
||||
|
||||
tree scev_step = CHREC_RIGHT (scev_fn);
|
||||
/* Only support constant steps. */
|
||||
if (TREE_CODE (scev_step) != INTEGER_CST)
|
||||
return false;
|
||||
return res;
|
||||
|
||||
enum ev_direction access_dir = scev_direction (scev_fn);
|
||||
if (access_dir == EV_DIR_UNKNOWN)
|
||||
return false;
|
||||
return res;
|
||||
|
||||
if (steps != NULL)
|
||||
steps->safe_push (scev_step);
|
||||
@ -1449,7 +1468,11 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
|
||||
/* At each level of loop, scev step must equal to access size. In other
|
||||
words, DR must access consecutive memory between loop iterations. */
|
||||
if (!operand_equal_p (scev_step, access_size, 0))
|
||||
return false;
|
||||
return res;
|
||||
|
||||
/* Access stride can be computed for data reference at least for the
|
||||
innermost loop. */
|
||||
res = 1;
|
||||
|
||||
/* Compute DR's execution times in loop. */
|
||||
tree niters = number_of_latch_executions (loop);
|
||||
@ -1471,7 +1494,8 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
|
||||
|
||||
*base = access_base;
|
||||
*size = access_size;
|
||||
return true;
|
||||
/* Access stride can be computed for data reference at each level loop. */
|
||||
return 2;
|
||||
}
|
||||
|
||||
/* Allocate and return builtin struct. Record information like DST_DR,
|
||||
@ -1510,8 +1534,14 @@ classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
|
||||
&& flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
|
||||
return;
|
||||
|
||||
if (!compute_access_range (loop, dr, &base, &size))
|
||||
int res = compute_access_range (loop, dr, &base, &size);
|
||||
if (res == 0)
|
||||
return;
|
||||
if (res == 1)
|
||||
{
|
||||
partition->kind = PKIND_PARTIAL_MEMSET;
|
||||
return;
|
||||
}
|
||||
|
||||
poly_uint64 base_offset;
|
||||
unsigned HOST_WIDE_INT const_base_offset;
|
||||
@ -1537,11 +1567,16 @@ classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
|
||||
tree base, size, src_base, src_size;
|
||||
auto_vec<tree> dst_steps, src_steps;
|
||||
|
||||
/* Compute access range of both load and store. They much have the same
|
||||
access size. */
|
||||
if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps)
|
||||
|| !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps)
|
||||
|| !operand_equal_p (size, src_size, 0))
|
||||
/* Compute access range of both load and store. */
|
||||
int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
|
||||
if (res != 2)
|
||||
return;
|
||||
res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
|
||||
if (res != 2)
|
||||
return;
|
||||
|
||||
/* They much have the same access size. */
|
||||
if (!operand_equal_p (size, src_size, 0))
|
||||
return;
|
||||
|
||||
/* Load and store in loop nest must access memory in the same way, i.e,
|
||||
@ -2623,22 +2658,27 @@ finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
|
||||
|| alias_ddrs->length () > 0)
|
||||
return;
|
||||
|
||||
unsigned num_builtin = 0, num_normal = 0;
|
||||
unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
|
||||
bool same_type_p = true;
|
||||
enum partition_type type = ((*partitions)[0])->type;
|
||||
for (i = 0; partitions->iterate (i, &partition); ++i)
|
||||
{
|
||||
same_type_p &= (type == partition->type);
|
||||
if (partition->kind != PKIND_NORMAL)
|
||||
num_builtin++;
|
||||
else
|
||||
num_normal++;
|
||||
if (partition_builtin_p (partition))
|
||||
{
|
||||
num_builtin++;
|
||||
continue;
|
||||
}
|
||||
num_normal++;
|
||||
if (partition->kind == PKIND_PARTIAL_MEMSET)
|
||||
num_partial_memset++;
|
||||
}
|
||||
|
||||
/* Don't distribute current loop into too many loops given we don't have
|
||||
memory stream cost model. Be even more conservative in case of loop
|
||||
nest distribution. */
|
||||
if ((same_type_p && num_builtin == 0)
|
||||
if ((same_type_p && num_builtin == 0
|
||||
&& (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
|
||||
|| (loop->inner != NULL
|
||||
&& i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
|
||||
|| (loop->inner == NULL
|
||||
@ -2786,7 +2826,7 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
|
||||
for (i = 0; partitions.iterate (i, &into); ++i)
|
||||
{
|
||||
bool changed = false;
|
||||
if (partition_builtin_p (into))
|
||||
if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
|
||||
continue;
|
||||
for (int j = i + 1;
|
||||
partitions.iterate (j, &partition); ++j)
|
||||
@ -2966,10 +3006,12 @@ prepare_perfect_loop_nest (struct loop *loop)
|
||||
struct loop *outer = loop_outer (loop);
|
||||
tree niters = number_of_latch_executions (loop);
|
||||
|
||||
/* TODO: We only support the innermost 2-level loop nest distribution
|
||||
/* TODO: We only support the innermost 3-level loop nest distribution
|
||||
because of compilation time issue for now. This should be relaxed
|
||||
in the future. */
|
||||
while (loop->inner == NULL
|
||||
in the future. Note we only allow 3-level loop nest distribution
|
||||
when parallelizing loops. */
|
||||
while ((loop->inner == NULL
|
||||
|| (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
|
||||
&& loop_outer (outer)
|
||||
&& outer->inner == loop && loop->next == NULL
|
||||
&& single_exit (outer)
|
||||
|
Loading…
Reference in New Issue
Block a user