From 5955438a2ea81d9fb97d87a913315062ac47672b Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Tue, 23 Jan 2018 16:47:03 +0000 Subject: [PATCH] re PR tree-optimization/82604 (SPEC CPU2006 410.bwaves ~50% performance regression with trunk@253679 when ftree-parallelize-loops is used) PR tree-optimization/82604 * tree-loop-distribution.c (enum partition_kind): New enum item PKIND_PARTIAL_MEMSET. (partition_builtin_p): Support above new enum item. (generate_code_for_partition): Ditto. (compute_access_range): Differentiate cases that equality can be proven at all loops, the innermost loops or no loops. (classify_builtin_st, classify_builtin_ldst): Adjust call to above function. Set PKIND_PARTIAL_MEMSET for partition appropriately. (finalize_partitions, distribute_loop): Don't fuse partition of PKIND_PARTIAL_MEMSET kind when distributing 3-level loop nest. (prepare_perfect_loop_nest): Distribute 3-level loop nest only if parloop is enabled. From-SVN: r256990 --- gcc/ChangeLog | 16 ++++++ gcc/tree-loop-distribution.c | 100 +++++++++++++++++++++++++---------- 2 files changed, 87 insertions(+), 29 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3238e70f63e..576d5a21dfd 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,19 @@ +2018-01-23 Bin Cheng + + PR tree-optimization/82604 + * tree-loop-distribution.c (enum partition_kind): New enum item + PKIND_PARTIAL_MEMSET. + (partition_builtin_p): Support above new enum item. + (generate_code_for_partition): Ditto. + (compute_access_range): Differentiate cases that equality can be + proven at all loops, the innermost loops or no loops. + (classify_builtin_st, classify_builtin_ldst): Adjust call to above + function. Set PKIND_PARTIAL_MEMSET for partition appropriately. + (finalize_partitions, distribute_loop): Don't fuse partition of + PKIND_PARTIAL_MEMSET kind when distributing 3-level loop nest. + (prepare_perfect_loop_nest): Distribute 3-level loop nest only if + parloop is enabled. + 2018-01-23 Martin Liska * predict.def (PRED_INDIR_CALL): Set probability to PROB_EVEN in diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index a3d76e41958..67f27bad83c 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -584,7 +584,19 @@ build_rdg (struct loop *loop, control_dependences *cd) /* Kind of distributed loop. */ enum partition_kind { - PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE + PKIND_NORMAL, + /* Partial memset stands for a paritition can be distributed into a loop + of memset calls, rather than a single memset call. It's handled just + like a normal parition, i.e, distributed as separate loop, no memset + call is generated. + + Note: This is a hacking fix trying to distribute ZERO-ing stmt in a + loop nest as deep as possible. As a result, parloop achieves better + parallelization by parallelizing deeper loop nest. This hack should + be unnecessary and removed once distributed memset can be understood + and analyzed in data reference analysis. See PR82604 for more. */ + PKIND_PARTIAL_MEMSET, + PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE }; /* Type of distributed loop. */ @@ -659,7 +671,7 @@ partition_free (partition *partition) static bool partition_builtin_p (partition *partition) { - return partition->kind != PKIND_NORMAL; + return partition->kind > PKIND_PARTIAL_MEMSET; } /* Returns true if the partition contains a reduction. */ @@ -1127,6 +1139,7 @@ generate_code_for_partition (struct loop *loop, switch (partition->kind) { case PKIND_NORMAL: + case PKIND_PARTIAL_MEMSET: /* Reductions all have to be in the last partition. */ gcc_assert (!partition_reduction_p (partition) || !copy_p); @@ -1399,17 +1412,22 @@ find_single_drs (struct loop *loop, struct graph *rdg, partition *partition, /* Given data reference DR in LOOP_NEST, this function checks the enclosing loops from inner to outer to see if loop's step equals to access size at - each level of loop. Return true if yes; record access base and size in - BASE and SIZE; save loop's step at each level of loop in STEPS if it is - not null. For example: + each level of loop. Return 2 if we can prove this at all level loops; + record access base and size in BASE and SIZE; save loop's step at each + level of loop in STEPS if it is not null. For example: int arr[100][100][100]; for (i = 0; i < 100; i++) ;steps[2] = 40000 for (j = 100; j > 0; j--) ;steps[1] = -400 for (k = 0; k < 100; k++) ;steps[0] = 4 - arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000. */ + arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000 -static bool + Return 1 if we can prove the equality at the innermost loop, but not all + level loops. In this case, no information is recorded. + + Return 0 if no equality can be proven at any level loops. */ + +static int compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, tree *size, vec *steps = NULL) { @@ -1419,24 +1437,25 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, tree ref = DR_REF (dr); tree access_base = build_fold_addr_expr (ref); tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); + int res = 0; do { tree scev_fn = analyze_scalar_evolution (loop, access_base); if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) - return false; + return res; access_base = CHREC_LEFT (scev_fn); if (tree_contains_chrecs (access_base, NULL)) - return false; + return res; tree scev_step = CHREC_RIGHT (scev_fn); /* Only support constant steps. */ if (TREE_CODE (scev_step) != INTEGER_CST) - return false; + return res; enum ev_direction access_dir = scev_direction (scev_fn); if (access_dir == EV_DIR_UNKNOWN) - return false; + return res; if (steps != NULL) steps->safe_push (scev_step); @@ -1449,7 +1468,11 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, /* At each level of loop, scev step must equal to access size. In other words, DR must access consecutive memory between loop iterations. */ if (!operand_equal_p (scev_step, access_size, 0)) - return false; + return res; + + /* Access stride can be computed for data reference at least for the + innermost loop. */ + res = 1; /* Compute DR's execution times in loop. */ tree niters = number_of_latch_executions (loop); @@ -1471,7 +1494,8 @@ compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, *base = access_base; *size = access_size; - return true; + /* Access stride can be computed for data reference at each level loop. */ + return 2; } /* Allocate and return builtin struct. Record information like DST_DR, @@ -1510,8 +1534,14 @@ classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr) && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) return; - if (!compute_access_range (loop, dr, &base, &size)) + int res = compute_access_range (loop, dr, &base, &size); + if (res == 0) return; + if (res == 1) + { + partition->kind = PKIND_PARTIAL_MEMSET; + return; + } poly_uint64 base_offset; unsigned HOST_WIDE_INT const_base_offset; @@ -1537,11 +1567,16 @@ classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, tree base, size, src_base, src_size; auto_vec dst_steps, src_steps; - /* Compute access range of both load and store. They much have the same - access size. */ - if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps) - || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps) - || !operand_equal_p (size, src_size, 0)) + /* Compute access range of both load and store. */ + int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps); + if (res != 2) + return; + res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps); + if (res != 2) + return; + + /* They much have the same access size. */ + if (!operand_equal_p (size, src_size, 0)) return; /* Load and store in loop nest must access memory in the same way, i.e, @@ -2623,22 +2658,27 @@ finalize_partitions (struct loop *loop, vec *partitions, || alias_ddrs->length () > 0) return; - unsigned num_builtin = 0, num_normal = 0; + unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0; bool same_type_p = true; enum partition_type type = ((*partitions)[0])->type; for (i = 0; partitions->iterate (i, &partition); ++i) { same_type_p &= (type == partition->type); - if (partition->kind != PKIND_NORMAL) - num_builtin++; - else - num_normal++; + if (partition_builtin_p (partition)) + { + num_builtin++; + continue; + } + num_normal++; + if (partition->kind == PKIND_PARTIAL_MEMSET) + num_partial_memset++; } /* Don't distribute current loop into too many loops given we don't have memory stream cost model. Be even more conservative in case of loop nest distribution. */ - if ((same_type_p && num_builtin == 0) + if ((same_type_p && num_builtin == 0 + && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1)) || (loop->inner != NULL && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) || (loop->inner == NULL @@ -2786,7 +2826,7 @@ distribute_loop (struct loop *loop, vec stmts, for (i = 0; partitions.iterate (i, &into); ++i) { bool changed = false; - if (partition_builtin_p (into)) + if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) continue; for (int j = i + 1; partitions.iterate (j, &partition); ++j) @@ -2966,10 +3006,12 @@ prepare_perfect_loop_nest (struct loop *loop) struct loop *outer = loop_outer (loop); tree niters = number_of_latch_executions (loop); - /* TODO: We only support the innermost 2-level loop nest distribution + /* TODO: We only support the innermost 3-level loop nest distribution because of compilation time issue for now. This should be relaxed - in the future. */ - while (loop->inner == NULL + in the future. Note we only allow 3-level loop nest distribution + when parallelizing loops. */ + while ((loop->inner == NULL + || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) && loop_outer (outer) && outer->inner == loop && loop->next == NULL && single_exit (outer)