Restore correct iv step for fully-masked loops

r272233 introduced a large number of execution failures on SVE.
The patch hard-coded an IV step of VF, but for SLP groups it needs
to be VF * group size.

Also, iv_precision had type widest_int but only needs to be unsigned int.

2019-06-18  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): Remove
	vf parameter.  Restore the previous iv step of nscalars_step,
	but give it iv_type rather than compare_type.  Tweak code order
	to match the comments.
	(vect_set_loop_condition_masked): Update accordingly.
	* tree-vect-loop.c (vect_verify_full_masking): Use "unsigned int"
	for iv_precision.  Tweak comment formatting.

From-SVN: r272411
This commit is contained in:
Richard Sandiford 2019-06-18 09:18:17 +00:00 committed by Richard Sandiford
parent a9e47ccf26
commit fcae0292de
3 changed files with 37 additions and 32 deletions

View File

@ -1,3 +1,13 @@
2019-06-18 Richard Sandiford <richard.sandiford@arm.com>
* tree-vect-loop-manip.c (vect_set_loop_masks_directly): Remove
vf parameter. Restore the previous iv step of nscalars_step,
but give it iv_type rather than compare_type. Tweak code order
to match the comments.
(vect_set_loop_condition_masked): Update accordingly.
* tree-vect-loop.c (vect_verify_full_masking): Use "unsigned int"
for iv_precision. Tweak comment formatting.
2019-06-18 Iain Sandoe <iain@sandoe.co.uk>
* config/darwin.c: Strip trailing whitespace.

View File

@ -382,8 +382,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
Use LOOP_COND_GSI to insert code before the exit gcond.
RGM belongs to loop LOOP. The loop originally iterated NITERS
times and has been vectorized according to LOOP_VINFO. Each iteration
of the vectorized loop handles VF iterations of the scalar loop.
times and has been vectorized according to LOOP_VINFO.
If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
starts with NITERS_SKIP dummy iterations of the scalar loop before
@ -410,8 +409,7 @@ static tree
vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
gimple_seq *preheader_seq,
gimple_stmt_iterator loop_cond_gsi,
rgroup_masks *rgm, tree vf,
tree niters, tree niters_skip,
rgroup_masks *rgm, tree niters, tree niters_skip,
bool might_wrap_p)
{
tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
@ -419,26 +417,28 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
tree mask_type = rgm->mask_type;
unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* Calculate the maximum number of scalar values that the rgroup
handles in total, the number that it handles for each iteration
of the vector loop, and the number that it should skip during the
first iteration of the vector loop. */
tree nscalars_total = niters;
tree nscalars_step = vf;
tree nscalars_step = build_int_cst (iv_type, vf);
tree nscalars_skip = niters_skip;
if (nscalars_per_iter != 1)
{
/* We checked before choosing to use a fully-masked loop that these
multiplications don't overflow. */
tree factor = build_int_cst (compare_type, nscalars_per_iter);
tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
nscalars_total, factor);
nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
nscalars_step, factor);
nscalars_total, compare_factor);
nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
nscalars_step, iv_factor);
if (nscalars_skip)
nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
nscalars_skip, factor);
nscalars_skip, compare_factor);
}
/* Create an induction variable that counts the number of scalars
@ -447,15 +447,10 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
gimple_stmt_iterator incr_gsi;
bool insert_after;
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
&incr_gsi, insert_after, &index_before_incr, &index_after_incr);
tree zero_index = build_int_cst (iv_type, 0);
tree step = build_int_cst (iv_type,
LOOP_VINFO_VECT_FACTOR (loop_vinfo));
/* Create IV of iv_type. */
create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
insert_after, &index_before_incr, &index_after_incr);
zero_index = build_int_cst (compare_type, 0);
tree zero_index = build_int_cst (compare_type, 0);
tree test_index, test_limit, first_limit;
gimple_stmt_iterator *test_gsi;
if (might_wrap_p)
@ -487,7 +482,8 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
where the rightmost subtraction can be done directly in
COMPARE_TYPE. */
test_index = index_before_incr;
tree adjust = nscalars_step;
tree adjust = gimple_convert (preheader_seq, compare_type,
nscalars_step);
if (nscalars_skip)
adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
adjust, nscalars_skip);
@ -531,14 +527,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
first_limit = test_limit;
}
/* Provide a definition of each mask in the group. */
tree next_mask = NULL_TREE;
tree mask;
unsigned int i;
/* Convert the IV value to the comparison type (either a no-op or
a demotion). */
gimple_seq test_seq = NULL;
test_index = gimple_convert (&test_seq, compare_type, test_index);
gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
/* Provide a definition of each mask in the group. */
tree next_mask = NULL_TREE;
tree mask;
unsigned int i;
FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
{
/* Previous masks will cover BIAS scalars. This mask covers the
@ -672,9 +670,6 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
niters = gimple_convert (&preheader_seq, compare_type, niters);
widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
/* Get the vectorization factor in tree form. */
tree vf = build_int_cst (compare_type,
LOOP_VINFO_VECT_FACTOR (loop_vinfo));
/* Iterate over all the rgroups and fill in their masks. We could use
the first mask from any rgroup for the loop condition; here we
@ -709,7 +704,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
/* Set up all masks for this group. */
test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
&preheader_seq,
loop_cond_gsi, rgm, vf,
loop_cond_gsi, rgm,
niters, niters_skip,
might_wrap_p);
}

View File

@ -1062,7 +1062,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
tree cmp_type = NULL_TREE;
tree iv_type = NULL_TREE;
widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
widest_int iv_precision = UINT_MAX;
unsigned int iv_precision = UINT_MAX;
if (iv_limit != -1)
iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
@ -1083,12 +1083,12 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
best choice:
- An IV that's Pmode or wider is more likely to be reusable
in address calculations than an IV that's narrower than
Pmode.
in address calculations than an IV that's narrower than
Pmode.
- Doing the comparison in IV_PRECISION or wider allows
a natural 0-based IV, whereas using a narrower comparison
type requires mitigations against wrap-around.
a natural 0-based IV, whereas using a narrower comparison
type requires mitigations against wrap-around.
Conversely, if the IV limit is variable, doing the comparison
in a wider type than the original type can introduce