re PR target/88838 ([SVE] Use 32-bit WHILELO in LP64 mode)

gcc/ChangeLog:

2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	PR target/88838
	* tree-vect-loop-manip.c (vect_set_loop_masks_directly): If the
	compare_type is not of Pmode size, create an IV of Pmode size
	whose uses are truncated (i.e. converted to the correct type).
	* tree-vect-loop.c (vect_verify_full_masking): Find IV type.
	(vect_iv_limit_for_full_masking): New. Factored out of
	vect_set_loop_condition_masked.
	* tree-vectorizer.h (LOOP_VINFO_MASK_IV_TYPE): New.
	(vect_iv_limit_for_full_masking): Declare.

gcc/testsuite/ChangeLog:

2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	PR target/88838
	* gcc.target/aarch64/pr88838.c: New test.
	* gcc.target/aarch64/sve/while_1.c: Adjust.

From-SVN: r272233
Kugan Vivekanandarajah 2019-06-13 03:34:28 +00:00 committed by Kugan Vivekanandarajah
parent fa9863e7d3
commit 9b884225bf
7 changed files with 138 additions and 44 deletions
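
The scheme the ChangeLog describes can be pictured with a minimal scalar C sketch (illustrative only; the function name and the fixed vectorization factor are made up for this example and are not part of the patch): the loop-control IV lives in a Pmode-sized type and is truncated to the narrower compare type only for the exit test, which is what lets the AArch64 port emit the 32-bit (w-register) WHILELO checked by the adjusted tests below.

void
sketch (int *restrict x, int *restrict y, int *restrict z, int n)
{
  const unsigned int vf = 4;          /* assumed vectorization factor */
  unsigned long iv = 0;               /* IV in iv_type (64-bit on LP64) */
  unsigned int niters;

  if (n <= 0)
    return;
  niters = (unsigned int) n;          /* loop bound in compare_type (32-bit) */

  while ((unsigned int) iv < niters)  /* truncated use: 32-bit comparison */
    {
      /* Stand-in for one masked vector iteration over up to VF elements.  */
      for (unsigned int j = 0; j < vf && (unsigned int) iv + j < niters; ++j)
        x[iv + j] = y[iv + j] + z[iv + j];
      iv += vf;                       /* step the wide IV by the VF */
    }
}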

gcc/ChangeLog

@@ -1,3 +1,15 @@
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88838
* tree-vect-loop-manip.c (vect_set_loop_masks_directly): If the
compare_type is not of Pmode size, create an IV of Pmode size
whose uses are truncated (i.e. converted to the correct type).
* tree-vect-loop.c (vect_verify_full_masking): Find IV type.
(vect_iv_limit_for_full_masking): New. Factored out of
vect_set_loop_condition_masked.
* tree-vectorizer.h (LOOP_VINFO_MASK_IV_TYPE): New.
(vect_iv_limit_for_full_masking): Declare.
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88834

gcc/testsuite/ChangeLog

@@ -1,3 +1,9 @@
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88838
* gcc.target/aarch64/pr88838.c: New test.
* gcc.target/aarch64/sve/while_1.c: Adjust.
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88834

gcc/testsuite/gcc.target/aarch64/pr88838.c

@@ -0,0 +1,11 @@
/* { dg-do compile } */
/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
void
f (int *restrict x, int *restrict y, int *restrict z, int n)
{
  for (int i = 0; i < n; i += 1)
    x[i] = y[i] + z[i];
}
/* { dg-final { scan-assembler-not "sxtw" } } */

gcc/testsuite/gcc.target/aarch64/sve/while_1.c

@@ -26,14 +26,14 @@
TEST_ALL (ADD_LOOP)
/* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */

gcc/tree-vect-loop-manip.c

@@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
bool might_wrap_p)
{
tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
tree mask_type = rgm->mask_type;
unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
@@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
tree index_before_incr, index_after_incr;
gimple_stmt_iterator incr_gsi;
bool insert_after;
tree zero_index = build_int_cst (compare_type, 0);
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
tree zero_index = build_int_cst (iv_type, 0);
tree step = build_int_cst (iv_type,
LOOP_VINFO_VECT_FACTOR (loop_vinfo));
/* Create IV of iv_type. */
create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
insert_after, &index_before_incr, &index_after_incr);
zero_index = build_int_cst (compare_type, 0);
tree test_index, test_limit, first_limit;
gimple_stmt_iterator *test_gsi;
if (might_wrap_p)
@@ -529,6 +535,10 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
tree next_mask = NULL_TREE;
tree mask;
unsigned int i;
gimple_seq test_seq = NULL;
test_index = gimple_convert (&test_seq, compare_type, test_index);
gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
{
/* Previous masks will cover BIAS scalars. This mask covers the
@@ -637,12 +647,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
unsigned int compare_precision = TYPE_PRECISION (compare_type);
unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
tree orig_niters = niters;
/* Type of the initial value of NITERS. */
tree ni_actual_type = TREE_TYPE (niters);
unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
/* Convert NITERS to the same size as the compare. */
if (compare_precision > ni_actual_precision
@@ -661,33 +671,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
else
niters = gimple_convert (&preheader_seq, compare_type, niters);
/* Convert skip_niters to the right type. */
tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
/* Now calculate the value that the induction variable must be able
to hit in order to ensure that we end the loop with an all-false mask.
This involves adding the maximum number of inactive trailing scalar
iterations. */
widest_int iv_limit;
bool known_max_iters = max_loop_iterations (loop, &iv_limit);
if (known_max_iters)
{
if (niters_skip)
{
/* Add the maximum number of skipped iterations to the
maximum iteration count. */
if (TREE_CODE (niters_skip) == INTEGER_CST)
iv_limit += wi::to_widest (niters_skip);
else
iv_limit += max_vf - 1;
}
/* IV_LIMIT is the maximum number of latch iterations, which is also
the maximum in-range IV value. Round this value down to the previous
vector alignment boundary and then add an extra full iteration. */
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
}
widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
/* Get the vectorization factor in tree form. */
tree vf = build_int_cst (compare_type,
LOOP_VINFO_VECT_FACTOR (loop_vinfo));
@@ -717,7 +701,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
/* See whether zero-based IV would ever generate all-false masks
before wrapping around. */
bool might_wrap_p
= (!known_max_iters
= (iv_limit == -1
|| (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
UNSIGNED)
> compare_precision));
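
These hunks create the IV in iv_type and convert (truncate) it to compare_type only at the exit test; the might_wrap_p computation above is what decides whether a zero-based IV in the narrower comparison type could wrap around before an all-false mask is produced. A small self-contained illustration of that wrap-around hazard (made-up example, not part of the patch):

#include <stdio.h>

/* Step a zero-based IV by VF in a type that holds only IV_BITS bits and
   count how many "iv < niters" exit tests it takes to leave the loop
   (capped for safety).  A too-narrow IV wraps back to zero and never
   terminates the loop on its own.  */
static unsigned int
count_iters (unsigned int iv_bits, unsigned long niters, unsigned long vf)
{
  unsigned long mask = iv_bits >= 64 ? ~0UL : (1UL << iv_bits) - 1;
  unsigned long iv = 0;
  unsigned int iters = 0;
  while (iv < niters && iters < 1000)
    {
      iv = (iv + vf) & mask;   /* simulate wrap-around at IV_BITS bits */
      iters++;
    }
  return iters;
}

int
main (void)
{
  /* VF = 16, niters = 250: an 8-bit IV wraps from 240 back to 0 before it
     can reach 250, so only the safety cap stops the loop; a 64-bit
     (Pmode-sized) IV exits after 16 iterations.  */
  printf ("8-bit IV:  %u iterations\n", count_iters (8, 250, 16));
  printf ("64-bit IV: %u iterations\n", count_iters (64, 250, 16));
  return 0;
}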

gcc/tree-vect-loop.c

@@ -1030,6 +1030,8 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int min_ni_width;
unsigned int max_nscalars_per_iter
= vect_get_max_nscalars_per_iter (loop_vinfo);
/* Use a normal loop if there are no statements that need masking.
This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1050,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
max_ni = wi::smin (max_ni, max_back_edges + 1);
/* Account for rgroup masks, in which each bit is replicated N times. */
max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
max_ni *= max_nscalars_per_iter;
/* Work out how many bits we need to represent the limit. */
min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1058,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
/* Find a scalar mode for which WHILE_ULT is supported. */
opt_scalar_int_mode cmp_mode_iter;
tree cmp_type = NULL_TREE;
tree iv_type = NULL_TREE;
widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
widest_int iv_precision = UINT_MAX;
if (iv_limit != -1)
iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
UNSIGNED);
FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
{
unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1067,10 +1077,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
&& can_produce_all_loop_masks_p (loop_vinfo, this_type))
{
/* Although we could stop as soon as we find a valid mode,
it's often better to continue until we hit Pmode, since the
operands to the WHILE are more likely to be reusable in
address calculations. */
cmp_type = this_type;
there are at least two reasons why that's not always the
best choice:
- An IV that's Pmode or wider is more likely to be reusable
in address calculations than an IV that's narrower than
Pmode.
- Doing the comparison in IV_PRECISION or wider allows
a natural 0-based IV, whereas using a narrower comparison
type requires mitigations against wrap-around.
Conversely, if the IV limit is variable, doing the comparison
in a wider type than the original type can introduce
unnecessary extensions, so picking the widest valid mode
is not always a good choice either.
Here we prefer the first IV type that's Pmode or wider,
and the first comparison type that's IV_PRECISION or wider.
(The comparison type must be no wider than the IV type,
to avoid extensions in the vector loop.)
??? We might want to try continuing beyond Pmode for ILP32
targets if CMP_BITS < IV_PRECISION. */
iv_type = this_type;
if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
cmp_type = this_type;
if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
break;
}
@@ -1081,6 +1113,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
return false;
LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
return true;
}
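
To make the preference order above concrete: with the pr88838.c loop on an LP64 target, iv_limit * max_nscalars_per_iter needs far fewer than 32 bits, so the first comparison type that is at least IV_PRECISION bits wide (SImode) becomes cmp_type, while the scan keeps going until Pmode (DImode) to settle iv_type. A heavily simplified model of that selection (illustrative only; it ignores min_ni_width and can_produce_all_loop_masks_p):

#include <stdio.h>

int
main (void)
{
  /* Candidate integer modes in increasing width, as FOR_EACH_MODE_IN_CLASS
     would visit them for MODE_INT.  */
  const unsigned int candidate_bits[] = { 8, 16, 32, 64 };
  const unsigned int pmode_bits = 64;   /* LP64 target */
  const unsigned int iv_precision = 17; /* e.g. a small known iteration limit */

  unsigned int cmp_bits = 0, iv_bits = 0;
  for (unsigned int i = 0; i < 4; i++)
    {
      unsigned int bits = candidate_bits[i];
      iv_bits = bits;                    /* keep widening the IV type */
      if (cmp_bits == 0 || iv_precision > cmp_bits)
        cmp_bits = bits;                 /* first type wide enough for the compare */
      if (bits >= pmode_bits)
        break;                           /* stop once the IV type reaches Pmode */
    }
  printf ("cmp_type: %u bits, iv_type: %u bits\n", cmp_bits, iv_bits);
  return 0;
}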
@@ -9014,3 +9047,45 @@ optimize_mask_stores (struct loop *loop)
add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
}
}
/* Decide whether it is possible to use a zero-based induction variable
when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
return the value that the induction variable must be able to hold
in order to ensure that the loop ends with an all-false mask.
Return -1 otherwise. */
widest_int
vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
{
tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
/* Calculate the value that the induction variable must be able
to hit in order to ensure that we end the loop with an all-false mask.
This involves adding the maximum number of inactive trailing scalar
iterations. */
widest_int iv_limit = -1;
if (max_loop_iterations (loop, &iv_limit))
{
if (niters_skip)
{
/* Add the maximum number of skipped iterations to the
maximum iteration count. */
if (TREE_CODE (niters_skip) == INTEGER_CST)
iv_limit += wi::to_widest (niters_skip);
else
iv_limit += max_vf - 1;
}
else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
/* Make a conservatively-correct assumption. */
iv_limit += max_vf - 1;
/* IV_LIMIT is the maximum number of latch iterations, which is also
the maximum in-range IV value. Round this value down to the previous
vector alignment boundary and then add an extra full iteration. */
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
}
return iv_limit;
}
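
As a quick worked example of the rounding at the end of the new function (numbers made up): with a maximum latch-iteration count of 999, a vectorization factor whose known alignment is 4 and max_vf = 4, the limit becomes (999 & -4) + 4 = 996 + 4 = 1000, i.e. the previous VF boundary plus one extra full vector iteration. A plain-integer sketch of just that step:

#include <stdio.h>

/* Mirror of the final rounding step in vect_iv_limit_for_full_masking,
   using plain integers instead of widest_int/poly_uint64.  */
static long
round_iv_limit (long max_latch_iters, int vf_alignment, int max_vf)
{
  /* Round down to the previous vector alignment boundary, then add one
     full vector iteration.  */
  return (max_latch_iters & -(long) vf_alignment) + max_vf;
}

int
main (void)
{
  printf ("%ld\n", round_iv_limit (999, 4, 4));   /* 996 + 4 = 1000 */
  printf ("%ld\n", round_iv_limit (1000, 4, 4));  /* 1000 + 4 = 1004 */
  return 0;
}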

gcc/tree-vectorizer.h

@@ -435,6 +435,10 @@ typedef struct _loop_vec_info : public vec_info {
is false and vectorized loop otherwise. */
tree simd_if_cond;
/* Type of the IV to use in the WHILE_ULT call for fully-masked
loops. */
tree iv_type;
/* Unknown DRs according to which loop was peeled. */
struct dr_vec_info *unaligned_dr;
@@ -570,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info {
#define LOOP_VINFO_MASKS(L) (L)->masks
#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
#define LOOP_VINFO_MASK_COMPARE_TYPE(L) (L)->mask_compare_type
#define LOOP_VINFO_MASK_IV_TYPE(L) (L)->iv_type
#define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask
#define LOOP_VINFO_LOOP_NEST(L) (L)->shared->loop_nest
#define LOOP_VINFO_DATAREFS(L) (L)->shared->datarefs
@@ -1582,6 +1587,7 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
/* FORNOW: Used in tree-parloops.c. */
extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
bool *, bool);
extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);
/* Used in gimple-loop-interchange.c. */
extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
enum tree_code);