[AArch64] Use vec_perm_indices helper routines

This patch makes the AArch64 vec_perm_const code use the new
vec_perm_indices routines, instead of checking each element individually.
This means that the checks extend naturally to variable-length vectors.
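
As an illustration of the new interface, here is a minimal standalone sketch
of what a series_p-style check means (assumed semantics only; the real
routine is vec_perm_indices::series_p and operates on the compressed index
encoding rather than on an explicit index vector):

    // Sketch only: positions out_base, out_base + out_step, ... of the
    // selector must pick input elements in_base, in_base + in_step, ...
    // Comparing modulo the total number of input elements lets the same
    // vector be passed in twice.
    #include <cstdint>
    #include <vector>

    static bool
    series_p (const std::vector<uint64_t> &perm, uint64_t total_elts,
              uint64_t out_base, uint64_t out_step,
              uint64_t in_base, uint64_t in_step)
    {
      uint64_t expected = in_base;
      for (uint64_t i = out_base; i < perm.size (); i += out_step)
        {
          if (perm[i] % total_elts != expected % total_elts)
            return false;
          expected += in_step;
        }
      return true;
    }

For example, the TRN1 selector { 0, 4, 2, 6 } for nelt == 4 satisfies
series_p (perm, 8, 0, 2, 0, 2) and series_p (perm, 8, 1, 2, 4, 2), which is
the pair of conditions aarch64_evpc_trn now tests.  Because the real routine
inspects only the encoded pattern, the check no longer depends on
enumerating a compile-time-constant number of elements.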

Also, aarch64_evpc_dup was the only function that generated rtl when
testing_p is true, and that looked accidental.  The patch adds the
missing check and then replaces the gen_rtx_REG/start_sequence/
end_sequence stuff with an assert that no rtl is generated.
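
Paraphrasing the fixed control flow (the exact code is in the diff below):
each aarch64_evpc_* routine must reach a yes/no answer without side effects,
and the caller can then verify that contract directly:

    /* In aarch64_evpc_dup: decide first, emit rtl only when asked to.  */
    if (d->perm.encoding ().encoded_nelts () != 1)
      return false;   /* Not a duplicate of a single element.  */

    /* Success!  */
    if (d->testing_p)
      return true;    /* The previously missing early return.  */

    /* In the caller, instead of a scratch insn sequence:  */
    rtx_insn *last = get_last_insn ();
    bool ret = aarch64_expand_vec_perm_const_1 (&d);
    gcc_assert (last == get_last_insn ());

Unlike the old start_sequence/end_sequence wrapper, which silently discarded
any rtl emitted while testing, the assert makes such leakage fail loudly.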

2018-01-09  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
	* config/aarch64/aarch64.c (aarch64_evpc_trn): Use d.perm.series_p
	instead of checking each element individually.
	(aarch64_evpc_uzp): Likewise.
	(aarch64_evpc_zip): Likewise.
	(aarch64_evpc_ext): Likewise.
	(aarch64_evpc_rev): Likewise.
	(aarch64_evpc_dup): Test the encoding for a single duplicated element,
	instead of checking each element individually.  Return true without
	generating rtl if testing_p is true.
	(aarch64_vectorize_vec_perm_const): Use all_from_input_p to test
	whether all selected elements come from the same input, instead of
	checking each element individually.  Remove calls to gen_rtx_REG,
	start_sequence and end_sequence and instead assert that no rtl is
	generated.

From-SVN: r256385
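
For reference, a hedged sketch of the all_from_input_p test named in the
ChangeLog above (assumed semantics; the real routine is a vec_perm_indices
member and works on the compressed encoding).  Input I supplies the index
range [I * nelts_per_input, (I + 1) * nelts_per_input):

    #include <cstdint>
    #include <vector>

    // Sketch only: true if every selector element reads from input INPUT.
    static bool
    all_from_input_p (const std::vector<uint64_t> &perm,
                      uint64_t nelts_per_input, unsigned int input)
    {
      for (uint64_t e : perm)
        if (e / nelts_per_input != input)
          return false;
      return true;
    }

If all elements come from input 1, aarch64_vectorize_vec_perm_const can set
op0 = op1 and treat the permutation as single-input, which is what the old
per-element loop over `which` computed.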
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2018-01-09  Richard Sandiford  <richard.sandiford@linaro.org>
+
+	* config/aarch64/aarch64.c (aarch64_evpc_trn): Use d.perm.series_p
+	instead of checking each element individually.
+	(aarch64_evpc_uzp): Likewise.
+	(aarch64_evpc_zip): Likewise.
+	(aarch64_evpc_ext): Likewise.
+	(aarch64_evpc_rev): Likewise.
+	(aarch64_evpc_dup): Test the encoding for a single duplicated element,
+	instead of checking each element individually.  Return true without
+	generating rtl if testing_p is true.
+	(aarch64_vectorize_vec_perm_const): Use all_from_input_p to test
+	whether all selected elements come from the same input, instead of
+	checking each element individually.  Remove calls to gen_rtx_REG,
+	start_sequence and end_sequence and instead assert that no rtl is
+	generated.
+
 2018-01-09  Richard Sandiford  <richard.sandiford@linaro.org>
 
 	* config/aarch64/aarch64.c (aarch64_legitimate_constant_p): Fix
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13317,7 +13317,7 @@ aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
 
 static bool
 aarch64_evpc_trn (struct expand_vec_perm_d *d)
 {
-  unsigned int i, odd, mask, nelt = d->perm.length ();
+  unsigned int odd, nelt = d->perm.length ();
   rtx out, in0, in1, x;
   machine_mode vmode = d->vmode;
@@ -13326,21 +13326,11 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
 
   /* Note that these are little-endian tests.
      We correct for big-endian later.  */
-  if (d->perm[0] == 0)
-    odd = 0;
-  else if (d->perm[0] == 1)
-    odd = 1;
-  else
+  odd = d->perm[0];
+  if ((odd != 0 && odd != 1)
+      || !d->perm.series_p (0, 2, odd, 2)
+      || !d->perm.series_p (1, 2, nelt + odd, 2))
     return false;
-  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
-
-  for (i = 0; i < nelt; i += 2)
-    {
-      if (d->perm[i] != i + odd)
-        return false;
-      if (d->perm[i + 1] != ((i + nelt + odd) & mask))
-        return false;
-    }
 
   /* Success!  */
   if (d->testing_p)
@@ -13364,7 +13354,7 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
 
 static bool
 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
 {
-  unsigned int i, odd, mask, nelt = d->perm.length ();
+  unsigned int odd;
   rtx out, in0, in1, x;
   machine_mode vmode = d->vmode;
@@ -13373,20 +13363,10 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
 
   /* Note that these are little-endian tests.
      We correct for big-endian later.  */
-  if (d->perm[0] == 0)
-    odd = 0;
-  else if (d->perm[0] == 1)
-    odd = 1;
-  else
+  odd = d->perm[0];
+  if ((odd != 0 && odd != 1)
+      || !d->perm.series_p (0, 1, odd, 2))
     return false;
-  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
-
-  for (i = 0; i < nelt; i++)
-    {
-      unsigned elt = (i * 2 + odd) & mask;
-      if (d->perm[i] != elt)
-        return false;
-    }
 
   /* Success!  */
   if (d->testing_p)
@@ -13410,7 +13390,7 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
 
 static bool
 aarch64_evpc_zip (struct expand_vec_perm_d *d)
 {
-  unsigned int i, high, mask, nelt = d->perm.length ();
+  unsigned int high, nelt = d->perm.length ();
   rtx out, in0, in1, x;
   machine_mode vmode = d->vmode;
@@ -13419,25 +13399,11 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
 
   /* Note that these are little-endian tests.
      We correct for big-endian later.  */
-  high = nelt / 2;
-  if (d->perm[0] == high)
-    /* Do Nothing.  */
-    ;
-  else if (d->perm[0] == 0)
-    high = 0;
-  else
+  high = d->perm[0];
+  if ((high != 0 && high * 2 != nelt)
+      || !d->perm.series_p (0, 2, high, 1)
+      || !d->perm.series_p (1, 2, high + nelt, 1))
     return false;
-  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
-
-  for (i = 0; i < nelt / 2; i++)
-    {
-      unsigned elt = (i + high) & mask;
-      if (d->perm[i * 2] != elt)
-        return false;
-      elt = (elt + nelt) & mask;
-      if (d->perm[i * 2 + 1] != elt)
-        return false;
-    }
 
   /* Success!  */
   if (d->testing_p)
@@ -13462,23 +13428,14 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
 static bool
 aarch64_evpc_ext (struct expand_vec_perm_d *d)
 {
-  unsigned int i, nelt = d->perm.length ();
+  unsigned int nelt = d->perm.length ();
   rtx offset;
 
   unsigned int location = d->perm[0]; /* Always < nelt.  */
 
   /* Check if the extracted indices are increasing by one.  */
-  for (i = 1; i < nelt; i++)
-    {
-      unsigned int required = location + i;
-      if (d->one_vector_p)
-        {
-          /* We'll pass the same vector in twice, so allow indices to wrap.  */
-          required &= (nelt - 1);
-        }
-      if (d->perm[i] != required)
-        return false;
-    }
+  if (!d->perm.series_p (0, 1, location, 1))
+    return false;
 
   /* Success!  */
   if (d->testing_p)
@@ -13510,7 +13467,7 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
 static bool
 aarch64_evpc_rev (struct expand_vec_perm_d *d)
 {
-  unsigned int i, j, diff, size, unspec, nelt = d->perm.length ();
+  unsigned int i, diff, size, unspec;
 
   if (!d->one_vector_p)
     return false;
@@ -13526,18 +13483,10 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d)
   else
     return false;
 
-  for (i = 0; i < nelt ; i += diff + 1)
-    for (j = 0; j <= diff; j += 1)
-      {
-        /* This is guaranteed to be true as the value of diff
-           is 7, 3, 1 and we should have enough elements in the
-           queue to generate this.  Getting a vector mask with a
-           value of diff other than these values implies that
-           something is wrong by the time we get here.  */
-        gcc_assert (i + j < nelt);
-        if (d->perm[i + j] != i + diff - j)
-          return false;
-      }
+  unsigned int step = diff + 1;
+  for (i = 0; i < step; ++i)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
 
   /* Success!  */
   if (d->testing_p)
@@ -13554,15 +13503,17 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d)
   rtx out = d->target;
   rtx in0;
   machine_mode vmode = d->vmode;
-  unsigned int i, elt, nelt = d->perm.length ();
+  unsigned int elt;
   rtx lane;
 
+  if (d->perm.encoding ().encoded_nelts () != 1)
+    return false;
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
   elt = d->perm[0];
-  for (i = 1; i < nelt; i++)
-    {
-      if (elt != d->perm[i])
-        return false;
-    }
 
   /* The generic preparation in aarch64_expand_vec_perm_const_1
      swaps the operand order and the permute indices if it finds
@@ -13650,61 +13601,37 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
 				  rtx op1, const vec_perm_indices &sel)
 {
   struct expand_vec_perm_d d;
-  unsigned int i, which;
+
+  /* Check whether the mask can be applied to a single vector.  */
+  if (op0 && rtx_equal_p (op0, op1))
+    d.one_vector_p = true;
+  else if (sel.all_from_input_p (0))
+    {
+      d.one_vector_p = true;
+      op1 = op0;
+    }
+  else if (sel.all_from_input_p (1))
+    {
+      d.one_vector_p = true;
+      op0 = op1;
+    }
+  else
+    d.one_vector_p = false;
 
+  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
+		     sel.nelts_per_input ());
   d.vmode = vmode;
   d.target = target;
   d.op0 = op0;
   d.op1 = op1;
   d.testing_p = !target;
 
-  /* Calculate whether all elements are in one vector.  */
-  unsigned int nelt = sel.length ();
-  for (i = which = 0; i < nelt; ++i)
-    {
-      unsigned int ei = sel[i] & (2 * nelt - 1);
-      which |= (ei < nelt ? 1 : 2);
-    }
-
-  switch (which)
-    {
-    default:
-      gcc_unreachable ();
-
-    case 3:
-      d.one_vector_p = false;
-      if (d.testing_p || !rtx_equal_p (op0, op1))
-        break;
-
-      /* The elements of PERM do not suggest that only the first operand
-         is used, but both operands are identical.  Allow easier matching
-         of the permutation by folding the permutation into the single
-         input vector.  */
-      /* Fall Through.  */
-    case 2:
-      d.op0 = op1;
-      d.one_vector_p = true;
-      break;
-
-    case 1:
-      d.op1 = op0;
-      d.one_vector_p = true;
-      break;
-    }
-  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, nelt);
-
   if (!d.testing_p)
     return aarch64_expand_vec_perm_const_1 (&d);
 
-  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
-  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
-  if (!d.one_vector_p)
-    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
-
-  start_sequence ();
+  rtx_insn *last = get_last_insn ();
   bool ret = aarch64_expand_vec_perm_const_1 (&d);
-  end_sequence ();
+  gcc_assert (last == get_last_insn ());
 
   return ret;
 }