[AArch64] Handle more SVE predicate constants

This patch handles more predicate constants by using TRN1, TRN2
and EOR.  For now, only one operation is allowed before we fall
back to loading from memory or doing an integer move and a compare.
The EOR support includes the important special case of an inverted
predicate.

The real motivating case for this is the ACLE svdupq function,
which allows a repeating 16-bit predicate to be built from
individual scalar booleans.  It's not easy to test properly
before that support is merged.
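
As a rough, standalone illustration of the EOR special case (not part of the
patch), the sketch below models an SVE .B predicate as a plain array of lane
flags and checks that EORing the cheap "inverted" constant with an all-true
PTRUE reproduces the original constant.  The 16-lane vector length and the
particular constant are arbitrary choices for the example.

/* Illustration only, not GCC code: model predicates as arrays of lane
   flags and check that EOR with an all-true PTRUE undoes a cheap
   "inverted" constant.  VL is an arbitrary example vector length.  */
#include <stdio.h>

#define VL 16

int
main (void)
{
  int target[VL], inv[VL], eor[VL];

  /* Target .B predicate: every lane active except lane 0.  Loading this
     directly needs a literal pool or an integer move plus a compare.  */
  for (int i = 0; i < VL; ++i)
    target[i] = (i != 0);

  /* Inverting every lane gives { 1, 0, 0, ... }, which is just a
     PTRUE .B, VL1 and therefore cheap to materialize.  */
  for (int i = 0; i < VL; ++i)
    inv[i] = !target[i];

  /* EOR (exclusive or) with an all-true .B PTRUE restores the target.  */
  for (int i = 0; i < VL; ++i)
    eor[i] = inv[i] ^ 1;

  for (int i = 0; i < VL; ++i)
    if (eor[i] != target[i])
      {
        printf ("mismatch at lane %d\n", i);
        return 1;
      }
  printf ("inverted VL1 constant EOR PTRUE .B ALL == target\n");
  return 0;
}

This is the shape the new peel_ind_1.c scan checks for: a PTRUE .B, VL1
followed by a predicated NOT (NOT on predicates is an alias of EOR with the
governing predicate).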

2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64.c (aarch64_expand_sve_const_pred_eor)
	(aarch64_expand_sve_const_pred_trn): New functions.
	(aarch64_expand_sve_const_pred_1): Add a recurse_p parameter and
	use the above functions when the parameter is true.
	(aarch64_expand_sve_const_pred): Update call accordingly.
	* config/aarch64/aarch64-sve.md (*aarch64_sve_<perm_insn><mode>):
	Rename to...
	(@aarch64_sve_<perm_insn><mode>): ...this.

gcc/testsuite/
	* gcc.target/aarch64/sve/peel_ind_1.c: Look for an inverted .B VL1.
	* gcc.target/aarch64/sve/peel_ind_2.c: Likewise .S VL7.

From-SVN: r274434
parent df4ac85fd9
commit 2803bc3bbc
6 changed files with 188 additions and 4 deletions

gcc/ChangeLog

@@ -1,3 +1,14 @@
+2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* config/aarch64/aarch64.c (aarch64_expand_sve_const_pred_eor)
+	(aarch64_expand_sve_const_pred_trn): New functions.
+	(aarch64_expand_sve_const_pred_1): Add a recurse_p parameter and
+	use the above functions when the parameter is true.
+	(aarch64_expand_sve_const_pred): Update call accordingly.
+	* config/aarch64/aarch64-sve.md (*aarch64_sve_<perm_insn><mode>):
+	Rename to...
+	(@aarch64_sve_<perm_insn><mode>): ...this.
+
 2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* config/aarch64/aarch64-protos.h (aarch64_sve_same_pred_for_ptest_p):

gcc/config/aarch64/aarch64-sve.md

@@ -3676,7 +3676,7 @@
 ;; Permutes that take half the elements from one vector and half the
 ;; elements from the other.
-(define_insn "*aarch64_sve_<perm_insn><mode>"
+(define_insn "@aarch64_sve_<perm_insn><mode>"
   [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
        (unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa")
                          (match_operand:PRED_ALL 2 "register_operand" "Upa")]

gcc/config/aarch64/aarch64.c

@@ -3751,13 +3751,163 @@ aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
   return target;
 }
 
+static rtx
+aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
+
+/* BUILDER is a constant predicate in which the index of every set bit
+   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
+   by inverting every element at a multiple of ELT_SIZE and EORing the
+   result with an ELT_SIZE PTRUE.
+
+   Return a register that contains the constant on success, otherwise
+   return null.  Use TARGET as the register if it is nonnull and
+   convenient.  */
+
+static rtx
+aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
+                                   unsigned int elt_size)
+{
+  /* Invert every element at a multiple of ELT_SIZE, keeping the
+     other bits zero.  */
+  rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
+                                  builder.nelts_per_pattern ());
+  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
+    if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
+      inv_builder.quick_push (const1_rtx);
+    else
+      inv_builder.quick_push (const0_rtx);
+  inv_builder.finalize ();
+
+  /* See if we can load the constant cheaply.  */
+  rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
+  if (!inv)
+    return NULL_RTX;
+
+  /* EOR the result with an ELT_SIZE PTRUE.  */
+  rtx mask = aarch64_ptrue_all (elt_size);
+  mask = force_reg (VNx16BImode, mask);
+  target = aarch64_target_reg (target, VNx16BImode);
+  emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
+  return target;
+}
+
+/* BUILDER is a constant predicate in which the index of every set bit
+   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
+   using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
+   register on success, otherwise return null.  Use TARGET as the register
+   if nonnull and convenient.  */
+
+static rtx
+aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
+                                   unsigned int elt_size,
+                                   unsigned int permute_size)
+{
+  /* We're going to split the constant into two new constants A and B,
+     with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
+     and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
+
+     A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
+     B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
+
+     where _ indicates elements that will be discarded by the permute.
+
+     First calculate the ELT_SIZEs for A and B.  */
+  unsigned int a_elt_size = GET_MODE_SIZE (DImode);
+  unsigned int b_elt_size = GET_MODE_SIZE (DImode);
+  for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
+    if (INTVAL (builder.elt (i)) != 0)
+      {
+        if (i & permute_size)
+          b_elt_size |= i - permute_size;
+        else
+          a_elt_size |= i;
+      }
+  a_elt_size &= -a_elt_size;
+  b_elt_size &= -b_elt_size;
+
+  /* Now construct the vectors themselves.  */
+  rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
+                                builder.nelts_per_pattern ());
+  rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
+                                builder.nelts_per_pattern ());
+  unsigned int nelts = builder.encoded_nelts ();
+  for (unsigned int i = 0; i < nelts; ++i)
+    if (i & (elt_size - 1))
+      {
+        a_builder.quick_push (const0_rtx);
+        b_builder.quick_push (const0_rtx);
+      }
+    else if ((i & permute_size) == 0)
+      {
+        /* The A and B elements are significant.  */
+        a_builder.quick_push (builder.elt (i));
+        b_builder.quick_push (builder.elt (i + permute_size));
+      }
+    else
+      {
+        /* The A and B elements are going to be discarded, so pick whatever
+           is likely to give a nice constant.  We are targeting element
+           sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
+           with the aim of each being a sequence of ones followed by
+           a sequence of zeros.  So:
+
+           * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
+             duplicate the last X_ELT_SIZE element, to extend the
+             current sequence of ones or zeros.
+
+           * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
+             zero, so that the constant really does have X_ELT_SIZE and
+             not a smaller size.  */
+        if (a_elt_size > permute_size)
+          a_builder.quick_push (const0_rtx);
+        else
+          a_builder.quick_push (a_builder.elt (i - a_elt_size));
+        if (b_elt_size > permute_size)
+          b_builder.quick_push (const0_rtx);
+        else
+          b_builder.quick_push (b_builder.elt (i - b_elt_size));
+      }
+  a_builder.finalize ();
+  b_builder.finalize ();
+
+  /* Try loading A into a register.  */
+  rtx_insn *last = get_last_insn ();
+  rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
+  if (!a)
+    return NULL_RTX;
+
+  /* Try loading B into a register.  */
+  rtx b = a;
+  if (a_builder != b_builder)
+    {
+      b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
+      if (!b)
+        {
+          delete_insns_since (last);
+          return NULL_RTX;
+        }
+    }
+
+  /* Emit the TRN1 itself.  */
+  machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
+  target = aarch64_target_reg (target, mode);
+  emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
+                              gen_lowpart (mode, a),
+                              gen_lowpart (mode, b)));
+  return target;
+}
+
 /* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
    constant in BUILDER into an SVE predicate register.  Return the register
    on success, otherwise return null.  Use TARGET for the register if
-   nonnull and convenient.  */
+   nonnull and convenient.
+
+   ALLOW_RECURSE_P is true if we can use methods that would call this
+   function recursively.  */
 
 static rtx
-aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder)
+aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
+                                 bool allow_recurse_p)
 {
   if (builder.encoded_nelts () == 1)
     /* A PFALSE or a PTRUE .B ALL.  */
@@ -3775,6 +3925,22 @@ aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder)
       return aarch64_sve_move_pred_via_while (target, mode, vl);
     }
 
+  if (!allow_recurse_p)
+    return NULL_RTX;
+
+  /* Try inverting the vector in element size ELT_SIZE and then EORing
+     the result with an ELT_SIZE PTRUE.  */
+  if (INTVAL (builder.elt (0)) == 0)
+    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
+                                                     elt_size))
+      return res;
+
+  /* Try using TRN1 to permute two simpler constants.  */
+  for (unsigned int i = elt_size; i <= 8; i *= 2)
+    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
+                                                     elt_size, i))
+      return res;
+
   return NULL_RTX;
 }
 
@@ -3789,7 +3955,7 @@ static rtx
 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
 {
   /* Try loading the constant using pure predicate operations.  */
-  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder))
+  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
     return res;
 
   /* Try forcing the constant to memory.  */
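
The A/B split performed by aarch64_expand_sve_const_pred_trn above can be hard
to picture from the comment alone.  The following standalone sketch (again not
part of the patch) models the split and the TRN1 recombination for the worked
case in the function comment: a 16-lane .B predicate with PERMUTE_SIZE == 4.
The concrete bit pattern is an arbitrary choice for the example.

/* Illustration only, not GCC code: split a predicate into A and B the way
   aarch64_expand_sve_const_pred_trn does and check that a TRN1 at
   PERMUTE_SIZE lanes recombines them into the original constant.  */
#include <stdio.h>

#define VL 16
#define PERMUTE_SIZE 4

int
main (void)
{
  /* An arbitrary 16-lane .B predicate constant.  */
  int target[VL] = { 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0 };
  int a[VL], b[VL], trn1[VL];

  /* Element I goes into A if (I & PERMUTE_SIZE) == 0 and into B otherwise;
     B stores it PERMUTE_SIZE lanes earlier.  Lanes that the permute will
     discard are simply zeroed here.  */
  for (int i = 0; i < VL; ++i)
    if ((i & PERMUTE_SIZE) == 0)
      {
        a[i] = target[i];
        b[i] = target[i + PERMUTE_SIZE];
      }
    else
      a[i] = b[i] = 0;

  /* TRN1 at element size PERMUTE_SIZE: even-numbered chunks of the result
     come from the even-numbered chunks of A, odd-numbered chunks from the
     corresponding even-numbered chunks of B.  */
  for (int i = 0; i < VL; ++i)
    {
      int chunk = i / PERMUTE_SIZE;
      int lane = i % PERMUTE_SIZE;
      if (chunk % 2 == 0)
        trn1[i] = a[chunk * PERMUTE_SIZE + lane];
      else
        trn1[i] = b[(chunk - 1) * PERMUTE_SIZE + lane];
    }

  for (int i = 0; i < VL; ++i)
    if (trn1[i] != target[i])
      {
        printf ("mismatch at lane %d\n", i);
        return 1;
      }
  printf ("TRN1 of the two halves reproduces the original predicate\n");
  return 0;
}

Each half only has to supply half of the original's elements, and the
don't-care lanes can be filled to give a simple ones-followed-by-zeros
constant, which is exactly the kind of predicate the earlier cases in
aarch64_expand_sve_const_pred_1 can load directly.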

gcc/testsuite/ChangeLog

@@ -1,3 +1,8 @@
+2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.target/aarch64/sve/peel_ind_1.c: Look for an inverted .B VL1.
+	* gcc.target/aarch64/sve/peel_ind_2.c: Likewise .S VL7.
+
 2019-08-14  Paolo Carlini  <paolo.carlini@oracle.com>
 
 	* g++.dg/parse/typedef9.C: Test locations too.

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c

@@ -25,3 +25,4 @@ foo (void)
 /* We should use an induction that starts at -5, with only the last
    7 elements of the first iteration being active.  */
 /* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #-5, #5\n} } } */
+/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+\.b), vl1\n.*\tnot\tp[0-7]\.b, p[0-7]/z, \1\n} } } */

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c

@@ -20,3 +20,4 @@ foo (void)
 /* { dg-final { scan-assembler {\t(adrp|adr)\tx[0-9]+, x\n} } } */
 /* We should unroll the loop three times.  */
 /* { dg-final { scan-assembler-times "\tst1w\t" 3 } } */
+/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.s, vl7\n.*\teor\tp[0-7]\.b, (p[0-7])/z, (\1\.b, \2\.b|\2\.b, \1\.b)\n} } } */