arm: Implement MVE predicates as vectors of booleans

This patch implements support for vectors of booleans to support MVE
predicates, instead of HImode.  Since the ABI mandates pred16_t (aka
uint16_t) to represent predicates in intrinsics prototypes, we
introduce a new "predicate" type qualifier so that we can map relevant
builtins HImode arguments and return value to the appropriate vector
of booleans (VxBI).

We have to update test_vector_ops_duplicate, because it iterates using
an offset in bytes, where we would need to iterate in bits: we stop
iterating when we reach the end of the vector of booleans.

In addition, we have to fix the underlying definition of vectors of
booleans because ARM/MVE needs a different representation than
AArch64/SVE. With ARM/MVE the 'true' bit is duplicated over the
element size, so that a true element of V4BI is represented by
'0b1111'.  This patch updates the aarch64 definition of VNx*BI as
needed.

Most of the work of this patch series was carried out while I was
working at STMicroelectronics as a Linaro assignee.

2022-02-22  Christophe Lyon  <christophe.lyon@arm.com>
	    Richard Sandiford  <richard.sandiford@arm.com>

	gcc/
	PR target/100757
	PR target/101325
	* config/aarch64/aarch64-modes.def (VNx16BI, VNx8BI, VNx4BI,
	VNx2BI): Update definition.
	* config/arm/arm-builtins.cc (arm_init_simd_builtin_types): Add new
	simd types.
	(arm_init_builtin): Map predicate vectors arguments to HImode.
	(arm_expand_builtin_args): Move HImode predicate arguments to VxBI
	rtx. Move return value to HImode rtx.
	* config/arm/arm-builtins.h (arm_type_qualifiers): Add qualifier_predicate.
	* config/arm/arm-modes.def (B2I, B4I, V16BI, V8BI, V4BI): New modes.
	* config/arm/arm-simd-builtin-types.def (Pred1x16_t,
	Pred2x8_t,Pred4x4_t): New.
	* emit-rtl.cc (init_emit_once): Handle all boolean modes.
	* genmodes.cc (mode_data): Add boolean field.
	(blank_mode): Initialize it.
	(make_complex_modes): Fix handling of boolean modes.
	(make_vector_modes): Likewise.
	(VECTOR_BOOL_MODE): Use new COMPONENT parameter.
	(make_vector_bool_mode): Likewise.
	(BOOL_MODE): New.
	(make_bool_mode): New.
	(emit_insn_modes_h): Fix generation of boolean modes.
	(emit_class_narrowest_mode): Likewise.
	* machmode.def: (VECTOR_BOOL_MODE): Document new COMPONENT
	parameter.  Use new BOOL_MODE instead of FRACTIONAL_INT_MODE to
	define BImode.
	* rtx-vector-builder.cc (rtx_vector_builder::find_cached_value):
	Fix handling of constm1_rtx for VECTOR_BOOL.
	* simplify-rtx.cc (native_encode_rtx): Fix support for VECTOR_BOOL.
	(native_decode_vector_rtx): Likewise.
	(test_vector_ops_duplicate): Skip vec_merge test
	with vectors of booleans.
	* varasm.cc (output_constant_pool_2): Likewise.
This commit is contained in:
Christophe Lyon 2021-10-13 09:16:22 +00:00 committed by Christophe Lyon
parent 0d0aaea105
commit 884f77b422
11 changed files with 162 additions and 56 deletions

View File

@ -47,10 +47,10 @@ ADJUST_FLOAT_FORMAT (HF, &ieee_half_format);
/* Vector modes. */
VECTOR_BOOL_MODE (VNx16BI, 16, 2);
VECTOR_BOOL_MODE (VNx8BI, 8, 2);
VECTOR_BOOL_MODE (VNx4BI, 4, 2);
VECTOR_BOOL_MODE (VNx2BI, 2, 2);
VECTOR_BOOL_MODE (VNx16BI, 16, BI, 2);
VECTOR_BOOL_MODE (VNx8BI, 8, BI, 2);
VECTOR_BOOL_MODE (VNx4BI, 4, BI, 2);
VECTOR_BOOL_MODE (VNx2BI, 2, BI, 2);
ADJUST_NUNITS (VNx16BI, aarch64_sve_vg * 8);
ADJUST_NUNITS (VNx8BI, aarch64_sve_vg * 4);

View File

@ -1553,11 +1553,28 @@ arm_init_simd_builtin_types (void)
tree eltype = arm_simd_types[i].eltype;
machine_mode mode = arm_simd_types[i].mode;
if (eltype == NULL)
if (eltype == NULL
/* VECTOR_BOOL is not supported unless MVE is activated,
this would make build_truth_vector_type_for_mode
crash. */
&& ((GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
|| !TARGET_HAVE_MVE))
continue;
if (arm_simd_types[i].itype == NULL)
{
tree type = build_vector_type (eltype, GET_MODE_NUNITS (mode));
tree type;
if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
{
/* Handle MVE predicates: they are internally stored as
16 bits, but are used as vectors of 1, 2 or 4-bit
elements. */
type = build_truth_vector_type_for_mode (GET_MODE_NUNITS (mode),
mode);
eltype = TREE_TYPE (type);
}
else
type = build_vector_type (eltype, GET_MODE_NUNITS (mode));
type = build_distinct_type_copy (type);
SET_TYPE_STRUCTURAL_EQUALITY (type);
@ -1695,6 +1712,11 @@ arm_init_builtin (unsigned int fcode, arm_builtin_datum *d,
if (qualifiers & qualifier_map_mode)
op_mode = d->mode;
/* MVE Predicates use HImode as mandated by the ABI: pred16_t is
unsigned short. */
if (qualifiers & qualifier_predicate)
op_mode = HImode;
/* For pointers, we want a pointer to the basic type
of the vector. */
if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode))
@ -2939,6 +2961,12 @@ arm_expand_builtin_args (rtx target, machine_mode map_mode, int fcode,
case ARG_BUILTIN_COPY_TO_REG:
if (POINTER_TYPE_P (TREE_TYPE (arg[argc])))
op[argc] = convert_memory_address (Pmode, op[argc]);
/* MVE uses mve_pred16_t (aka HImode) for vectors of
predicates. */
if (GET_MODE_CLASS (mode[argc]) == MODE_VECTOR_BOOL)
op[argc] = gen_lowpart (mode[argc], op[argc]);
/*gcc_assert (GET_MODE (op[argc]) == mode[argc]); */
if (!(*insn_data[icode].operand[opno].predicate)
(op[argc], mode[argc]))
@ -3144,6 +3172,13 @@ constant_arg:
else
emit_insn (insn);
if (GET_MODE_CLASS (tmode) == MODE_VECTOR_BOOL)
{
rtx HItarget = gen_reg_rtx (HImode);
emit_move_insn (HItarget, gen_lowpart (HImode, target));
return HItarget;
}
return target;
}

View File

@ -84,7 +84,9 @@ enum arm_type_qualifiers
qualifier_lane_pair_index = 0x1000,
/* Lane indices selected in quadtuplets - must be within range of previous
argument = a vector. */
qualifier_lane_quadtup_index = 0x2000
qualifier_lane_quadtup_index = 0x2000,
/* MVE vector predicates. */
qualifier_predicate = 0x4000
};
struct arm_simd_type_info

View File

@ -84,6 +84,14 @@ VECTOR_MODE (FLOAT, BF, 2); /* V2BF. */
VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */
VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */
/* Predicates for MVE. */
BOOL_MODE (B2I, 2, 1);
BOOL_MODE (B4I, 4, 1);
VECTOR_BOOL_MODE (V16BI, 16, BI, 2);
VECTOR_BOOL_MODE (V8BI, 8, B2I, 2);
VECTOR_BOOL_MODE (V4BI, 4, B4I, 2);
/* Fraction and accumulator vector modes. */
VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */
VECTOR_MODES (UFRACT, 4); /* V4UQQ V2UHQ */

View File

@ -51,3 +51,7 @@
ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20)
ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20)
ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20)
ENTRY (Pred1x16_t, V16BI, predicate, 16, pred1, 16)
ENTRY (Pred2x8_t, V8BI, predicate, 8, pred1, 15)
ENTRY (Pred4x4_t, V4BI, predicate, 4, pred1, 15)

View File

@ -6239,9 +6239,22 @@ init_emit_once (void)
/* For BImode, 1 and -1 are unsigned and signed interpretations
of the same value. */
const_tiny_rtx[0][(int) BImode] = const0_rtx;
const_tiny_rtx[1][(int) BImode] = const_true_rtx;
const_tiny_rtx[3][(int) BImode] = const_true_rtx;
for (mode = MIN_MODE_BOOL;
mode <= MAX_MODE_BOOL;
mode = (machine_mode)((int)(mode) + 1))
{
const_tiny_rtx[0][(int) mode] = const0_rtx;
if (mode == BImode)
{
const_tiny_rtx[1][(int) mode] = const_true_rtx;
const_tiny_rtx[3][(int) mode] = const_true_rtx;
}
else
{
const_tiny_rtx[1][(int) mode] = const1_rtx;
const_tiny_rtx[3][(int) mode] = constm1_rtx;
}
}
for (mode = MIN_MODE_PARTIAL_INT;
mode <= MAX_MODE_PARTIAL_INT;
@ -6260,13 +6273,16 @@ init_emit_once (void)
const_tiny_rtx[0][(int) mode] = gen_rtx_CONCAT (mode, inner, inner);
}
/* As for BImode, "all 1" and "all -1" are unsigned and signed
interpretations of the same value. */
FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_BOOL)
{
const_tiny_rtx[0][(int) mode] = gen_const_vector (mode, 0);
const_tiny_rtx[3][(int) mode] = gen_const_vector (mode, 3);
const_tiny_rtx[1][(int) mode] = const_tiny_rtx[3][(int) mode];
if (GET_MODE_INNER (mode) == BImode)
/* As for BImode, "all 1" and "all -1" are unsigned and signed
interpretations of the same value. */
const_tiny_rtx[1][(int) mode] = const_tiny_rtx[3][(int) mode];
else
const_tiny_rtx[1][(int) mode] = gen_const_vector (mode, 1);
}
FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)

View File

@ -78,6 +78,7 @@ struct mode_data
bool need_bytesize_adj; /* true if this mode needs dynamic size
adjustment */
unsigned int int_n; /* If nonzero, then __int<INT_N> will be defined */
bool boolean;
};
static struct mode_data *modes[MAX_MODE_CLASS];
@ -88,7 +89,8 @@ static const struct mode_data blank_mode = {
0, "<unknown>", MAX_MODE_CLASS,
0, -1U, -1U, -1U, -1U,
0, 0, 0, 0, 0, 0,
"<unknown>", 0, 0, 0, 0, false, false, 0
"<unknown>", 0, 0, 0, 0, false, false, 0,
false
};
static htab_t modes_by_name;
@ -456,7 +458,7 @@ make_complex_modes (enum mode_class cl,
size_t m_len;
/* Skip BImode. FIXME: BImode probably shouldn't be MODE_INT. */
if (m->precision == 1)
if (m->boolean)
continue;
m_len = strlen (m->name);
@ -528,7 +530,7 @@ make_vector_modes (enum mode_class cl, const char *prefix, unsigned int width,
not be necessary. */
if (cl == MODE_FLOAT && m->bytesize == 1)
continue;
if (cl == MODE_INT && m->precision == 1)
if (m->boolean)
continue;
if ((size_t) snprintf (buf, sizeof buf, "%s%u%s", prefix,
@ -548,17 +550,18 @@ make_vector_modes (enum mode_class cl, const char *prefix, unsigned int width,
/* Create a vector of booleans called NAME with COUNT elements and
BYTESIZE bytes in total. */
#define VECTOR_BOOL_MODE(NAME, COUNT, BYTESIZE) \
make_vector_bool_mode (#NAME, COUNT, BYTESIZE, __FILE__, __LINE__)
#define VECTOR_BOOL_MODE(NAME, COUNT, COMPONENT, BYTESIZE) \
make_vector_bool_mode (#NAME, COUNT, #COMPONENT, BYTESIZE, \
__FILE__, __LINE__)
static void ATTRIBUTE_UNUSED
make_vector_bool_mode (const char *name, unsigned int count,
unsigned int bytesize, const char *file,
unsigned int line)
const char *component, unsigned int bytesize,
const char *file, unsigned int line)
{
struct mode_data *m = find_mode ("BI");
struct mode_data *m = find_mode (component);
if (!m)
{
error ("%s:%d: no mode \"BI\"", file, line);
error ("%s:%d: no mode \"%s\"", file, line, component);
return;
}
@ -596,6 +599,20 @@ make_int_mode (const char *name,
m->precision = precision;
}
#define BOOL_MODE(N, B, Y) \
make_bool_mode (#N, B, Y, __FILE__, __LINE__)
static void
make_bool_mode (const char *name,
unsigned int precision, unsigned int bytesize,
const char *file, unsigned int line)
{
struct mode_data *m = new_mode (MODE_INT, name, file, line);
m->bytesize = bytesize;
m->precision = precision;
m->boolean = true;
}
#define OPAQUE_MODE(N, B) \
make_opaque_mode (#N, -1U, B, __FILE__, __LINE__)
@ -1298,9 +1315,21 @@ enum machine_mode\n{");
/* Don't use BImode for MIN_MODE_INT, since otherwise the middle
end will try to use it for bitfields in structures and the
like, which we do not want. Only the target md file should
generate BImode widgets. */
if (first && first->precision == 1 && c == MODE_INT)
first = first->next;
generate BImode widgets. Since some targets such as ARM/MVE
define boolean modes with multiple bits, handle those too. */
if (first && first->boolean)
{
struct mode_data *last_bool = first;
printf (" MIN_MODE_BOOL = E_%smode,\n", first->name);
while (first && first->boolean)
{
last_bool = first;
first = first->next;
}
printf (" MAX_MODE_BOOL = E_%smode,\n\n", last_bool->name);
}
if (first && last)
printf (" MIN_%s = E_%smode,\n MAX_%s = E_%smode,\n\n",
@ -1679,15 +1708,15 @@ emit_class_narrowest_mode (void)
print_decl ("unsigned char", "class_narrowest_mode", "MAX_MODE_CLASS");
for (c = 0; c < MAX_MODE_CLASS; c++)
/* Bleah, all this to get the comment right for MIN_MODE_INT. */
tagged_printf ("MIN_%s", mode_class_names[c],
modes[c]
? ((c != MODE_INT || modes[c]->precision != 1)
? modes[c]->name
: (modes[c]->next
? modes[c]->next->name
: void_mode->name))
: void_mode->name);
{
/* Bleah, all this to get the comment right for MIN_MODE_INT. */
struct mode_data *m = modes[c];
while (m && m->boolean)
m = m->next;
const char *comment_name = (m ? m : void_mode)->name;
tagged_printf ("MIN_%s", mode_class_names[c], comment_name);
}
print_closer ();
}

View File

@ -146,12 +146,13 @@ along with GCC; see the file COPYING3. If not see
Like VECTOR_MODES, but start the mode names with PREFIX instead
of the usual "V".
VECTOR_BOOL_MODE (NAME, COUNT, BYTESIZE)
VECTOR_BOOL_MODE (NAME, COUNT, COMPONENT, BYTESIZE)
Create a vector mode called NAME that contains COUNT boolean
elements and occupies BYTESIZE bytes in total. Each boolean
element occupies (COUNT * BITS_PER_UNIT) / BYTESIZE bits, with
the element at index 0 occupying the lsb of the first byte in
memory. Only the lowest bit of each element is significant.
element is of COMPONENT type and occupies (COUNT * BITS_PER_UNIT) /
BYTESIZE bits, with the element at index 0 occupying the lsb of the
first byte in memory. Only the lowest bit of each element is
significant.
OPAQUE_MODE (NAME, BYTESIZE)
Create an opaque mode called NAME that is BYTESIZE bytes wide.
@ -196,7 +197,7 @@ RANDOM_MODE (VOID);
RANDOM_MODE (BLK);
/* Single bit mode used for booleans. */
FRACTIONAL_INT_MODE (BI, 1, 1);
BOOL_MODE (BI, 1, 1);
/* Basic integer modes. We go up to TI in generic code (128 bits).
TImode is needed here because the some front ends now genericly

View File

@ -90,8 +90,10 @@ rtx_vector_builder::find_cached_value ()
if (GET_MODE_CLASS (m_mode) == MODE_VECTOR_BOOL)
{
if (elt == const1_rtx || elt == constm1_rtx)
if (elt == const1_rtx)
return CONST1_RTX (m_mode);
else if (elt == constm1_rtx)
return CONSTM1_RTX (m_mode);
else if (elt == const0_rtx)
return CONST0_RTX (m_mode);
else

View File

@ -6876,12 +6876,13 @@ native_encode_rtx (machine_mode mode, rtx x, vec<target_unit> &bytes,
/* This is the only case in which elements can be smaller than
a byte. */
gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
auto mask = GET_MODE_MASK (GET_MODE_INNER (mode));
for (unsigned int i = 0; i < num_bytes; ++i)
{
target_unit value = 0;
for (unsigned int j = 0; j < BITS_PER_UNIT; j += elt_bits)
{
value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & 1) << j;
value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & mask) << j;
elt += 1;
}
bytes.quick_push (value);
@ -7025,9 +7026,8 @@ native_decode_vector_rtx (machine_mode mode, const vec<target_unit> &bytes,
unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits;
unsigned int byte_index = bit_index / BITS_PER_UNIT;
unsigned int lsb = bit_index % BITS_PER_UNIT;
builder.quick_push (bytes[byte_index] & (1 << lsb)
? CONST1_RTX (BImode)
: CONST0_RTX (BImode));
unsigned int value = bytes[byte_index] >> lsb;
builder.quick_push (gen_int_mode (value, GET_MODE_INNER (mode)));
}
}
else
@ -7994,17 +7994,23 @@ test_vector_ops_duplicate (machine_mode mode, rtx scalar_reg)
duplicate, last_par));
/* Test a scalar subreg of a VEC_MERGE of a VEC_DUPLICATE. */
rtx vector_reg = make_test_reg (mode);
for (unsigned HOST_WIDE_INT i = 0; i < const_nunits; i++)
/* Skip this test for vectors of booleans, because offset is in bytes,
while vec_merge indices are in elements (usually bits). */
if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
{
if (i >= HOST_BITS_PER_WIDE_INT)
break;
rtx mask = GEN_INT ((HOST_WIDE_INT_1U << i) | (i + 1));
rtx vm = gen_rtx_VEC_MERGE (mode, duplicate, vector_reg, mask);
poly_uint64 offset = i * GET_MODE_SIZE (inner_mode);
ASSERT_RTX_EQ (scalar_reg,
simplify_gen_subreg (inner_mode, vm,
mode, offset));
rtx vector_reg = make_test_reg (mode);
for (unsigned HOST_WIDE_INT i = 0; i < const_nunits; i++)
{
if (i >= HOST_BITS_PER_WIDE_INT)
break;
rtx mask = GEN_INT ((HOST_WIDE_INT_1U << i) | (i + 1));
rtx vm = gen_rtx_VEC_MERGE (mode, duplicate, vector_reg, mask);
poly_uint64 offset = i * GET_MODE_SIZE (inner_mode);
ASSERT_RTX_EQ (scalar_reg,
simplify_gen_subreg (inner_mode, vm,
mode, offset));
}
}
}

View File

@ -4085,6 +4085,7 @@ output_constant_pool_2 (fixed_size_mode mode, rtx x, unsigned int align)
unsigned int elt_bits = GET_MODE_BITSIZE (mode) / nelts;
unsigned int int_bits = MAX (elt_bits, BITS_PER_UNIT);
scalar_int_mode int_mode = int_mode_for_size (int_bits, 0).require ();
unsigned int mask = GET_MODE_MASK (GET_MODE_INNER (mode));
/* Build the constant up one integer at a time. */
unsigned int elts_per_int = int_bits / elt_bits;
@ -4093,8 +4094,10 @@ output_constant_pool_2 (fixed_size_mode mode, rtx x, unsigned int align)
unsigned HOST_WIDE_INT value = 0;
unsigned int limit = MIN (nelts - i, elts_per_int);
for (unsigned int j = 0; j < limit; ++j)
if (INTVAL (CONST_VECTOR_ELT (x, i + j)) != 0)
value |= 1 << (j * elt_bits);
{
auto elt = INTVAL (CONST_VECTOR_ELT (x, i + j));
value |= (elt & mask) << (j * elt_bits);
}
output_constant_pool_2 (int_mode, gen_int_mode (value, int_mode),
i != 0 ? MIN (align, int_bits) : align);
}