[AArch64] Use SVE MLA, MLS, MAD and MSB for conditional arithmetic

This patch uses predicated MLA, MLS, MAD and MSB to implement
conditional "FMA"s on integers.  This also requires providing
the unpredicated optabs (fma and fnma) since otherwise
tree-ssa-math-opts.c won't try to use the conditional forms.

We still want to use shifts and adds in preference to multiplications,
so the patch makes the optab expanders check for that.

The tests cover floating-point types too, which are already handled,
and which were already tested to some extent by gcc.dg/vect.

2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
	(aarch64_prepare_sve_cond_int_fma): Declare.
	* config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
	(aarch64_prepare_sve_int_fma): New functions.
	(aarch64_prepare_sve_cond_int_fma): Likewise.
	* config/aarch64/aarch64-sve.md
	(cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
	(fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
	(*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
	(cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
	(*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
	(*madd<mode>): Rename to...
	(*fma<mode>4): ...this.
	(*msub<mode>3): Rename to...
	(*fnma<mode>3): ...this.

gcc/testsuite/
	* gcc.target/aarch64/sve/cond_mla_1.c: New test.
	* gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.

Co-Authored-By: Kugan Vivekanandarajah <kuganv@linaro.org>

From-SVN: r274509
This commit is contained in:
Richard Sandiford 2019-08-15 08:22:07 +00:00 committed by Richard Sandiford
parent a19ba9e1b1
commit b6c3aea189
21 changed files with 1082 additions and 3 deletions

View File

@ -1,3 +1,22 @@
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
* config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
(aarch64_prepare_sve_cond_int_fma): Declare.
* config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
(aarch64_prepare_sve_int_fma): New functions.
(aarch64_prepare_sve_cond_int_fma): Likewise.
* config/aarch64/aarch64-sve.md
(cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
(fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
(*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
(cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
(*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
(*madd<mode>): Rename to...
(*fma<mode>4): ...this.
(*msub<mode>3): Rename to...
(*fnma<mode>3): ...this.
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>

View File

@ -630,6 +630,9 @@ bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *);
bool aarch64_prepare_sve_int_fma (rtx *, rtx_code);
bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code);
#endif /* RTX_CODE */
void aarch64_init_builtins (void);

View File

@ -1844,7 +1844,7 @@
)
;; Predicated integer operations with merging.
(define_expand "cond_<optab><mode>"
(define_expand "@cond_<optab><mode>"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand")
@ -3384,8 +3384,26 @@
;; - MLA
;; -------------------------------------------------------------------------
;; Unpredicated integer addition of product.
;;
;; Expander for the fma<mode>4 pattern on SVE_I modes:
;;   operand 0 = operand 1 * operand 2 + operand 3.
;; The preparation code first gives aarch64_prepare_sve_int_fma the chance
;; to emit the whole operation itself; that helper returns true after
;; replacing the multiplication with a shift followed by an add or subtract
;; when operand 2 is a suitable constant vector.  Otherwise the helper has
;; forced operand 2 into a register and the RTL below is emitted, with
;; operand 4 (the predicate inside UNSPEC_PRED_X) supplied by
;; aarch64_ptrue_reg.
(define_expand "fma<mode>4"
[(set (match_operand:SVE_I 0 "register_operand")
(plus:SVE_I
(unspec:SVE_I
[(match_dup 4)
(mult:SVE_I (match_operand:SVE_I 1 "register_operand")
(match_operand:SVE_I 2 "nonmemory_operand"))]
UNSPEC_PRED_X)
(match_operand:SVE_I 3 "register_operand")))]
"TARGET_SVE"
{
if (aarch64_prepare_sve_int_fma (operands, PLUS))
DONE;
operands[4] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Predicated integer addition of product.
(define_insn "*madd<mode>"
(define_insn "*fma<mode>4"
[(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
(plus:SVE_I
(unspec:SVE_I
@ -3402,6 +3420,97 @@
[(set_attr "movprfx" "*,*,yes")]
)
;; Predicated integer addition of product with merging.
;;
;; Expander for cond_fma<mode> on SVE_I modes.  Operand 1 is the <VPRED>
;; predicate, operands 2 and 3 are multiplied, operand 4 is added to the
;; product and operand 5 is the fallback value of the UNSPEC_SEL.
;; aarch64_prepare_sve_cond_int_fma may emit the whole operation itself
;; (shift plus conditional add/subtract) when operand 3 is a suitable
;; constant vector; otherwise it forces operand 3 into a register.
;; The final swap puts a fallback that equals a multiplication input into
;; operand 2, which is the position *cond_fma<mode>_2 matches with
;; (match_dup 2).
(define_expand "cond_fma<mode>"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand")
(plus:SVE_I
(mult:SVE_I (match_operand:SVE_I 2 "register_operand")
(match_operand:SVE_I 3 "general_operand"))
(match_operand:SVE_I 4 "register_operand"))
(match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE"
{
if (aarch64_prepare_sve_cond_int_fma (operands, PLUS))
DONE;
/* Swap the multiplication operands if the fallback value is the
second of the two. */
if (rtx_equal_p (operands[3], operands[5]))
std::swap (operands[2], operands[3]);
}
)
;; Predicated integer addition of product, merging with the first input.
;;
;; The fallback of the UNSPEC_SEL is (match_dup 2), i.e. the first
;; multiplication operand.
;; Alternative 1 ties operand 2 to the destination and is a single MAD.
;; Alternative 2 accepts operand 2 in any register and first copies it
;; into the destination with an unpredicated MOVPRFX; the destination is
;; earlyclobber ("&") because it is written before the other inputs are
;; read, and the alternative is slightly disparaged ("?").
(define_insn "*cond_fma<mode>_2"
[(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
(plus:SVE_I
(mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
(match_operand:SVE_I 3 "register_operand" "w, w"))
(match_operand:SVE_I 4 "register_operand" "w, w"))
(match_dup 2)]
UNSPEC_SEL))]
"TARGET_SVE"
"@
mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
movprfx\t%0, %2\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
[(set_attr "movprfx" "*,yes")]
)
;; Predicated integer addition of product, merging with the third input.
;;
;; The fallback of the UNSPEC_SEL is (match_dup 4), i.e. the addend.
;; Alternative 1 ties operand 4 to the destination and is a single MLA.
;; Alternative 2 accepts operand 4 in any register and first copies it
;; into the destination with an unpredicated MOVPRFX; the destination is
;; earlyclobber ("&") because it is written before the other inputs are
;; read, and the alternative is slightly disparaged ("?").
(define_insn "*cond_fma<mode>_4"
[(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
(plus:SVE_I
(mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
(match_operand:SVE_I 3 "register_operand" "w, w"))
(match_operand:SVE_I 4 "register_operand" "0, w"))
(match_dup 4)]
UNSPEC_SEL))]
"TARGET_SVE"
"@
mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
movprfx\t%0.<Vetype>, %4\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
[(set_attr "movprfx" "*,yes")]
)
;; Predicated integer addition of product, merging with an independent value.
;;
;; The insn condition restricts this pattern to fallbacks (operand 5) that
;; differ from operands 2, 3 and 4.  The six alternatives are:
;;   1-4: operand 5 uses constraint "Dz" and the sequence starts with a
;;        zeroing (/z) MOVPRFX.  Alternative 1 takes the MOVPRFX source from
;;        operand 4; alternatives 2-4 tie operand 4, 2 or 3 respectively to
;;        the destination, using MLA when the addend is tied and MAD when a
;;        multiplication input is tied.
;;   5:   operand 5 is tied to the destination; a merging (/m) MOVPRFX
;;        copies operand 4 before the MLA.
;;   6:   no direct assembly ("#").  Once reload has completed, and provided
;;        operand 5 is a register other than operand 0, the rewrite code
;;        emits a vcond_mask that writes operand 0 from operands 4 and 5
;;        under predicate operand 1, then replaces operands 4 and 5 with
;;        operand 0.
(define_insn_and_rewrite "*cond_fma<mode>_any"
[(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
(plus:SVE_I
(mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
(match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w"))
(match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w"))
(match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
UNSPEC_SEL))]
"TARGET_SVE
&& !rtx_equal_p (operands[2], operands[5])
&& !rtx_equal_p (operands[3], operands[5])
&& !rtx_equal_p (operands[4], operands[5])"
"@
movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
#"
"&& reload_completed
&& register_operand (operands[5], <MODE>mode)
&& !rtx_equal_p (operands[0], operands[5])"
{
emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
operands[5], operands[1]));
operands[5] = operands[4] = operands[0];
}
[(set_attr "movprfx" "yes")]
)
;; -------------------------------------------------------------------------
;; ---- [INT] MLS and MSB
;; -------------------------------------------------------------------------
@ -3410,8 +3519,26 @@
;; - MSB
;; -------------------------------------------------------------------------
;; Unpredicated integer subtraction of product.
;;
;; Expander for the fnma<mode>4 pattern on SVE_I modes:
;;   operand 0 = operand 3 - operand 1 * operand 2.
;; The preparation code first gives aarch64_prepare_sve_int_fma the chance
;; to emit the whole operation itself; that helper returns true after
;; replacing the multiplication with a shift followed by an add or subtract
;; when operand 2 is a suitable constant vector.  Otherwise the helper has
;; forced operand 2 into a register and the RTL below is emitted, with
;; operand 4 (the predicate inside UNSPEC_PRED_X) supplied by
;; aarch64_ptrue_reg.
(define_expand "fnma<mode>4"
[(set (match_operand:SVE_I 0 "register_operand")
(minus:SVE_I
(match_operand:SVE_I 3 "register_operand")
(unspec:SVE_I
[(match_dup 4)
(mult:SVE_I (match_operand:SVE_I 1 "register_operand")
(match_operand:SVE_I 2 "general_operand"))]
UNSPEC_PRED_X)))]
"TARGET_SVE"
{
if (aarch64_prepare_sve_int_fma (operands, MINUS))
DONE;
operands[4] = aarch64_ptrue_reg (<VPRED>mode);
}
)
;; Predicated integer subtraction of product.
(define_insn "*msub<mode>3"
(define_insn "*fnma<mode>3"
[(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
(minus:SVE_I
(match_operand:SVE_I 4 "register_operand" "w, 0, w")
@ -3428,6 +3555,98 @@
[(set_attr "movprfx" "*,*,yes")]
)
;; Predicated integer subtraction of product with merging.
;;
;; Expander for cond_fnma<mode> on SVE_I modes.  Operand 1 is the <VPRED>
;; predicate, operands 2 and 3 are multiplied, the product is subtracted
;; from operand 4 and operand 5 is the fallback value of the UNSPEC_SEL.
;; aarch64_prepare_sve_cond_int_fma may emit the whole operation itself
;; (shift plus conditional add/subtract) when operand 3 is a suitable
;; constant vector; otherwise it forces operand 3 into a register.
;; The final swap puts a fallback that equals a multiplication input into
;; operand 2, which is the position *cond_fnma<mode>_2 matches with
;; (match_dup 2).
(define_expand "cond_fnma<mode>"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand")
(minus:SVE_I
(match_operand:SVE_I 4 "register_operand")
(mult:SVE_I (match_operand:SVE_I 2 "register_operand")
(match_operand:SVE_I 3 "general_operand")))
(match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE"
{
if (aarch64_prepare_sve_cond_int_fma (operands, MINUS))
DONE;
/* Swap the multiplication operands if the fallback value is the
second of the two. */
if (rtx_equal_p (operands[3], operands[5]))
std::swap (operands[2], operands[3]);
}
)
;; Predicated integer subtraction of product, merging with the first input.
;;
;; The fallback of the UNSPEC_SEL is (match_dup 2), i.e. the first
;; multiplication operand.
;; Alternative 1 ties operand 2 to the destination and is a single MSB.
;; Alternative 2 accepts operand 2 in any register and first copies it
;; into the destination with an unpredicated MOVPRFX; the destination is
;; earlyclobber ("&") because it is written before the other inputs are
;; read, and the alternative is slightly disparaged ("?").
(define_insn "*cond_fnma<mode>_2"
[(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
(minus:SVE_I
(match_operand:SVE_I 4 "register_operand" "w, w")
(mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
(match_operand:SVE_I 3 "register_operand" "w, w")))
(match_dup 2)]
UNSPEC_SEL))]
"TARGET_SVE"
"@
msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
movprfx\t%0, %2\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
[(set_attr "movprfx" "*,yes")]
)
;; Predicated integer subtraction of product, merging with the third input.
;;
;; The fallback of the UNSPEC_SEL is (match_dup 4), i.e. the minuend.
;; Alternative 1 ties operand 4 to the destination and is a single MLS.
;; Alternative 2 accepts operand 4 in any register and first copies it
;; into the destination with an unpredicated MOVPRFX; the destination is
;; earlyclobber ("&") because it is written before the other inputs are
;; read, and the alternative is slightly disparaged ("?").
(define_insn "*cond_fnma<mode>_4"
[(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
(minus:SVE_I
(match_operand:SVE_I 4 "register_operand" "0, w")
(mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
(match_operand:SVE_I 3 "register_operand" "w, w")))
(match_dup 4)]
UNSPEC_SEL))]
"TARGET_SVE"
"@
mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
movprfx\t%0, %4\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
[(set_attr "movprfx" "*,yes")]
)
;; Predicated integer subtraction of product, merging with an
;; independent value.
;;
;; The insn condition restricts this pattern to fallbacks (operand 5) that
;; differ from operands 2, 3 and 4.  The six alternatives are:
;;   1-4: operand 5 uses constraint "Dz" and the sequence starts with a
;;        zeroing (/z) MOVPRFX.  Alternative 1 takes the MOVPRFX source from
;;        operand 4; alternatives 2-4 tie operand 4, 2 or 3 respectively to
;;        the destination, using MLS when the minuend is tied and MSB when a
;;        multiplication input is tied.
;;   5:   operand 5 is tied to the destination; a merging (/m) MOVPRFX
;;        copies operand 4 before the MLS.
;;   6:   no direct assembly ("#").  Once reload has completed, and provided
;;        operand 5 is a register other than operand 0, the rewrite code
;;        emits a vcond_mask that writes operand 0 from operands 4 and 5
;;        under predicate operand 1, then replaces operands 4 and 5 with
;;        operand 0.
(define_insn_and_rewrite "*cond_fnma<mode>_any"
[(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
(minus:SVE_I
(match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w")
(mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
(match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w")))
(match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
UNSPEC_SEL))]
"TARGET_SVE
&& !rtx_equal_p (operands[2], operands[5])
&& !rtx_equal_p (operands[3], operands[5])
&& !rtx_equal_p (operands[4], operands[5])"
"@
movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
#"
"&& reload_completed
&& register_operand (operands[5], <MODE>mode)
&& !rtx_equal_p (operands[0], operands[5])"
{
emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
operands[5], operands[1]));
operands[5] = operands[4] = operands[0];
}
[(set_attr "movprfx" "yes")]
)
;; -------------------------------------------------------------------------
;; ---- [INT] Dot product
;; -------------------------------------------------------------------------

View File

@ -16469,6 +16469,98 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
}
/* Try to express a multiplication by VALUE as a left shift.

   VALUE qualifies if it is a CONST_VECTOR whose encoded elements are all
   scalar integers equal to a power of 2 or to a negated power of 2, and
   if every element whose sign matters agrees on that sign.  On success,
   return a constant vector containing the log2 of each element's
   magnitude.  CODE is then updated as follows: it is swapped between
   PLUS and MINUS if the elements are negated powers of 2, and it is
   canonicalized to PLUS if no element cares about the sign.

   Return NULL_RTX, leaving CODE untouched, if VALUE does not qualify.  */
static rtx
aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
{
  if (GET_CODE (value) != CONST_VECTOR)
    return NULL_RTX;

  rtx_vector_builder shift_builder;
  if (!shift_builder.new_unary_operation (GET_MODE (value), value, false))
    return NULL_RTX;

  scalar_mode elt_mode = GET_MODE_INNER (GET_MODE (value));

  /* Tri-state sign decision: -1 while no element has forced a choice,
     0 once the product must be used as-is, 1 once it must be negated.  */
  int negate_product = -1;

  unsigned int num_encoded = const_vector_encoded_nelts (value);
  for (unsigned int idx = 0; idx < num_encoded; ++idx)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (value, idx);
      if (!CONST_SCALAR_INT_P (elt))
        return NULL_RTX;

      rtx_mode_t elt_val (elt, elt_mode);

      /* Start from the negated element; this is also the right candidate
         when the element equals its own negation.  */
      wide_int candidate = wi::neg (elt_val);
      if (elt_val != candidate)
        {
          /* The sign of this element is significant.  Give up if an
             earlier element already committed us to the opposite sign;
             otherwise record the decision.  */
          int elt_negative = wi::neg_p (elt_val);
          if (negate_product == !elt_negative)
            return NULL_RTX;
          negate_product = elt_negative;
          if (!elt_negative)
            candidate = elt_val;
        }

      /* CANDIDATE must now be an exact power of 2.  */
      int shift_amount = wi::exact_log2 (candidate);
      if (shift_amount < 0)
        return NULL_RTX;
      shift_builder.quick_push (gen_int_mode (shift_amount, elt_mode));
    }

  switch (negate_product)
    {
    case -1:
      /* Adding and subtracting give the same result; settle on PLUS.  */
      code = PLUS;
      break;

    case 1:
      if (code == PLUS)
        code = MINUS;
      else
        code = PLUS;
      break;

    default:
      break;
    }

  return shift_builder.build ();
}
/* Set up an integer SVE multiply-add (CODE == PLUS) or multiply-subtract
   (CODE == MINUS).  OPERANDS is laid out as for fma_optab: OPERANDS[0] is
   the destination, OPERANDS[1] and OPERANDS[2] are multiplied, and the
   product is combined with OPERANDS[3].

   If OPERANDS[2] can be converted to a vector of shift amounts, emit a
   vector shift of OPERANDS[1] followed by an addition to or subtraction
   from OPERANDS[3], storing to OPERANDS[0], and return true: nothing is
   left for the caller to do.

   Otherwise force OPERANDS[2] into a register and return false, so that
   the caller generates its normal pattern from the updated OPERANDS.  */
bool
aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
{
  machine_mode mode = GET_MODE (operands[0]);

  /* Note that a successful conversion may change CODE.  */
  rtx shifts = aarch64_convert_mult_to_shift (operands[2], code);
  if (!shifts)
    {
      operands[2] = force_reg (mode, operands[2]);
      return false;
    }

  rtx shifted = expand_binop (mode, vashl_optab, operands[1], shifts,
                              NULL_RTX, true, OPTAB_DIRECT);
  if (code == PLUS)
    force_expand_binop (mode, add_optab, operands[3], shifted,
                        operands[0], true, OPTAB_DIRECT);
  else
    force_expand_binop (mode, sub_optab, operands[3], shifted,
                        operands[0], true, OPTAB_DIRECT);
  return true;
}
/* Conditional counterpart of aarch64_prepare_sve_int_fma.  CODE is PLUS
   for a multiply-add and MINUS for a multiply-subtract.  The elements of
   OPERANDS used here are: [0] the destination, [1] the value passed to
   gen_cond as its first input (the predicate in the cond_fma/cond_fnma
   expanders), [2] and [3] the multiplication inputs, [4] the value the
   product is combined with and [5] the final gen_cond input (the
   fallback value in those expanders).

   If OPERANDS[3] can be converted to a vector of shift amounts, emit a
   vector shift of OPERANDS[2] followed by the conditional operation
   built by gen_cond, and return true.  Otherwise force OPERANDS[3] into
   a register and return false so that the caller generates its normal
   pattern from the updated OPERANDS.  */
bool
aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
{
  machine_mode mode = GET_MODE (operands[0]);

  /* Note that a successful conversion may change CODE.  */
  rtx shifts = aarch64_convert_mult_to_shift (operands[3], code);
  if (!shifts)
    {
      operands[3] = force_reg (mode, operands[3]);
      return false;
    }

  rtx shifted = expand_binop (mode, vashl_optab, operands[2], shifts,
                              NULL_RTX, true, OPTAB_DIRECT);
  rtx cond_op = gen_cond (code, mode, operands[0], operands[1],
                          operands[4], shifted, operands[5]);
  emit_insn (cond_op);
  return true;
}
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{

View File

@ -1,3 +1,23 @@
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
* gcc.target/aarch64/sve/cond_mla_1.c: New test.
* gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>

View File

@ -0,0 +1,52 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Conditional multiply-add and multiply-subtract in which the "else"
   value is b[i], one of the two multiplication inputs.  The scans at the
   end of the file expect one MAD, MSB, FMAD and FMSB per element size
   and no vector MOV, MOVPRFX or SEL.  */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>, which stores a[i] OP b[i] * c to r[i] when
   pred[i] != 1 and b[i] otherwise.  */
#define DEF_LOOP(TYPE, NAME, OP) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, TYPE c, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i]; \
}
/* Apply T to the "add" and "sub" variants of TYPE.  */
#define TEST_TYPE(T, TYPE) \
T (TYPE, add, +) \
T (TYPE, sub, -)
/* Apply T to every tested integer and floating-point type.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t) \
TEST_TYPE (T, uint16_t) \
TEST_TYPE (T, uint32_t) \
TEST_TYPE (T, uint64_t) \
TEST_TYPE (T, _Float16) \
TEST_TYPE (T, float) \
TEST_TYPE (T, double)
TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View File

@ -0,0 +1,35 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Execution test for the loops defined in cond_mla_1.c.  */
#include "cond_mla_1.c"
/* Scalar multiplier passed as argument "c" of each loop.  */
#define FACTOR 17
/* Number of elements in each array.  */
#define N 99
/* Fill the inputs, call test_<TYPE>_<NAME> and abort if any element of
   the result differs from the same expression evaluated element by
   element here.  The empty asm statements with a "memory" clobber are
   presumably there to stop the compiler from optimizing the setup and
   checking loops.  */
#define TEST_LOOP(TYPE, NAME, OP) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3 < i % 5; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected \
= pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i]; \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}

View File

@ -0,0 +1,53 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Conditional multiply-add and multiply-subtract in which the "else"
   value is c, the scalar multiplication input.  The scans at the end of
   the file expect one MAD, MSB, FMAD and FMSB per element size, 14
   unpredicated MOVPRFXs, and no vector MOV or SEL.  */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>, which stores a[i] OP b[i] * c to r[i] when
   pred[i] != 1 and c otherwise.  */
#define DEF_LOOP(TYPE, NAME, OP) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, TYPE c, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c; \
}
/* Apply T to the "add" and "sub" variants of TYPE.  */
#define TEST_TYPE(T, TYPE) \
T (TYPE, add, +) \
T (TYPE, sub, -)
/* Apply T to every tested integer and floating-point type.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t) \
TEST_TYPE (T, uint16_t) \
TEST_TYPE (T, uint32_t) \
TEST_TYPE (T, uint64_t) \
TEST_TYPE (T, _Float16) \
TEST_TYPE (T, float) \
TEST_TYPE (T, double)
TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View File

@ -0,0 +1,36 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Execution test for the loops defined in cond_mla_2.c.  */
#include "cond_mla_2.c"
/* Scalar multiplier passed as argument "c" of each loop.  */
#define FACTOR 17
/* Number of elements in each array.  */
#define N 99
/* Fill the inputs, call test_<TYPE>_<NAME> and abort if any element of
   the result differs from the same expression evaluated element by
   element here.  The empty asm statements with a "memory" clobber are
   presumably there to stop the compiler from optimizing the setup and
   checking loops.  */
#define TEST_LOOP(TYPE, NAME, OP) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3 < i % 5; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected = (pred[i] != 1 \
? a[i] OP b[i] * (TYPE) FACTOR \
: (TYPE) FACTOR); \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}

View File

@ -0,0 +1,52 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Conditional multiply-add and multiply-subtract in which the "else"
   value is a[i], the value the product is added to or subtracted from.
   The scans at the end of the file expect one MLA, MLS, FMLA and FMLS
   per element size and no vector MOV, MOVPRFX or SEL.  */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>, which stores a[i] OP b[i] * c to r[i] when
   pred[i] != 1 and a[i] otherwise.  */
#define DEF_LOOP(TYPE, NAME, OP) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, TYPE c, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i]; \
}
/* Apply T to the "add" and "sub" variants of TYPE.  */
#define TEST_TYPE(T, TYPE) \
T (TYPE, add, +) \
T (TYPE, sub, -)
/* Apply T to every tested integer and floating-point type.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t) \
TEST_TYPE (T, uint16_t) \
TEST_TYPE (T, uint32_t) \
TEST_TYPE (T, uint64_t) \
TEST_TYPE (T, _Float16) \
TEST_TYPE (T, float) \
TEST_TYPE (T, double)
TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View File

@ -0,0 +1,35 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Execution test for the loops defined in cond_mla_3.c.  */
#include "cond_mla_3.c"
/* Scalar multiplier passed as argument "c" of each loop.  */
#define FACTOR 17
/* Number of elements in each array.  */
#define N 99
/* Fill the inputs, call test_<TYPE>_<NAME> and abort if any element of
   the result differs from the same expression evaluated element by
   element here.  The empty asm statements with a "memory" clobber are
   presumably there to stop the compiler from optimizing the setup and
   checking loops.  */
#define TEST_LOOP(TYPE, NAME, OP) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3 < i % 5; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected \
= pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i]; \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}

View File

@ -0,0 +1,56 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Conditional multiply-add and multiply-subtract in which the "else"
   value is pred[i], which is not one of the arithmetic inputs.  The
   scans at the end of the file expect one MLA, MLS, FMLA and FMLS per
   element size, each size's merging (/m) MOVPRFXs, and no vector MOV
   or SEL.  */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>, which stores a[i] OP b[i] * c to r[i] when
   pred[i] == 1 and pred[i] otherwise.  */
#define DEF_LOOP(TYPE, NAME, OP) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, TYPE c, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i]; \
}
/* Apply T to the "add" and "sub" variants of TYPE.  */
#define TEST_TYPE(T, TYPE) \
T (TYPE, add, +) \
T (TYPE, sub, -)
/* Apply T to every tested integer and floating-point type.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t) \
TEST_TYPE (T, uint16_t) \
TEST_TYPE (T, uint32_t) \
TEST_TYPE (T, uint64_t) \
TEST_TYPE (T, _Float16) \
TEST_TYPE (T, float) \
TEST_TYPE (T, double)
TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View File

@ -0,0 +1,36 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Execution test for the loops defined in cond_mla_4.c.  */
#include "cond_mla_4.c"
/* Scalar multiplier passed as argument "c" of each loop.  */
#define FACTOR 17
/* Number of elements in each array.  */
#define N 99
/* Fill the inputs (pred cycles through 0, 1 and 2), call
   test_<TYPE>_<NAME> and abort if any element of the result differs
   from the same expression evaluated element by element here.  The
   empty asm statements with a "memory" clobber are presumably there to
   stop the compiler from optimizing the setup and checking loops.  */
#define TEST_LOOP(TYPE, NAME, OP) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected = (pred[i] == 1 \
? a[i] OP b[i] * (TYPE) FACTOR \
: pred[i]); \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}

View File

@ -0,0 +1,56 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Conditional multiply-add and multiply-subtract in which the "else"
   value is zero.  The scans at the end of the file accept either
   MLA or MAD (MLS or MSB) for the integer loops, expect FMLA and FMLS
   for the floating-point loops, expect each size's zeroing (/z)
   MOVPRFXs, and expect no vector MOV or SEL.  */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>, which stores a[i] OP b[i] * c to r[i] when
   pred[i] is nonzero and 0 otherwise.  */
#define DEF_LOOP(TYPE, NAME, OP) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, TYPE c, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] ? a[i] OP b[i] * c : 0; \
}
/* Apply T to the "add" and "sub" variants of TYPE.  */
#define TEST_TYPE(T, TYPE) \
T (TYPE, add, +) \
T (TYPE, sub, -)
/* Apply T to every tested integer and floating-point type.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t) \
TEST_TYPE (T, uint16_t) \
TEST_TYPE (T, uint32_t) \
TEST_TYPE (T, uint64_t) \
TEST_TYPE (T, _Float16) \
TEST_TYPE (T, float) \
TEST_TYPE (T, double)
TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View File

@ -0,0 +1,35 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Execution test for the loops defined in cond_mla_5.c.  */
#include "cond_mla_5.c"
/* Scalar multiplier passed as argument "c" of each loop.  */
#define FACTOR 17
/* Number of elements in each array.  */
#define N 99
/* Fill the inputs, call test_<TYPE>_<NAME> and abort if any element of
   the result differs from the same expression evaluated element by
   element here.  The empty asm statements with a "memory" clobber are
   presumably there to stop the compiler from optimizing the setup and
   checking loops.  */
#define TEST_LOOP(TYPE, NAME, OP) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3 < i % 5; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected \
= pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0; \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}

View File

@ -0,0 +1,53 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Conditional multiply-add and multiply-subtract in which the "else"
   value is the nonzero constant 5.  The scans at the end of the file
   expect one MLA, MLS, FMLA and FMLS per element size, 14 SELs, and no
   vector MOV or MOVPRFX.  */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>, which stores a[i] OP b[i] * c to r[i] when
   pred[i] is nonzero and 5 otherwise.  */
#define DEF_LOOP(TYPE, NAME, OP) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, TYPE c, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] ? a[i] OP b[i] * c : 5; \
}
/* Apply T to the "add" and "sub" variants of TYPE.  */
#define TEST_TYPE(T, TYPE) \
T (TYPE, add, +) \
T (TYPE, sub, -)
/* Apply T to every tested integer and floating-point type.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t) \
TEST_TYPE (T, uint16_t) \
TEST_TYPE (T, uint32_t) \
TEST_TYPE (T, uint64_t) \
TEST_TYPE (T, _Float16) \
TEST_TYPE (T, float) \
TEST_TYPE (T, double)
TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */

View File

@ -0,0 +1,35 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "cond_mla_6.c"
#define FACTOR 17
#define N 99
#define TEST_LOOP(TYPE, NAME, OP) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3 < i % 5; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected \
= pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 5; \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
/* Run TEST_LOOP for every entry that TEST_ALL supplies.  TEST_LOOP aborts
   on a mismatch, so reaching the return means every check passed.  */
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}

View File

@ -0,0 +1,62 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>_<CONST>, a noipa function that walks N elements
   and stores A[i] OP B[i] * CONST into R[i] when PRED[i] differs from 1,
   or a copy of A[i] otherwise.  CONST is a literal multiplier that also
   becomes part of the function name.  */
#define DEF_LOOP(TYPE, NAME, OP, CONST) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \
}
/* Apply T to TYPE for both operations (add with +, sub with -) using the
   multiplier CONST.  */
#define TEST_COUNT(T, TYPE, CONST) \
T (TYPE, add, +, CONST) \
T (TYPE, sub, -, CONST)
/* Apply TEST_COUNT with the multipliers 2 and 4 and with the per-type
   constant CONST.  */
#define TEST_TYPE(T, TYPE, CONST) \
TEST_COUNT (T, TYPE, 2) \
TEST_COUNT (T, TYPE, 4) \
TEST_COUNT (T, TYPE, CONST)
/* Pair each unsigned integer type with the constant that has only the top
   bit of that type set.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t, 0x80) \
TEST_TYPE (T, uint16_t, 0x8000) \
TEST_TYPE (T, uint32_t, 0x80000000) \
TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
/* Emit the function definitions: 4 types x 3 multipliers x 2 operations.  */
TEST_ALL (DEF_LOOP)
/* Expected assembly for the functions above.  For every element size the
   scans require two LSLs by 1, two by 2 and two by (element bits - 1),
   four predicated ADDs and two predicated SUBs.  SEL and MOVPRFX must not
   appear at all.  The shift amounts are the base-2 logarithms of the
   multipliers 2, 4 and the top-bit constant.
   NOTE(review): the 4/2 split between ADD and SUB is presumably because
   subtracting a multiple of the top-bit constant equals adding it modulo
   the element width; confirm against the generated code.
   NOTE(review): the MOV pattern below only matches when a second 'z'
   occurs before the first comma that follows the mnemonic; confirm that
   this is the instruction form it is meant to reject.  */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View File

@ -0,0 +1,34 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "cond_mla_7.c"
#define N 99
#define TEST_LOOP(TYPE, NAME, OP, CONST) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3 < i % 5; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected \
= pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
/* Run TEST_LOOP for every entry that TEST_ALL supplies.  TEST_LOOP aborts
   on a mismatch, so reaching the return means every check passed.  */
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}

View File

@ -0,0 +1,62 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Define test_<TYPE>_<NAME>_<CONST>, a noipa function that walks N elements
   and stores A[i] OP B[i] * -CONST into R[i] when PRED[i] differs from 1,
   or a copy of A[i] otherwise.  The multiplier is the negation of CONST;
   the function name uses CONST itself.  */
#define DEF_LOOP(TYPE, NAME, OP, CONST) \
void __attribute__ ((noipa)) \
test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \
TYPE *__restrict a, \
TYPE *__restrict b, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \
}
/* Apply T to TYPE for both operations (add with +, sub with -) using the
   constant CONST.  */
#define TEST_COUNT(T, TYPE, CONST) \
T (TYPE, add, +, CONST) \
T (TYPE, sub, -, CONST)
/* Apply TEST_COUNT with the constants 2 and 4 and with the per-type
   constant CONST.  */
#define TEST_TYPE(T, TYPE, CONST) \
TEST_COUNT (T, TYPE, 2) \
TEST_COUNT (T, TYPE, 4) \
TEST_COUNT (T, TYPE, CONST)
/* Pair each unsigned integer type with the constant that has only the top
   bit of that type set.  */
#define TEST_ALL(T) \
TEST_TYPE (T, uint8_t, 0x80) \
TEST_TYPE (T, uint16_t, 0x8000) \
TEST_TYPE (T, uint32_t, 0x80000000) \
TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
/* Emit the function definitions: 4 types x 3 constants x 2 operations.  */
TEST_ALL (DEF_LOOP)
/* Expected assembly for the functions above.  For every element size the
   scans require two LSLs by 1, two by 2 and two by (element bits - 1),
   four predicated ADDs and two predicated SUBs.  SEL and MOVPRFX must not
   appear at all.  The shift amounts are the base-2 logarithms of the
   constants 2, 4 and the top-bit constant.
   NOTE(review): the 4/2 split between ADD and SUB is presumably because
   negating the multiplier swaps addition and subtraction for 2 and 4,
   while the top-bit constant is its own negation modulo the element
   width; confirm against the generated code.
   NOTE(review): the MOV pattern below only matches when a second 'z'
   occurs before the first comma that follows the mnemonic; confirm that
   this is the instruction form it is meant to reject.  */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View File

@ -0,0 +1,34 @@
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "cond_mla_8.c"
#define N 99
#define TEST_LOOP(TYPE, NAME, OP, CONST) \
{ \
TYPE r[N], a[N], b[N], pred[N]; \
for (int i = 0; i < N; ++i) \
{ \
a[i] = (i & 1 ? i : 3 * i); \
b[i] = (i >> 4) << (i & 15); \
pred[i] = i % 3 < i % 5; \
asm volatile ("" ::: "memory"); \
} \
test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected \
= pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \
if (r[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
/* Run TEST_LOOP for every entry that TEST_ALL supplies.  TEST_LOOP aborts
   on a mismatch, so reaching the return means every check passed.  */
int
main (void)
{
TEST_ALL (TEST_LOOP)
return 0;
}