[AArch64] Use SVE MLA, MLS, MAD and MSB for conditional arithmetic

This patch uses predicated MLA, MLS, MAD and MSB to implement conditional "FMA"s on integers. This also requires providing the unpredicated optabs (fma and fnma) since otherwise tree-ssa-math-opts.c won't try to use the conditional forms. We still want to use shifts and adds in preference to multiplications, so the patch makes the optab expanders check for that. The tests cover floating-point types too, which are already handled, and which were already tested to some extent by gcc.dg/vect. 2019-08-15 Richard Sandiford <richard.sandiford@arm.com> Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org> gcc/ * config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma) (aarch64_prepare_sve_cond_int_fma): Declare. * config/aarch64/aarch64.c (aarch64_convert_mult_to_shift) (aarch64_prepare_sve_int_fma): New functions. (aarch64_prepare_sve_cond_int_fma): Likewise. * config/aarch64/aarch64-sve.md (cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker. (fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2) (*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4) (cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2) (*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns. (*madd<mode>): Rename to... (*fma<mode>4): ...this. (*msub<mode>): Rename to... (*fnma<mode>4): ...this. gcc/testsuite/ * gcc.target/aarch64/sve/cond_mla_1.c: New test. * gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise. * gcc.target/aarch64/sve/cond_mla_2.c: Likewise. * gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise. * gcc.target/aarch64/sve/cond_mla_3.c: Likewise. * gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise. * gcc.target/aarch64/sve/cond_mla_4.c: Likewise. * gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise. * gcc.target/aarch64/sve/cond_mla_5.c: Likewise. * gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise. * gcc.target/aarch64/sve/cond_mla_6.c: Likewise. * gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise. * gcc.target/aarch64/sve/cond_mla_7.c: Likewise. * gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise. * gcc.target/aarch64/sve/cond_mla_8.c: Likewise. * gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise. Co-Authored-By: Kugan Vivekanandarajah <kuganv@linaro.org> From-SVN: r274509
2019-08-15 08:22:07 +00:00 · 2019-08-15 08:22:07 +00:00 · b6c3aea189
parent a19ba9e1b1
commit b6c3aea189
21 changed files with 1082 additions and 3 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,22 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+	* config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
+	(aarch64_prepare_sve_cond_int_fma): Declare.
+	* config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
+	(aarch64_prepare_sve_int_fma): New functions.
+	(aarch64_prepare_sve_cond_int_fma): Likewise.
+	* config/aarch64/aarch64-sve.md
+	(cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
+	(fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
+	(*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
+	(cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
+	(*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
+	(*madd<mode>): Rename to...
+	(*fma<mode>4): ...this.
+	(*msub<mode>): Rename to...
+	(*fnma<mode>4): ...this.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
 	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@ -630,6 +630,9 @@ bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
 void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
 bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
 void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *);
+
+bool aarch64_prepare_sve_int_fma (rtx *, rtx_code);
+bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code);
 #endif /* RTX_CODE */

 void aarch64_init_builtins (void);
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@ -1844,7 +1844,7 @@
 )

 ;; Predicated integer operations with merging.
-(define_expand "cond_<optab><mode>"
+(define_expand "@cond_<optab><mode>"
  [(set (match_operand:SVE_I 0 "register_operand")
 	(unspec:SVE_I
 	  [(match_operand:<VPRED> 1 "register_operand")
@ -3384,8 +3384,26 @@
 ;; - MLA
 ;; -------------------------------------------------------------------------

+;; Unpredicated integer addition of product.
+(define_expand "fma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+	(plus:SVE_I
+	  (unspec:SVE_I
+	    [(match_dup 4)
+	     (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+			 (match_operand:SVE_I 2 "nonmemory_operand"))]
+	    UNSPEC_PRED_X)
+	  (match_operand:SVE_I 3 "register_operand")))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, PLUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer addition of product.
-(define_insn "*madd<mode>"
+(define_insn "*fma<mode>4"
  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
 	(plus:SVE_I
 	  (unspec:SVE_I
@ -3402,6 +3420,97 @@
  [(set_attr "movprfx" "*,*,yes")]
 )

+;; Predicated integer addition of product with merging.
+(define_expand "cond_fma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+			 (match_operand:SVE_I 3 "general_operand"))
+	     (match_operand:SVE_I 4 "register_operand"))
+	   (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, PLUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer addition of product, merging with the first input.
+(define_insn "*cond_fma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w"))
+	     (match_operand:SVE_I 4 "register_operand" "w, w"))
+	   (match_dup 2)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with the third input.
+(define_insn "*cond_fma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w"))
+	     (match_operand:SVE_I 4 "register_operand" "0, w"))
+	   (match_dup 4)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with an independent value.
+(define_insn_and_rewrite "*cond_fma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w"))
+	     (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w"))
+	   (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+	  UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+					     operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] MLS and MSB
 ;; -------------------------------------------------------------------------
@ -3410,8 +3519,26 @@
 ;; - MSB
 ;; -------------------------------------------------------------------------

+;; Unpredicated integer subtraction of product.
+(define_expand "fnma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+	(minus:SVE_I
+	  (match_operand:SVE_I 3 "register_operand")
+	  (unspec:SVE_I
+	    [(match_dup 4)
+	     (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+			 (match_operand:SVE_I 2 "general_operand"))]
+	    UNSPEC_PRED_X)))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, MINUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer subtraction of product.
-(define_insn "*msub<mode>3"
+(define_insn "*fnma<mode>3"
  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
 	(minus:SVE_I
 	  (match_operand:SVE_I 4 "register_operand" "w, 0, w")
@ -3428,6 +3555,98 @@
  [(set_attr "movprfx" "*,*,yes")]
 )

+;; Predicated integer subtraction of product with merging.
+(define_expand "cond_fnma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+   (unspec:SVE_I
+	[(match_operand:<VPRED> 1 "register_operand")
+	 (minus:SVE_I
+	   (match_operand:SVE_I 4 "register_operand")
+	   (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+		       (match_operand:SVE_I 3 "general_operand")))
+	 (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+	UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, MINUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer subtraction of product, merging with the first input.
+(define_insn "*cond_fnma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (minus:SVE_I
+	     (match_operand:SVE_I 4 "register_operand" "w, w")
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w")))
+	   (match_dup 2)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with the third input.
+(define_insn "*cond_fnma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (minus:SVE_I
+	     (match_operand:SVE_I 4 "register_operand" "0, w")
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w")))
+	   (match_dup 4)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with an
+;; independent value.
+(define_insn_and_rewrite "*cond_fnma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+	   (minus:SVE_I
+	     (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w")
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w")))
+	   (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+	  UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+					     operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Dot product
 ;; -------------------------------------------------------------------------
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@ -16469,6 +16469,98 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
    aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
 }

+/* Check whether VALUE is a vector constant in which every element
+   is either a power of 2 or a negated power of 2.  If so, return
+   a constant vector of log2s, and flip CODE between PLUS and MINUS
+   if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
+
+static rtx
+aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
+{
+  if (GET_CODE (value) != CONST_VECTOR)
+    return NULL_RTX;
+
+  rtx_vector_builder builder;
+  if (!builder.new_unary_operation (GET_MODE (value), value, false))
+    return NULL_RTX;
+
+  scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
+  /* 1 if the result of the multiplication must be negated,
+     0 if it mustn't, or -1 if we don't yet care.  */
+  int negate = -1;
+  unsigned int encoded_nelts = const_vector_encoded_nelts (value);
+  for (unsigned int i = 0; i < encoded_nelts; ++i)
+    {
+      rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
+      if (!CONST_SCALAR_INT_P (elt))
+	return NULL_RTX;
+      rtx_mode_t val (elt, int_mode);
+      wide_int pow2 = wi::neg (val);
+      if (val != pow2)
+	{
+	  /* It matters whether we negate or not.  Make that choice,
+	     and make sure that it's consistent with previous elements.  */
+	  if (negate == !wi::neg_p (val))
+	    return NULL_RTX;
+	  negate = wi::neg_p (val);
+	  if (!negate)
+	    pow2 = val;
+	}
+      /* POW2 is now the value that we want to be a power of 2.  */
+      int shift = wi::exact_log2 (pow2);
+      if (shift < 0)
+	return NULL_RTX;
+      builder.quick_push (gen_int_mode (shift, int_mode));
+    }
+  if (negate == -1)
+    /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
+    code = PLUS;
+  else if (negate == 1)
+    code = code == PLUS ? MINUS : PLUS;
+  return builder.build ();
+}
+
+/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
+   CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
+   operands array, in the same order as for fma_optab.  Return true if
+   the function emitted all the necessary instructions, false if the caller
+   should generate the pattern normally with the new OPERANDS array.  */
+
+bool
+aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
+				  NULL_RTX, true, OPTAB_DIRECT);
+      force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
+			  operands[3], product, operands[0], true,
+			  OPTAB_DIRECT);
+      return true;
+    }
+  operands[2] = force_reg (mode, operands[2]);
+  return false;
+}
+
+/* Likewise, but for a conditional pattern.  */
+
+bool
+aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
+				  NULL_RTX, true, OPTAB_DIRECT);
+      emit_insn (gen_cond (code, mode, operands[0], operands[1],
+			   operands[4], product, operands[5]));
+      return true;
+    }
+  operands[3] = force_reg (mode, operands[3]);
+  return false;
+}
+
 static unsigned HOST_WIDE_INT
 aarch64_shift_truncation_mask (machine_mode mode)
 {
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,23 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+	* gcc.target/aarch64/sve/cond_mla_1.c: New test.
+	* gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
 	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c
@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i];	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c
@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_1.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i];	\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c
@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c;	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c
@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_2.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected = (pred[i] != 1				\
+			 ? a[i] OP b[i] * (TYPE) FACTOR		\
+			 : (TYPE) FACTOR);			\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c
@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i];	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c
@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_3.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i];	\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c
@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i];	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c
@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_4.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3;					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected = (pred[i] == 1				\
+			 ? a[i] OP b[i] * (TYPE) FACTOR		\
+			 : pred[i]);				\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c
@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] ? a[i] OP b[i] * c : 0;		\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c
@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_5.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0;		\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c
@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] ? a[i] OP b[i] * c : 5;		\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c
@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_6.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 5;	\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c
@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST)				\
+  void __attribute__ ((noipa))					\
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, 		\
+				  TYPE *__restrict a,		\
+				  TYPE *__restrict b,		\
+				  TYPE *__restrict pred, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i)					\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i];	\
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+  T (TYPE, add, +, CONST) \
+  T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c
@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_7.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST)			\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i];		\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c
@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST)				\
+  void __attribute__ ((noipa))					\
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, 		\
+				  TYPE *__restrict a,		\
+				  TYPE *__restrict b,		\
+				  TYPE *__restrict pred, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i)					\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i];	\
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+  T (TYPE, add, +, CONST) \
+  T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c
@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_8.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST)			\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i];	\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}