[AArch64] Improve arm_neon.h vml<as>_lane handling.

gcc/ * config/aarch64/aarch64-simd-builtins.def (fma): New. * config/aarch64/aarch64-simd.md (aarch64_mla_elt<mode>): New. (aarch64_mla_elt_<vswap_width_name><mode>): Likewise. (aarch64_mls_elt<mode>): Likewise. (aarch64_mls_elt_<vswap_width_name><mode>): Likewise. (aarch64_fma4_elt<mode>): Likewise. (aarch64_fma4_elt_<vswap_width_name><mode>): Likewise. (aarch64_fma4_elt_to_128df): Likewise. (aarch64_fma4_elt_to_64v2df): Likewise. (fnma<mode>4): Likewise. (aarch64_fnma4_elt<mode>): Likewise. (aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise. (aarch64_fnma4_elt_to_128df): Likewise. (aarch64_fnma4_elt_to_64v2df): Likewise. * config/aarch64/iterators.md (VDQSF): New. * config/aarch64/arm_neon.h (vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation. (vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise. gcc/testsuite/ * gcc.target/aarch64/fmla_intrinsic_1.c: New. * gcc.target/aarch64/mla_intrinsic_1.c: Likewise. * gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. * gcc.target/aarch64/mls_intrinsic_1.c: Likewise. From-SVN: r202625
2013-09-16 09:53:11 +00:00 · 2013-09-16 09:53:11 +00:00 · 828e70c1d7
commit 828e70c1d7
parent 779aea46cc
10 changed files with 1137 additions and 561 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,25 @@
+2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
+
+	* config/aarch64/aarch64-simd-builtins.def (fma): New.
+	* config/aarch64/aarch64-simd.md
+	(aarch64_mla_elt<mode>): New.
+	(aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
+	(aarch64_mls_elt<mode>): Likewise.
+	(aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
+	(aarch64_fma4_elt<mode>): Likewise.
+	(aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
+	(aarch64_fma4_elt_to_128df): Likewise.
+	(aarch64_fma4_elt_to_64v2df): Likewise.
+	(fnma<mode>4): Likewise.
+	(aarch64_fnma4_elt<mode>): Likewise.
+	(aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
+	(aarch64_fnma4_elt_to_128df): Likewise.
+	(aarch64_fnma4_elt_to_64v2df): Likewise.
+	* config/aarch64/iterators.md (VDQSF): New.
+	* config/aarch64/arm_neon.h
+	(vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation.
+	(vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.
+
 2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>

 	* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@ -359,3 +359,6 @@
  /* Implemented by aarch64_st1<VALL:mode>.  */
  BUILTIN_VALL (STORE1, st1, 0)

+  /* Implemented by fma<mode>4.  Declares the fma builtin, of type
+     TERNOP, through the BUILTIN_VDQF macro.
+     NOTE(review): the trailing 4 presumably supplies the numeric suffix
+     of the fma<mode>4 pattern name -- confirm against the BUILTIN_*
+     macro definitions.  */
+  BUILTIN_VDQF (TERNOP, fma, 4)
+
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@ -1070,6 +1070,38 @@
   (set_attr "simd_mode" "<MODE>")]
 )

+;; Integer multiply-accumulate by a single vector lane:
+;;   op0 = op4 + duplicate (op1[op2]) * op3
+;; The accumulator (operand 4) is tied to the destination by the "0"
+;; constraint; the lane index (operand 2) must be an immediate.
+(define_insn "*aarch64_mla_elt<mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (plus:VDQHS
+	 (mult:VDQHS
+	   (vec_duplicate:VDQHS
+	      (vec_select:<VEL>
+		(match_operand:VDQHS 1 "register_operand" "<h_con>")
+		  (parallel [(match_operand:SI 2 "immediate_operand")])))
+	   (match_operand:VDQHS 3 "register_operand" "w"))
+	 (match_operand:VDQHS 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Integer multiply-accumulate by a single vector lane, where the lane is
+;; selected from a register of mode <VSWAP_WIDTH> instead of the result
+;; mode:
+;;   op0 = op4 + duplicate (op1[op2]) * op3
+;; Operand 4 is tied to the destination; operand 2 is an immediate index.
+;; NOTE(review): <VSWAP_WIDTH> is presumably the other-width (64- vs
+;; 128-bit) vector mode with the same element type -- confirm in
+;; iterators.md.
+(define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (plus:VDQHS
+	 (mult:VDQHS
+	   (vec_duplicate:VDQHS
+	      (vec_select:<VEL>
+		(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+		  (parallel [(match_operand:SI 2 "immediate_operand")])))
+	   (match_operand:VDQHS 3 "register_operand" "w"))
+	 (match_operand:VDQHS 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
 (define_insn "aarch64_mls<mode>"
 [(set (match_operand:VQ_S 0 "register_operand" "=w")
       (minus:VQ_S (match_operand:VQ_S 1 "register_operand" "0")
@ -1081,6 +1113,38 @@
   (set_attr "simd_mode" "<MODE>")]
 )

+;; Integer multiply-subtract by a single vector lane:
+;;   op0 = op4 - duplicate (op1[op2]) * op3
+;; The minuend (operand 4) is tied to the destination by the "0"
+;; constraint; the lane index (operand 2) must be an immediate.
+(define_insn "*aarch64_mls_elt<mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (minus:VDQHS
+	 (match_operand:VDQHS 4 "register_operand" "0")
+	 (mult:VDQHS
+	   (vec_duplicate:VDQHS
+	      (vec_select:<VEL>
+		(match_operand:VDQHS 1 "register_operand" "<h_con>")
+		  (parallel [(match_operand:SI 2 "immediate_operand")])))
+	   (match_operand:VDQHS 3 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Integer multiply-subtract by a single vector lane, where the lane is
+;; selected from a register of mode <VSWAP_WIDTH> instead of the result
+;; mode:
+;;   op0 = op4 - duplicate (op1[op2]) * op3
+;; Operand 4 is tied to the destination; operand 2 is an immediate index.
+;; NOTE(review): <VSWAP_WIDTH> is presumably the other-width (64- vs
+;; 128-bit) vector mode with the same element type -- confirm in
+;; iterators.md.
+(define_insn "*aarch64_mls_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (minus:VDQHS
+	 (match_operand:VDQHS 4 "register_operand" "0")
+	 (mult:VDQHS
+	   (vec_duplicate:VDQHS
+	      (vec_select:<VEL>
+		(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+		  (parallel [(match_operand:SI 2 "immediate_operand")])))
+	   (match_operand:VDQHS 3 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
 ;; Max/Min operations.
 (define_insn "<su><maxmin><mode>3"
 [(set (match_operand:VQ_S 0 "register_operand" "=w")
@ -1483,6 +1547,137 @@
   (set_attr "simd_mode" "<MODE>")]
 )

+;; Floating-point fused multiply-add by a single vector lane:
+;;   op0 = fma (duplicate (op1[op2]), op3, op4)
+;; The addend (operand 4) is tied to the destination by the "0"
+;; constraint; the lane index (operand 2) must be an immediate.
+(define_insn "*aarch64_fma4_elt<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+    (fma:VDQF
+      (vec_duplicate:VDQF
+	(vec_select:<VEL>
+	  (match_operand:VDQF 1 "register_operand" "<h_con>")
+	  (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQF 3 "register_operand" "w")
+      (match_operand:VDQF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Single-precision fused multiply-add by a single vector lane, where the
+;; lane is selected from a register of mode <VSWAP_WIDTH> instead of the
+;; result mode:
+;;   op0 = fma (duplicate (op1[op2]), op3, op4)
+;; Restricted to the VDQSF modes.  Operand 4 is tied to the destination.
+;; NOTE(review): <VSWAP_WIDTH> is presumably the other-width (64- vs
+;; 128-bit) vector mode with the same element type -- confirm in
+;; iterators.md.
+(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+    (fma:VDQSF
+      (vec_duplicate:VDQSF
+	(vec_select:<VEL>
+	  (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+	  (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQSF 3 "register_operand" "w")
+      (match_operand:VDQSF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; V2DF fused multiply-add where one multiplicand is a DF scalar
+;; duplicated across the vector:
+;;   op0 = fma (duplicate (op1), op2, op3)
+;; The addend (operand 3) is tied to the destination.  The scalar is
+;; addressed in the output template as lane 0 of its register.
+(define_insn "*aarch64_fma4_elt_to_128df"
+  [(set (match_operand:V2DF 0 "register_operand" "=w")
+    (fma:V2DF
+      (vec_duplicate:V2DF
+	  (match_operand:DF 1 "register_operand" "w"))
+      (match_operand:V2DF 2 "register_operand" "w")
+      (match_operand:V2DF 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+;; DF fused multiply-add where one multiplicand is a lane selected from a
+;; V2DF register:
+;;   op0 = fma (op1[op2], op3, op4)
+;; The addend (operand 4) is tied to the destination; the lane index
+;; (operand 2) must be an immediate.
+;; NOTE(review): the result mode is DF but the template uses the .2d
+;; vector arrangement for operands 0 and 3 -- presumably only the low
+;; lane of the result is meaningful; confirm this is intended.
+(define_insn "*aarch64_fma4_elt_to_64v2df"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+    (fma:DF
+	(vec_select:DF
+	  (match_operand:V2DF 1 "register_operand" "w")
+	  (parallel [(match_operand:SI 2 "immediate_operand")]))
+      (match_operand:DF 3 "register_operand" "w")
+      (match_operand:DF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.2d, %3.2d, %1.2d[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+;; Vector fused negate-multiply-add:
+;;   op0 = fma (op1, -op2, op3)
+;; The addend (operand 3) is tied to the destination by the "0"
+;; constraint.  Unlike the by-element patterns in this hunk, this
+;; pattern has a public name (no leading '*').
+(define_insn "fnma<mode>4"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+	(fma:VDQF
+	  (match_operand:VDQF 1 "register_operand" "w")
+          (neg:VDQF
+	    (match_operand:VDQF 2 "register_operand" "w"))
+	  (match_operand:VDQF 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+ "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "simd_type" "simd_fmla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Floating-point fused negate-multiply-add by a single vector lane:
+;;   op0 = fma (-op3, duplicate (op1[op2]), op4)
+;; The addend (operand 4) is tied to the destination by the "0"
+;; constraint; the lane index (operand 2) must be an immediate.
+(define_insn "*aarch64_fnma4_elt<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+    (fma:VDQF
+      (neg:VDQF
+        (match_operand:VDQF 3 "register_operand" "w"))
+      (vec_duplicate:VDQF
+	(vec_select:<VEL>
+	  (match_operand:VDQF 1 "register_operand" "<h_con>")
+	  (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Single-precision fused negate-multiply-add by a single vector lane,
+;; where the lane is selected from a register of mode <VSWAP_WIDTH>
+;; instead of the result mode:
+;;   op0 = fma (-op3, duplicate (op1[op2]), op4)
+;; Restricted to the VDQSF modes.  Operand 4 is tied to the destination.
+;; NOTE(review): <VSWAP_WIDTH> is presumably the other-width (64- vs
+;; 128-bit) vector mode with the same element type -- confirm in
+;; iterators.md.
+(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+    (fma:VDQSF
+      (neg:VDQSF
+        (match_operand:VDQSF 3 "register_operand" "w"))
+      (vec_duplicate:VDQSF
+	(vec_select:<VEL>
+	  (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+	  (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQSF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; V2DF fused negate-multiply-add where one multiplicand is a DF scalar
+;; duplicated across the vector:
+;;   op0 = fma (-op2, duplicate (op1), op3)
+;; The addend (operand 3) is tied to the destination.  The scalar is
+;; addressed in the output template as lane 0 of its register.
+(define_insn "*aarch64_fnma4_elt_to_128df"
+  [(set (match_operand:V2DF 0 "register_operand" "=w")
+    (fma:V2DF
+      (neg:V2DF
+        (match_operand:V2DF 2 "register_operand" "w"))
+      (vec_duplicate:V2DF
+	(match_operand:DF 1 "register_operand" "w"))
+      (match_operand:V2DF 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+;; DF fused negate-multiply-add where one multiplicand is a lane selected
+;; from a V2DF register:
+;;   op0 = fma (op1[op2], -op3, op4)
+;; The addend (operand 4) is tied to the destination; the lane index
+;; (operand 2) must be an immediate.
+;; NOTE(review): the result mode is DF but the template uses the .2d
+;; vector arrangement for operands 0 and 3 -- presumably only the low
+;; lane of the result is meaningful; confirm this is intended.
+(define_insn "*aarch64_fnma4_elt_to_64v2df"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+    (fma:DF
+      (vec_select:DF
+	(match_operand:V2DF 1 "register_operand" "w")
+	(parallel [(match_operand:SI 2 "immediate_operand")]))
+      (neg:DF
+        (match_operand:DF 3 "register_operand" "w"))
+      (match_operand:DF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.2d, %3.2d, %1.2d[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
 ;; Vector versions of the floating-point frint patterns.
 ;; Expands to btrunc, ceil, floor, nearbyint, rint, round.
 (define_insn "<frint_pattern><mode>2"
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@ -89,6 +89,9 @@
 ;; Vector Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])

+;; Vector single-precision Float modes (VDQF without V2DF).
+(define_mode_iterator VDQSF [V2SF V4SF])
+
 ;; Modes suitable to use as the return type of a vcond expression.
 (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,10 @@
+2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
+
+	* gcc.target/aarch64/fmla_intrinsic_1.c: New.
+	* gcc.target/aarch64/mla_intrinsic_1.c: Likewise.
+	* gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
+	* gcc.target/aarch64/mls_intrinsic_1.c: Likewise.
+
 2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>

 	* gcc.target/aarch64/mul_intrinsic_1.c: New.
--- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@ -0,0 +1,116 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+#define DELTA 0.0001
+
+extern double fabs (double);
+
+extern void abort (void);
+
+/* Define test_vfma<q1>_lane<q2>_f<size> (res, in1, in2): load the
+   accumulator from RES and the multiplicand from IN1, combine them with
+   one lane of a vector loaded from IN2 using the matching
+   vfma<q1>_lane<q2>_f<size> intrinsic, and store the result back to RES.
+   When the lane vector has more than one element it is loaded from IN2
+   and lane 1 is used; otherwise the single element is loaded from
+   IN2 + 1 and lane 0 is used, so both paths multiply by IN2[1].  */
+#define TEST_VMLA(q1, q2, size, in1_lanes, in2_lanes)			\
+static void								\
+test_vfma##q1##_lane##q2##_f##size (float##size##_t * res,		\
+				   const float##size##_t *in1,		\
+				   const float##size##_t *in2)		\
+{									\
+  float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res);		\
+  float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1);		\
+  float##size##x##in2_lanes##_t c;					\
+  if (in2_lanes > 1)							\
+    {									\
+      c = vld1##q2##_f##size (in2);					\
+      a = vfma##q1##_lane##q2##_f##size (a, b, c, 1);			\
+    }									\
+  else									\
+    {									\
+      c = vld1##q2##_f##size (in2 + 1);					\
+      a = vfma##q1##_lane##q2##_f##size (a, b, c, 0);			\
+    }									\
+  vst1##q1##_f##size (res, a);						\
+}
+
+/* Instantiate the four combinations of result width (q1) and lane-vector
+   width (q2) for one element size.  */
+#define BUILD_VARS(width, n_lanes, n_half_lanes)		\
+TEST_VMLA ( ,  , width, n_half_lanes, n_half_lanes)		\
+TEST_VMLA (q,  , width, n_lanes, n_half_lanes)			\
+TEST_VMLA ( , q, width, n_half_lanes, n_lanes)			\
+TEST_VMLA (q, q, width, n_lanes, n_lanes)			\
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (64, 2, 1)
+
+/* Input data (POOL<n>) and zero-initialised accumulators (EMPTY<n>) for
+   n-element arrays.  */
+#define POOL2 {0.0, 1.0}
+#define POOL4 {0.0, 1.0, 2.0, 3.0}
+#define EMPTY2 {0.0, 0.0}
+#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
+
+/* Define test_f<size> (): run each of the four wrappers on a zeroed
+   result array with POOL as both inputs.  The selected lane is
+   pool[1] == 1.0, so every stored element is expected to be
+   0 + pool[i] * 1.0 == pool[i], to within DELTA.  The non-q result
+   variants store only half the lanes, so only lanes / 2 elements are
+   checked for them.  Aborts on the first mismatch.  */
+#define BUILD_TEST(size, lanes)					\
+static void							\
+test_f##size (void)						\
+{								\
+  int i;							\
+  float##size##_t pool[lanes] = POOL##lanes;			\
+  float##size##_t res[lanes] = EMPTY##lanes;			\
+  float##size##_t res2[lanes] = EMPTY##lanes;			\
+  float##size##_t res3[lanes] = EMPTY##lanes;			\
+  float##size##_t res4[lanes] = EMPTY##lanes;			\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vfma_lane_f##size (res, pool, pool);			\
+  for (i = 0; i < lanes / 2; i++)				\
+    if (fabs (res[i] - pool[i]) > DELTA)			\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vfmaq_lane_f##size (res2, pool, pool);			\
+  for (i = 0; i < lanes; i++)					\
+    if (fabs (res2[i] - pool[i]) > DELTA)			\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vfma_laneq_f##size (res3, pool, pool);			\
+  for (i = 0; i < lanes / 2; i++)				\
+    if (fabs (res3[i] - pool[i]) > DELTA)			\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vfmaq_laneq_f##size (res4, pool, pool);			\
+  for (i = 0; i < lanes; i++)					\
+    if (fabs (res4[i] - pool[i]) > DELTA)			\
+      abort ();							\
+}
+
+BUILD_TEST (32, 4)
+BUILD_TEST (64, 2)
+
+/* Run the f32 and f64 checks; each one aborts on a mismatch, so
+   reaching the return means every check passed.  */
+int
+main (int argc, char **argv)
+{
+  test_f32 ();
+  test_f64 ();
+  return 0;
+}
+
+/* vfma_laneq_f32.
+   vfma_lane_f32.  */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfmaq_lane_f32.
+   vfmaq_laneq_f32.  */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfma_lane_f64.  */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
+
+/* vfmaq_lane_f64.
+   vfma_laneq_f64.
+   vfmaq_laneq_f64.  */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+
+/* { dg-final { cleanup-saved-temps } } */
+
--- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
@ -0,0 +1,117 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+#define DELTA 0.0001
+
+extern double fabs (double);
+
+extern void abort (void);
+
+/* Define test_vfms<q1>_lane<q2>_f<size> (res, in1, in2): load the
+   accumulator from RES and the multiplicand from IN1, combine them with
+   one lane of a vector loaded from IN2 using the matching
+   vfms<q1>_lane<q2>_f<size> intrinsic, and store the result back to RES.
+   When the lane vector has more than one element it is loaded from IN2
+   and lane 1 is used; otherwise the single element is loaded from
+   IN2 + 1 and lane 0 is used, so both paths multiply by IN2[1].  */
+#define TEST_VMLS(q1, q2, size, in1_lanes, in2_lanes)			\
+static void								\
+test_vfms##q1##_lane##q2##_f##size (float##size##_t * res,		\
+				   const float##size##_t *in1,		\
+				   const float##size##_t *in2)		\
+{									\
+  float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res);		\
+  float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1);		\
+  float##size##x##in2_lanes##_t c;					\
+  if (in2_lanes > 1)							\
+    {									\
+      c = vld1##q2##_f##size (in2);					\
+      a = vfms##q1##_lane##q2##_f##size (a, b, c, 1);			\
+    }									\
+  else									\
+    {									\
+      c = vld1##q2##_f##size (in2 + 1);					\
+      a = vfms##q1##_lane##q2##_f##size (a, b, c, 0);			\
+    }									\
+  vst1##q1##_f##size (res, a);						\
+}
+
+/* Instantiate the four combinations of result width (q1) and lane-vector
+   width (q2) for one element size.  */
+#define BUILD_VARS(width, n_lanes, n_half_lanes)		\
+TEST_VMLS ( ,  , width, n_half_lanes, n_half_lanes)		\
+TEST_VMLS (q,  , width, n_lanes, n_half_lanes)			\
+TEST_VMLS ( , q, width, n_half_lanes, n_lanes)			\
+TEST_VMLS (q, q, width, n_lanes, n_lanes)			\
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (64, 2, 1)
+
+/* Input data (POOL<n>) and zero-initialised accumulators (EMPTY<n>) for
+   n-element arrays.  */
+#define POOL2 {0.0, 1.0}
+#define POOL4 {0.0, 1.0, 2.0, 3.0}
+#define EMPTY2 {0.0, 0.0}
+#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
+
+/* Define test_f<size> (): run each of the four wrappers on a zeroed
+   result array with POOL as both inputs.  The selected lane is
+   pool[1] == 1.0, so every stored element is expected to be
+   0 - pool[i] * 1.0 == -pool[i]; the checks therefore test that
+   res[i] + pool[i] is zero to within DELTA.  The non-q result variants
+   store only half the lanes, so only lanes / 2 elements are checked for
+   them.  Aborts on the first mismatch.  */
+#define BUILD_TEST(size, lanes)					\
+static void							\
+test_f##size (void)						\
+{								\
+  int i;							\
+  float##size##_t pool[lanes] = POOL##lanes;			\
+  float##size##_t res[lanes] = EMPTY##lanes;			\
+  float##size##_t res2[lanes] = EMPTY##lanes;			\
+  float##size##_t res3[lanes] = EMPTY##lanes;			\
+  float##size##_t res4[lanes] = EMPTY##lanes;			\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vfms_lane_f##size (res, pool, pool);			\
+  asm volatile ("" : :"Q" (res) : "memory");			\
+  for (i = 0; i < lanes / 2; i++)				\
+    if (fabs (res[i] + pool[i]) > DELTA)			\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  test_vfmsq_lane_f##size (res2, pool, pool);			\
+  asm volatile ("" : :"Q" (res2) : "memory");			\
+  for (i = 0; i < lanes; i++)					\
+    if (fabs (res2[i] + pool[i]) > DELTA)			\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  test_vfms_laneq_f##size (res3, pool, pool);			\
+  asm volatile ("" : :"Q" (res3) : "memory");			\
+  for (i = 0; i < lanes / 2; i++)				\
+    if (fabs (res3[i] + pool[i]) > DELTA)			\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  test_vfmsq_laneq_f##size (res4, pool, pool);			\
+  asm volatile ("" : :"Q" (res4) : "memory");			\
+  for (i = 0; i < lanes; i++)					\
+    if (fabs (res4[i] + pool[i]) > DELTA)			\
+      abort ();							\
+}
+
+BUILD_TEST (32, 4)
+BUILD_TEST (64, 2)
+
+/* Run the f32 and f64 checks; each one aborts on a mismatch, so
+   reaching the return means every check passed.  */
+int
+main (int argc, char **argv)
+{
+  test_f32 ();
+  test_f64 ();
+  return 0;
+}
+
+/* vfms_laneq_f32.
+   vfms_lane_f32.  */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfmsq_lane_f32.
+   vfmsq_laneq_f32.  */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfms_lane_f64.  */
+/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
+
+/* vfmsq_lane_f64.
+   vfms_laneq_f64.
+   vfmsq_laneq_f64.  */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+
+/* { dg-final { cleanup-saved-temps } } */
+
--- a/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c
@ -0,0 +1,84 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+/* Map a signedness letter (s/u), an element size and an optional
+   x<lanes> suffix to the corresponding int/uint type name.  */
+#define MAPs(size, xx) int##size##xx##_t
+#define MAPu(size, xx) uint##size##xx##_t
+
+
+/* Define test_vmlaq_lane<q>_<su><size> (res, in1, in2): load the
+   accumulator from RES and the multiplicand from IN1 as q-register
+   vectors, load the lane vector from IN2 (a q-register vector only for
+   the laneq form), apply vmlaq_lane<q>_<su><size> with lane 1, and
+   store the result back to RES.  */
+#define TEST_VMLA(q, su, size, in1_lanes, in2_lanes)		\
+static void							\
+test_vmlaq_lane##q##_##su##size (MAP##su (size, ) * res,	\
+				 const MAP##su(size, ) *in1,	\
+				 const MAP##su(size, ) *in2)	\
+{								\
+  MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res);	\
+  MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1);	\
+  MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2);	\
+  a = vmlaq_lane##q##_##su##size (a, b, c, 1);			\
+  vst1q_##su##size (res, a);					\
+}
+
+/* Instantiate the lane and laneq wrappers for signed and unsigned
+   elements of one size.  */
+#define BUILD_VARS(width, n_lanes, n_half_lanes)		\
+TEST_VMLA (, s, width, n_lanes, n_half_lanes)			\
+TEST_VMLA (q, s, width, n_lanes, n_lanes)			\
+TEST_VMLA (, u, width, n_lanes, n_half_lanes)			\
+TEST_VMLA (q, u, width, n_lanes, n_lanes)			\
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (16, 8, 4)
+
+/* Input data (POOL<n>) and zero-initialised accumulators (EMPTY<n>) for
+   n-element arrays.  */
+#define POOL4 {0, 1, 2, 3}
+#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
+#define EMPTY4 {0, 0, 0, 0}
+#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
+
+/* Define test_<su><size> (): run the lane and laneq wrappers on a zeroed
+   result array with POOL as both inputs.  The selected lane is
+   pool[1] == 1, so every stored element is expected to be
+   0 + pool[i] * 1 == pool[i].  Aborts on the first mismatch.  */
+#define BUILD_TEST(su, size, lanes)				\
+static void							\
+test_##su##size (void)						\
+{								\
+  int i;							\
+  MAP##su (size,) pool[lanes] = POOL##lanes;			\
+  MAP##su (size,) res[lanes] = EMPTY##lanes;			\
+  MAP##su (size,) res2[lanes] = EMPTY##lanes;			\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vmlaq_lane_##su##size (res, pool, pool);			\
+  for (i = 0; i < lanes; i++)					\
+    if (res[i] != pool[i])					\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vmlaq_laneq_##su##size (res2, pool, pool);		\
+  for (i = 0; i < lanes; i++)					\
+    if (res2[i] != pool[i])					\
+      abort ();							\
+}
+
+/* BUILD_VARS is redefined here to instantiate the signed and unsigned
+   test drivers for one element size.  */
+#undef BUILD_VARS
+#define BUILD_VARS(size, lanes)					\
+BUILD_TEST (s, size, lanes)					\
+BUILD_TEST (u, size, lanes)
+
+BUILD_VARS (32, 4)
+BUILD_VARS (16, 8)
+
+/* Run the signed and unsigned 32-bit and 16-bit checks; each one aborts
+   on a mismatch, so reaching the return means every check passed.  */
+int
+main (int argc, char **argv)
+{
+  test_s32 ();
+  test_u32 ();
+  test_s16 ();
+  test_u16 ();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
+
--- a/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c
@ -0,0 +1,89 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+/* Map a signedness letter (s/u), an element size and an optional
+   x<lanes> suffix to the corresponding int/uint type name.  */
+#define MAPs(size, xx) int##size##xx##_t
+#define MAPu(size, xx) uint##size##xx##_t
+
+
+/* Define test_vmlsq_lane<q>_<su><size> (res, in1, in2): load the
+   accumulator from RES and the multiplicand from IN1 as q-register
+   vectors, load the lane vector from IN2 (a q-register vector only for
+   the laneq form), apply vmlsq_lane<q>_<su><size> with lane 1, and
+   store the result back to RES.  */
+#define TEST_VMLS(q, su, size, in1_lanes, in2_lanes)		\
+static void							\
+test_vmlsq_lane##q##_##su##size (MAP##su (size, ) * res,	\
+				 const MAP##su(size, ) *in1,	\
+				 const MAP##su(size, ) *in2)	\
+{								\
+  MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res);	\
+  MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1);	\
+  MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2);	\
+  a = vmlsq_lane##q##_##su##size (a, b, c, 1);			\
+  vst1q_##su##size (res, a);					\
+}
+
+/* Instantiate the lane and laneq wrappers for signed and unsigned
+   elements of one size.  */
+#define BUILD_VARS(width, n_lanes, n_half_lanes)		\
+TEST_VMLS (, s, width, n_lanes, n_half_lanes)			\
+TEST_VMLS (q, s, width, n_lanes, n_lanes)			\
+TEST_VMLS (, u, width, n_lanes, n_half_lanes)			\
+TEST_VMLS (q, u, width, n_lanes, n_lanes)			\
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (16, 8, 4)
+
+/* Operator used to compare a result element against the pool: signed
+   results are expected to be -pool[i] (so res + pool == 0), unsigned
+   results are expected to be pool[i] (so res - pool == 0).  */
+#define MAP_OPs +
+#define MAP_OPu -
+
+/* Input data (POOL<n>) and initial accumulators (EMPTY<n><su>).  Despite
+   the name, the unsigned accumulators are not zero: they start at
+   2 * pool[i] so that subtracting pool[i] * 1 leaves pool[i] without
+   wrapping below zero.  */
+#define POOL4 {0, 1, 2, 3}
+#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
+#define EMPTY4s {0, 0, 0, 0}
+#define EMPTY8s {0, 0, 0, 0, 0, 0, 0, 0}
+#define EMPTY4u {0, 2, 4, 6}
+#define EMPTY8u {0, 2, 4, 6, 8, 10, 12, 14}
+
+/* Define test_<su><size> (): run the lane and laneq wrappers with POOL
+   as both inputs.  The selected lane is pool[1] == 1, so each stored
+   element is the initial accumulator minus pool[i]; the result is
+   checked with the MAP_OP<su> operator.  Aborts on the first
+   mismatch.  */
+#define BUILD_TEST(su, size, lanes)				\
+static void							\
+test_##su##size (void)						\
+{								\
+  int i;							\
+  MAP##su (size,) pool[lanes] = POOL##lanes;			\
+  MAP##su (size,) res[lanes] = EMPTY##lanes##su;		\
+  MAP##su (size,) res2[lanes] = EMPTY##lanes##su;		\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vmlsq_lane_##su##size (res, pool, pool);			\
+  for (i = 0; i < lanes; i++)					\
+    if (res[i] MAP_OP##su pool[i] != 0)				\
+      abort ();							\
+								\
+  /* Forcefully avoid optimization.  */				\
+  asm volatile ("" : : : "memory");				\
+  test_vmlsq_laneq_##su##size (res2, pool, pool);		\
+  for (i = 0; i < lanes; i++)					\
+    if (res2[i] MAP_OP##su pool[i] != 0)			\
+      abort ();							\
+}
+
+/* BUILD_VARS is redefined here to instantiate the signed and unsigned
+   test drivers for one element size.  */
+#undef BUILD_VARS
+#define BUILD_VARS(size, lanes)					\
+BUILD_TEST (s, size, lanes)					\
+BUILD_TEST (u, size, lanes)
+
+BUILD_VARS (32, 4)
+BUILD_VARS (16, 8)
+
+/* Run the signed and unsigned 32-bit and 16-bit checks; each one aborts
+   on a mismatch, so reaching the return means every check passed.  */
+int
+main (int argc, char **argv)
+{
+  test_s32 ();
+  test_u32 ();
+  test_s16 ();
+  test_u16 ();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
+