[AArch64] Rewrite the vdup_lane intrinsics in C

gcc/
	* config/aarch64/aarch64-simd-builtins.def
	(dup_lane_scalar): Remove.
	* config/aarch64/aarch64-simd.md
	(aarch64_simd_dup): Add 'w->w' alternative.
	(aarch64_dup_lane<mode>): Allow for VALL.
	(aarch64_dup_lane_scalar<mode>): Remove.
	(aarch64_dup_lane_<vswap_width_name><mode>): New.
	(aarch64_get_lane_signed<mode>): Add w->w alternative.
	(aarch64_get_lane_unsigned<mode>): Likewise.
	(aarch64_get_lane<mode>): Likewise.
	* config/aarch64/aarch64.c (aarch64_evpc_dup): New.
	(aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup.
	* config/aarch64/iterators.md (VSWAP_WIDTH): New.
	(VCON): Change container of V2SF.
	(vswap_width_name): New.
	* config/aarch64/arm_neon.h
	(__aarch64_vdup_lane_any): New.
	(__aarch64_vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.
	(vdup<q>_n_<psuf><8,16,32,64>): Convert to C implementation.
	(vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/scalar_intrinsics.c
	(vdup<bhsd>_lane<su><8,16,32,64>): Force values to SIMD registers.

From-SVN: r202180
Author: James Greenhalgh <james.greenhalgh@arm.com>, 2013-09-02 16:22:10 +00:00 (committed by James Greenhalgh)
parent d617d2d806
commit 91bd4114a7
7 changed files with 887 additions and 602 deletions

gcc/config/aarch64/aarch64-simd.md
@@ -336,24 +336,13 @@
 })
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQ 0 "register_operand" "=w")
-	(vec_duplicate:VDQ (match_operand:<VEL> 1 "register_operand" "r")))]
+  [(set (match_operand:VDQ 0 "register_operand" "=w, w")
+	(vec_duplicate:VDQ (match_operand:<VEL> 1 "register_operand" "r, w")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %<vw>1"
-  [(set_attr "simd_type" "simd_dupgp")
-   (set_attr "simd_mode" "<MODE>")]
-)
-
-(define_insn "aarch64_dup_lane<mode>"
-  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-	(vec_duplicate:VDQ_I
-	  (vec_select:<VEL>
-	    (match_operand:<VCON> 1 "register_operand" "w")
-	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
-	  )))]
-  "TARGET_SIMD"
-  "dup\\t%<v>0<Vmtype>, %1.<Vetype>[%2]"
-  [(set_attr "simd_type" "simd_dup")
+  "@
+   dup\\t%0.<Vtype>, %<vw>1
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]"
+  [(set_attr "simd_type" "simd_dupgp, simd_dup")
   (set_attr "simd_mode" "<MODE>")]
 )
@@ -366,6 +355,32 @@
    (set_attr "simd_mode" "<MODE>")]
 )
 
+(define_insn "aarch64_dup_lane<mode>"
+  [(set (match_operand:VALL 0 "register_operand" "=w")
+	(vec_duplicate:VALL
+	  (vec_select:<VEL>
+	    (match_operand:VALL 1 "register_operand" "w")
+	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
+	  )))]
+  "TARGET_SIMD"
+  "dup\\t%0.<Vtype>, %1.<Vetype>[%2]"
+  [(set_attr "simd_type" "simd_dup")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "aarch64_dup_lane_<vswap_width_name><mode>"
+  [(set (match_operand:VALL 0 "register_operand" "=w")
+	(vec_duplicate:VALL
+	  (vec_select:<VEL>
+	    (match_operand:<VSWAP_WIDTH> 1 "register_operand" "w")
+	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
+	  )))]
+  "TARGET_SIMD"
+  "dup\\t%0.<Vtype>, %1.<Vetype>[%2]"
+  [(set_attr "simd_type" "simd_dup")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VD 0 "aarch64_simd_nonimmediate_operand"
 	 "=w, Utv,  w, ?r, ?w, ?r, w")
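For context, these two shapes are exactly what the NEON lane-dup intrinsics produce; a minimal C sketch, assuming arm_neon.h (the function names here are illustrative, and exact instruction selection depends on register allocation):

#include <arm_neon.h>

/* Same-width lane dup, matching the relaxed aarch64_dup_lane<mode>
   pattern (e.g. "dup v0.2s, v1.s[1]").  */
int32x2_t
dup_same_width (int32x2_t a)
{
  return vdup_lane_s32 (a, 1);
}

/* Width-swapping lane dup: a 128-bit result built from a lane of a
   64-bit input, matching aarch64_dup_lane_<vswap_width_name><mode>.  */
int32x4_t
dup_swap_width (int32x2_t a)
{
  return vdupq_lane_s32 (a, 1);
}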
gcc/config/aarch64/aarch64.c
@@ -7931,6 +7931,55 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
   return true;
 }
 
+static bool
+aarch64_evpc_dup (struct expand_vec_perm_d *d)
+{
+  rtx (*gen) (rtx, rtx, rtx);
+  rtx out = d->target;
+  rtx in0;
+  enum machine_mode vmode = d->vmode;
+  unsigned int i, elt, nelt = d->nelt;
+  rtx lane;
+
+  /* TODO: This may not be big-endian safe.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  elt = d->perm[0];
+  for (i = 1; i < nelt; i++)
+    {
+      if (elt != d->perm[i])
+	return false;
+    }
+
+  /* The generic preparation in aarch64_expand_vec_perm_const_1
+     swaps the operand order and the permute indices if it finds
+     d->perm[0] to be in the second operand.  Thus, we can always
+     use d->op0 and need not do any extra arithmetic to get the
+     correct lane number.  */
+  in0 = d->op0;
+  lane = GEN_INT (elt);
+
+  switch (vmode)
+    {
+    case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
+    case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
+    case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
+    case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
+    case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
+    case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
+    case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
+    case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
+    case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
+    case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
+    default:
+      return false;
+    }
+
+  emit_insn (gen (out, in0, lane));
+  return true;
+}
+
 static bool
 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
 {
@@ -7988,6 +8037,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_trn (d))
 	return true;
+      else if (aarch64_evpc_dup (d))
+	return true;
       return aarch64_evpc_tbl (d);
     }
 
   return false;
gcc/config/aarch64/arm_neon.h (diff suppressed because it is too large)

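Although the full diff is suppressed, the ChangeLog names the moving parts; a minimal sketch of the shape of the conversion for one element type (s32), assuming the helper plumbing follows the names above (the real header generates every <fpsu> type and q-variant):

/* Sketch, not the verbatim header.  Every lane-dup intrinsic is a lane
   read followed by a broadcast, so one macro can generate them all; the
   use of vget_lane here is an assumption about the plumbing.  */
#define __aarch64_vdup_lane_any(__size, __q1, __q2, __a, __b)	\
  vdup##__q1##_n_##__size (vget##__q2##_lane_##__size (__a, __b))

#define __aarch64_vdup_lane_s32(__a, __b)	\
  __aarch64_vdup_lane_any (s32, , , __a, __b)
#define __aarch64_vdupq_laneq_s32(__a, __b)	\
  __aarch64_vdup_lane_any (s32, q, q, __a, __b)

/* vdup_n_* becomes a plain vector constructor ...  */
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vdup_n_s32 (int32_t __a)
{
  return (int32x2_t) {__a, __a};
}

/* ... and vdup_lane_* simply forwards to the macro.  */
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vdup_lane_s32 (int32x2_t __a, const int __b)
{
  return __aarch64_vdup_lane_s32 (__a, __b);
}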
gcc/config/aarch64/iterators.md
@@ -383,7 +383,7 @@
 			(V4HI "V8HI") (V8HI "V8HI")
 			(V2SI "V4SI") (V4SI "V4SI")
 			(DI   "V2DI") (V2DI "V2DI")
-			(V2SF "V2SF") (V4SF "V4SF")
+			(V2SF "V4SF") (V4SF "V4SF")
 			(V2DF "V2DF") (SI   "V4SI")
 			(HI   "V8HI") (QI   "V16QI")])
@@ -527,6 +527,20 @@
 (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")])
 (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")])
 
+(define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI")
+			       (V4HI "V8HI") (V8HI "V4HI")
+			       (V2SI "V4SI") (V4SI "V2SI")
+			       (DI "V2DI") (V2DI "DI")
+			       (V2SF "V4SF") (V4SF "V2SF")
+			       (DF "V2DF") (V2DF "DF")])
+
+(define_mode_attr vswap_width_name [(V8QI "to_128") (V16QI "to_64")
+				    (V4HI "to_128") (V8HI "to_64")
+				    (V2SI "to_128") (V4SI "to_64")
+				    (DI "to_128") (V2DI "to_64")
+				    (V2SF "to_128") (V4SF "to_64")
+				    (DF "to_128") (V2DF "to_64")])
+
 ;; -------------------------------------------------------------------
 ;; Code Iterators
 ;; -------------------------------------------------------------------
gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c
@@ -198,13 +198,21 @@ test_vcltzd_s64 (int64x1_t a)
 int8x1_t
 test_vdupb_lane_s8 (int8x16_t a)
 {
-  return vdupb_lane_s8 (a, 2);
+  int8x1_t res;
+  force_simd (a);
+  res = vdupb_laneq_s8 (a, 2);
+  force_simd (res);
+  return res;
 }
 
 uint8x1_t
 test_vdupb_lane_u8 (uint8x16_t a)
 {
-  return vdupb_lane_u8 (a, 2);
+  uint8x1_t res;
+  force_simd (a);
+  res = vdupb_laneq_u8 (a, 2);
+  force_simd (res);
+  return res;
 }
 
 /* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */
@@ -212,13 +220,21 @@ test_vdupb_lane_u8 (uint8x16_t a)
 int16x1_t
 test_vduph_lane_s16 (int16x8_t a)
 {
-  return vduph_lane_s16 (a, 2);
+  int16x1_t res;
+  force_simd (a);
+  res = vduph_laneq_s16 (a, 2);
+  force_simd (res);
+  return res;
 }
 
 uint16x1_t
 test_vduph_lane_u16 (uint16x8_t a)
 {
-  return vduph_lane_u16 (a, 2);
+  uint16x1_t res;
+  force_simd (a);
+  res = vduph_laneq_u16 (a, 2);
+  force_simd (res);
+  return res;
 }
 
 /* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */
@@ -226,13 +242,21 @@ test_vduph_lane_u16 (int16x8_t a)
 int32x1_t
 test_vdups_lane_s32 (int32x4_t a)
 {
-  return vdups_lane_s32 (a, 2);
+  int32x1_t res;
+  force_simd (a);
+  res = vdups_laneq_s32 (a, 2);
+  force_simd (res);
+  return res;
 }
 
 uint32x1_t
 test_vdups_lane_u32 (uint32x4_t a)
 {
-  return vdups_lane_u32 (a, 2);
+  uint32x1_t res;
+  force_simd (a);
+  res = vdups_laneq_u32 (a, 2);
+  force_simd (res);
+  return res;
 }
 
 /* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */
@@ -240,13 +264,21 @@ test_vdups_lane_u32 (uint32x4_t a)
 int64x1_t
 test_vdupd_lane_s64 (int64x2_t a)
 {
-  return vdupd_lane_s64 (a, 1);
+  int64x1_t res;
+  force_simd (a);
+  res = vdupd_laneq_s64 (a, 1);
+  force_simd (res);
+  return res;
 }
 
 uint64x1_t
 test_vdupd_lane_u64 (uint64x2_t a)
 {
-  return vdupd_lane_u64 (a, 1);
+  uint64x1_t res;
+  force_simd (a);
+  res = vdupd_laneq_u64 (a, 1);
+  force_simd (res);
+  return res;
 }
 
 /* { dg-final { scan-assembler-times "\\tcmtst\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 2 } } */
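The force_simd helper used above pins a value into a SIMD register so the scan-assembler patterns test the vector forms of the instructions; a sketch of how such a macro is typically defined (an assumption here, since its actual definition sits earlier in scalar_intrinsics.c):

/* Launder the value through a volatile asm that only accepts SIMD
   ("w") registers, so the compiler cannot keep it in a GP register.  */
#define force_simd(v)				\
  __asm__ volatile ("mov %d0, %1.d[0]"		\
		    : "=w" (v)			\
		    : "w" (v)			\
		    : /* No clobbers.  */)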