[AArch64][2/10] ARMv8.2-A FP16 one operand vector intrinsics

gcc/ * config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New. * config/aarch64/aarch64-simd-builtins.def: Register new builtins. * config/aarch64/aarch64-simd.md (aarch64_rsqrte<mode>): Extend to HF modes. (neg<mode>2): Likewise. (abs<mode>2): Likewise. (<frint_pattern><mode>2): Likewise. (l<fcvt_pattern><su_optab><VDQF:mode><fcvt_target>2): Likewise. (<optab><VDQF:mode><fcvt_target>2): Likewise. (<fix_trunc_optab><VDQF:mode><fcvt_target>2): Likewise. (ftrunc<VDQF:mode>2): Likewise. (<optab><fcvt_target><VDQF:mode>2): Likewise. (sqrt<mode>2): Likewise. (*sqrt<mode>2): Likewise. (aarch64_frecpe<mode>): Likewise. (aarch64_cm<optab><mode>): Likewise. * config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Return false for V4HF and V8HF. * config/aarch64/iterators.md (VHSDF, VHSDF_DF, VHSDF_SDF): New. (VDQF_COND, fcvt_target, FCVT_TARGET, hcon): Extend mode attribute to HF modes. (stype): New. * config/aarch64/arm_neon.h (vdup_n_f16): New. (vdupq_n_f16): Likewise. (vld1_dup_f16): Use vdup_n_f16. (vld1q_dup_f16): Use vdupq_n_f16. (vabs_f16): New. (vabsq_f16, vceqz_f16, vceqzq_f16, vcgez_f16, vcgezq_f16, vcgtz_f16, vcgtzq_f16, vclez_f16, vclezq_f16, vcltz_f16, vcltzq_f16, vcvt_f16_s16, vcvtq_f16_s16, vcvt_f16_u16, vcvtq_f16_u16, vcvt_s16_f16, vcvtq_s16_f16, vcvt_u16_f16, vcvtq_u16_f16, vcvta_s16_f16, vcvtaq_s16_f16, vcvta_u16_f16, vcvtaq_u16_f16, vcvtm_s16_f16, vcvtmq_s16_f16, vcvtm_u16_f16, vcvtmq_u16_f16, vcvtn_s16_f16, vcvtnq_s16_f16, vcvtn_u16_f16, vcvtnq_u16_f16, vcvtp_s16_f16, vcvtpq_s16_f16, vcvtp_u16_f16, vcvtpq_u16_f16, vneg_f16, vnegq_f16, vrecpe_f16, vrecpeq_f16, vrnd_f16, vrndq_f16, vrnda_f16, vrndaq_f16, vrndi_f16, vrndiq_f16, vrndm_f16, vrndmq_f16, vrndn_f16, vrndnq_f16, vrndp_f16, vrndpq_f16, vrndx_f16, vrndxq_f16, vrsqrte_f16, vrsqrteq_f16, vsqrt_f16, vsqrtq_f16): Likewise. From-SVN: r238716
2016-07-25 14:20:37 +00:00 · 2016-07-25 14:20:37 +00:00 · daef0a8c7e
parent 358decd5bb
commit daef0a8c7e
7 changed files with 523 additions and 59 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,43 @@
+2016-07-25  Jiong Wang  <jiong.wang@arm.com>
+
+	* config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New.
+	* config/aarch64/aarch64-simd-builtins.def: Register new builtins.
+	* config/aarch64/aarch64-simd.md (aarch64_rsqrte<mode>): Extend to HF modes.
+	(neg<mode>2): Likewise.
+	(abs<mode>2): Likewise.
+	(<frint_pattern><mode>2): Likewise.
+	(l<fcvt_pattern><su_optab><VDQF:mode><fcvt_target>2): Likewise.
+	(<optab><VDQF:mode><fcvt_target>2): Likewise.
+	(<fix_trunc_optab><VDQF:mode><fcvt_target>2): Likewise.
+	(ftrunc<VDQF:mode>2): Likewise.
+	(<optab><fcvt_target><VDQF:mode>2): Likewise.
+	(sqrt<mode>2): Likewise.
+	(*sqrt<mode>2): Likewise.
+	(aarch64_frecpe<mode>): Likewise.
+	(aarch64_cm<optab><mode>): Likewise.
+	* config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Return
+	false for V4HF and V8HF.
+	* config/aarch64/iterators.md (VHSDF, VHSDF_DF, VHSDF_SDF): New.
+	(VDQF_COND, fcvt_target, FCVT_TARGET, hcon): Extend mode attribute to HF modes.
+	(stype): New.
+	* config/aarch64/arm_neon.h (vdup_n_f16): New.
+	(vdupq_n_f16): Likewise.
+	(vld1_dup_f16): Use vdup_n_f16.
+	(vld1q_dup_f16): Use vdupq_n_f16.
+	(vabs_f16): New.
+	(vabsq_f16, vceqz_f16, vceqzq_f16, vcgez_f16, vcgezq_f16, vcgtz_f16,
+	vcgtzq_f16, vclez_f16, vclezq_f16, vcltz_f16, vcltzq_f16, vcvt_f16_s16,
+	vcvtq_f16_s16, vcvt_f16_u16, vcvtq_f16_u16, vcvt_s16_f16, vcvtq_s16_f16,
+	vcvt_u16_f16, vcvtq_u16_f16, vcvta_s16_f16, vcvtaq_s16_f16,
+	vcvta_u16_f16, vcvtaq_u16_f16, vcvtm_s16_f16, vcvtmq_s16_f16,
+	vcvtm_u16_f16, vcvtmq_u16_f16, vcvtn_s16_f16, vcvtnq_s16_f16,
+	vcvtn_u16_f16, vcvtnq_u16_f16, vcvtp_s16_f16, vcvtpq_s16_f16,
+	vcvtp_u16_f16, vcvtpq_u16_f16, vneg_f16, vnegq_f16, vrecpe_f16,
+	vrecpeq_f16, vrnd_f16, vrndq_f16, vrnda_f16, vrndaq_f16, vrndi_f16,
+	vrndiq_f16, vrndm_f16, vrndmq_f16, vrndn_f16, vrndnq_f16, vrndp_f16,
+	vrndpq_f16, vrndx_f16, vrndxq_f16, vrsqrte_f16, vrsqrteq_f16, vsqrt_f16,
+	vsqrtq_f16): Likewise.
+
 2016-07-25  Jiong Wang  <jiong.wang@arm.com>

 	* config/aarch64/aarch64-simd.md
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@ -139,6 +139,10 @@ aarch64_types_binop_ssu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
  = { qualifier_none, qualifier_none, qualifier_unsigned };
 #define TYPES_BINOP_SSU (aarch64_types_binop_ssu_qualifiers)
 static enum aarch64_type_qualifiers
+aarch64_types_binop_uss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_unsigned, qualifier_none, qualifier_none };
+#define TYPES_BINOP_USS (aarch64_types_binop_uss_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS]
  = { qualifier_poly, qualifier_poly, qualifier_poly };
 #define TYPES_BINOPP (aarch64_types_binopp_qualifiers)
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@ -42,7 +42,7 @@
  BUILTIN_VDC (COMBINE, combine, 0)
  BUILTIN_VB (BINOP, pmul, 0)
  BUILTIN_VALLF (BINOP, fmulx, 0)
-  BUILTIN_VDQF_DF (UNOP, sqrt, 2)
+  BUILTIN_VHSDF_DF (UNOP, sqrt, 2)
  BUILTIN_VD_BHSI (BINOP, addp, 0)
  VAR1 (UNOP, addp, 0, di)
  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
@ -266,23 +266,29 @@
  BUILTIN_VDQF (BINOP, smin_nanp, 0)

  /* Implemented by <frint_pattern><mode>2.  */
-  BUILTIN_VDQF (UNOP, btrunc, 2)
-  BUILTIN_VDQF (UNOP, ceil, 2)
-  BUILTIN_VDQF (UNOP, floor, 2)
-  BUILTIN_VDQF (UNOP, nearbyint, 2)
-  BUILTIN_VDQF (UNOP, rint, 2)
-  BUILTIN_VDQF (UNOP, round, 2)
-  BUILTIN_VDQF_DF (UNOP, frintn, 2)
+  BUILTIN_VHSDF (UNOP, btrunc, 2)
+  BUILTIN_VHSDF (UNOP, ceil, 2)
+  BUILTIN_VHSDF (UNOP, floor, 2)
+  BUILTIN_VHSDF (UNOP, nearbyint, 2)
+  BUILTIN_VHSDF (UNOP, rint, 2)
+  BUILTIN_VHSDF (UNOP, round, 2)
+  BUILTIN_VHSDF_DF (UNOP, frintn, 2)

  /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
+  VAR1 (UNOP, lbtruncv4hf, 2, v4hi)
+  VAR1 (UNOP, lbtruncv8hf, 2, v8hi)
  VAR1 (UNOP, lbtruncv2sf, 2, v2si)
  VAR1 (UNOP, lbtruncv4sf, 2, v4si)
  VAR1 (UNOP, lbtruncv2df, 2, v2di)

+  VAR1 (UNOPUS, lbtruncuv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lbtruncuv8hf, 2, v8hi)
  VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si)
  VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si)
  VAR1 (UNOPUS, lbtruncuv2df, 2, v2di)

+  VAR1 (UNOP, lroundv4hf, 2, v4hi)
+  VAR1 (UNOP, lroundv8hf, 2, v8hi)
  VAR1 (UNOP, lroundv2sf, 2, v2si)
  VAR1 (UNOP, lroundv4sf, 2, v4si)
  VAR1 (UNOP, lroundv2df, 2, v2di)
@ -290,38 +296,52 @@
  VAR1 (UNOP, lroundsf, 2, si)
  VAR1 (UNOP, lrounddf, 2, di)

+  VAR1 (UNOPUS, lrounduv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lrounduv8hf, 2, v8hi)
  VAR1 (UNOPUS, lrounduv2sf, 2, v2si)
  VAR1 (UNOPUS, lrounduv4sf, 2, v4si)
  VAR1 (UNOPUS, lrounduv2df, 2, v2di)
  VAR1 (UNOPUS, lroundusf, 2, si)
  VAR1 (UNOPUS, lroundudf, 2, di)

+  VAR1 (UNOP, lceilv4hf, 2, v4hi)
+  VAR1 (UNOP, lceilv8hf, 2, v8hi)
  VAR1 (UNOP, lceilv2sf, 2, v2si)
  VAR1 (UNOP, lceilv4sf, 2, v4si)
  VAR1 (UNOP, lceilv2df, 2, v2di)

+  VAR1 (UNOPUS, lceiluv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lceiluv8hf, 2, v8hi)
  VAR1 (UNOPUS, lceiluv2sf, 2, v2si)
  VAR1 (UNOPUS, lceiluv4sf, 2, v4si)
  VAR1 (UNOPUS, lceiluv2df, 2, v2di)
  VAR1 (UNOPUS, lceilusf, 2, si)
  VAR1 (UNOPUS, lceiludf, 2, di)

+  VAR1 (UNOP, lfloorv4hf, 2, v4hi)
+  VAR1 (UNOP, lfloorv8hf, 2, v8hi)
  VAR1 (UNOP, lfloorv2sf, 2, v2si)
  VAR1 (UNOP, lfloorv4sf, 2, v4si)
  VAR1 (UNOP, lfloorv2df, 2, v2di)

+  VAR1 (UNOPUS, lflooruv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lflooruv8hf, 2, v8hi)
  VAR1 (UNOPUS, lflooruv2sf, 2, v2si)
  VAR1 (UNOPUS, lflooruv4sf, 2, v4si)
  VAR1 (UNOPUS, lflooruv2df, 2, v2di)
  VAR1 (UNOPUS, lfloorusf, 2, si)
  VAR1 (UNOPUS, lfloorudf, 2, di)

+  VAR1 (UNOP, lfrintnv4hf, 2, v4hi)
+  VAR1 (UNOP, lfrintnv8hf, 2, v8hi)
  VAR1 (UNOP, lfrintnv2sf, 2, v2si)
  VAR1 (UNOP, lfrintnv4sf, 2, v4si)
  VAR1 (UNOP, lfrintnv2df, 2, v2di)
  VAR1 (UNOP, lfrintnsf, 2, si)
  VAR1 (UNOP, lfrintndf, 2, di)

+  VAR1 (UNOPUS, lfrintnuv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lfrintnuv8hf, 2, v8hi)
  VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si)
  VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si)
  VAR1 (UNOPUS, lfrintnuv2df, 2, v2di)
@ -329,10 +349,14 @@
  VAR1 (UNOPUS, lfrintnudf, 2, di)

  /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
+  VAR1 (UNOP, floatv4hi, 2, v4hf)
+  VAR1 (UNOP, floatv8hi, 2, v8hf)
  VAR1 (UNOP, floatv2si, 2, v2sf)
  VAR1 (UNOP, floatv4si, 2, v4sf)
  VAR1 (UNOP, floatv2di, 2, v2df)

+  VAR1 (UNOP, floatunsv4hi, 2, v4hf)
+  VAR1 (UNOP, floatunsv8hi, 2, v8hf)
  VAR1 (UNOP, floatunsv2si, 2, v2sf)
  VAR1 (UNOP, floatunsv4si, 2, v4sf)
  VAR1 (UNOP, floatunsv2di, 2, v2df)
@ -358,13 +382,13 @@

  BUILTIN_VDQ_SI (UNOP, urecpe, 0)

-  BUILTIN_VDQF (UNOP, frecpe, 0)
+  BUILTIN_VHSDF (UNOP, frecpe, 0)
  BUILTIN_VDQF (BINOP, frecps, 0)

  /* Implemented by a mixture of abs2 patterns.  Note the DImode builtin is
     only ever used for the int64x1_t intrinsic, there is no scalar version.  */
  BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
-  BUILTIN_VDQF (UNOP, abs, 2)
+  BUILTIN_VHSDF (UNOP, abs, 2)

  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
  VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
@ -457,7 +481,7 @@
  BUILTIN_VALLF (SHIFTIMM_USS, fcvtzu, 3)

  /* Implemented by aarch64_rsqrte<mode>.  */
-  BUILTIN_VALLF (UNOP, rsqrte, 0)
+  BUILTIN_VHSDF_SDF (UNOP, rsqrte, 0)

  /* Implemented by aarch64_rsqrts<mode>.  */
  BUILTIN_VALLF (BINOP, rsqrts, 0)
@ -467,3 +491,13 @@

  /* Implemented by aarch64_faddp<mode>.  */
  BUILTIN_VDQF (BINOP, faddp, 0)
+
+  /* Implemented by aarch64_cm<optab><mode>.  */
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmeq, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmge, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmgt, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmle, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmlt, 0)
+
+  /* Implemented by neg<mode>2.  */
+  BUILTIN_VHSDF (UNOP, neg, 2)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@ -383,12 +383,12 @@
 )

 (define_insn "aarch64_rsqrte<mode>"
-  [(set (match_operand:VALLF 0 "register_operand" "=w")
-	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+  [(set (match_operand:VHSDF_SDF 0 "register_operand" "=w")
+	(unspec:VHSDF_SDF [(match_operand:VHSDF_SDF 1 "register_operand" "w")]
 		     UNSPEC_RSQRTE))]
  "TARGET_SIMD"
  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
-  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+  [(set_attr "type" "neon_fp_rsqrte_<stype><q>")])

 (define_insn "aarch64_rsqrts<mode>"
  [(set (match_operand:VALLF 0 "register_operand" "=w")
@ -1565,19 +1565,19 @@
 )

 (define_insn "neg<mode>2"
- [(set (match_operand:VDQF 0 "register_operand" "=w")
-       (neg:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
+       (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
 "TARGET_SIMD"
 "fneg\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_neg_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_neg_<stype><q>")]
 )

 (define_insn "abs<mode>2"
- [(set (match_operand:VDQF 0 "register_operand" "=w")
-       (abs:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
+       (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
 "TARGET_SIMD"
 "fabs\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_abs_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_abs_<stype><q>")]
 )

 (define_insn "fma<mode>4"
@ -1735,24 +1735,24 @@
 ;; Vector versions of the floating-point frint patterns.
 ;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn.
 (define_insn "<frint_pattern><mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
-		      FRINT))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+		       FRINT))]
  "TARGET_SIMD"
  "frint<frint_suffix>\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_round_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_round_<stype><q>")]
 )

 ;; Vector versions of the fcvt standard patterns.
 ;; Expands to lbtrunc, lround, lceil, lfloor
-(define_insn "l<fcvt_pattern><su_optab><VDQF:mode><fcvt_target>2"
+(define_insn "l<fcvt_pattern><su_optab><VHSDF:mode><fcvt_target>2"
  [(set (match_operand:<FCVT_TARGET> 0 "register_operand" "=w")
 	(FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-			       [(match_operand:VDQF 1 "register_operand" "w")]
+			       [(match_operand:VHSDF 1 "register_operand" "w")]
 			       FCVT)))]
  "TARGET_SIMD"
  "fcvt<frint_suffix><su>\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_to_int_<stype><q>")]
 )

 (define_insn "*aarch64_fcvt<su_optab><VDQF:mode><fcvt_target>2_mult"
@ -1775,36 +1775,36 @@
  [(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
 )

-(define_expand "<optab><VDQF:mode><fcvt_target>2"
+(define_expand "<optab><VHSDF:mode><fcvt_target>2"
  [(set (match_operand:<FCVT_TARGET> 0 "register_operand")
 	(FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-			       [(match_operand:VDQF 1 "register_operand")]
-			       UNSPEC_FRINTZ)))]
+			       [(match_operand:VHSDF 1 "register_operand")]
+				UNSPEC_FRINTZ)))]
  "TARGET_SIMD"
  {})

-(define_expand "<fix_trunc_optab><VDQF:mode><fcvt_target>2"
+(define_expand "<fix_trunc_optab><VHSDF:mode><fcvt_target>2"
  [(set (match_operand:<FCVT_TARGET> 0 "register_operand")
 	(FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-			       [(match_operand:VDQF 1 "register_operand")]
-			       UNSPEC_FRINTZ)))]
+			       [(match_operand:VHSDF 1 "register_operand")]
+				UNSPEC_FRINTZ)))]
  "TARGET_SIMD"
  {})

-(define_expand "ftrunc<VDQF:mode>2"
-  [(set (match_operand:VDQF 0 "register_operand")
-	(unspec:VDQF [(match_operand:VDQF 1 "register_operand")]
-		      UNSPEC_FRINTZ))]
+(define_expand "ftrunc<VHSDF:mode>2"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")]
+		       UNSPEC_FRINTZ))]
  "TARGET_SIMD"
  {})

-(define_insn "<optab><fcvt_target><VDQF:mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(FLOATUORS:VDQF
+(define_insn "<optab><fcvt_target><VHSDF:mode>2"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(FLOATUORS:VHSDF
 	  (match_operand:<FCVT_TARGET> 1 "register_operand" "w")))]
  "TARGET_SIMD"
  "<su_optab>cvtf\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_int_to_fp_<Vetype><q>")]
+  [(set_attr "type" "neon_int_to_fp_<stype><q>")]
 )

 ;; Conversions between vectors of floats and doubles.
@ -4296,14 +4296,14 @@
  [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w")
 	(neg:<V_cmp_result>
 	  (COMPARISONS:<V_cmp_result>
-	    (match_operand:VALLF 1 "register_operand" "w,w")
-	    (match_operand:VALLF 2 "aarch64_simd_reg_or_zero" "w,YDz")
+	    (match_operand:VHSDF_SDF 1 "register_operand" "w,w")
+	    (match_operand:VHSDF_SDF 2 "aarch64_simd_reg_or_zero" "w,YDz")
 	  )))]
  "TARGET_SIMD"
  "@
  fcm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>
  fcm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, 0"
-  [(set_attr "type" "neon_fp_compare_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_compare_<stype><q>")]
 )

 ;; fac(ge|gt)
@ -4348,8 +4348,8 @@
 ;; sqrt

 (define_expand "sqrt<mode>2"
-  [(set (match_operand:VDQF 0 "register_operand")
-	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
  "TARGET_SIMD"
 {
  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
@ -4357,11 +4357,11 @@
 })

 (define_insn "*sqrt<mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-        (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
  "TARGET_SIMD"
  "fsqrt\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_sqrt_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_sqrt_<stype><q>")]
 )

 ;; Patterns for vector struct loads and stores.
@ -5413,12 +5413,12 @@
 )

 (define_insn "aarch64_frecpe<mode>"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
-		    UNSPEC_FRECPE))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+	 UNSPEC_FRECPE))]
  "TARGET_SIMD"
  "frecpe\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_recpe_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_recpe_<stype><q>")]
 )

 (define_insn "aarch64_frecp<FRECP:frecp_suffix><mode>"
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@ -7485,6 +7485,10 @@ bool
 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
  machine_mode mode = GET_MODE (dst);
+
+  if (GET_MODE_INNER (mode) == HFmode)
+    return false;
+
  machine_mode mmsk = mode_for_vector
 		        (int_mode_for_mode (GET_MODE_INNER (mode)),
 			 GET_MODE_NUNITS (mode));
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@ -26028,6 +26028,365 @@ __INTERLEAVE_LIST (zip)

 /* End of optimal implementations in approved order.  */

+#pragma GCC pop_options
+
+/* ARMv8.2-A FP16 intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+/* ARMv8.2-A FP16 one operand vector intrinsics.  */
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vabs_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_absv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vabsq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_absv8hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceqz_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmeqv4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqzq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmeqv8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgez_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmgev4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgezq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmgev8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgtz_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmgtv4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtzq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmgtv8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclez_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmlev4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclezq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmlev8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcltz_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmltv4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltzq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmltv8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_f16_s16 (int16x4_t __a)
+{
+  return __builtin_aarch64_floatv4hiv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_f16_s16 (int16x8_t __a)
+{
+  return __builtin_aarch64_floatv8hiv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_f16_u16 (uint16x4_t __a)
+{
+  return __builtin_aarch64_floatunsv4hiv4hf ((int16x4_t) __a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_f16_u16 (uint16x8_t __a)
+{
+  return __builtin_aarch64_floatunsv8hiv8hf ((int16x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvt_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lbtruncv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lbtruncv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvt_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lbtruncuv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lbtruncuv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvta_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lroundv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtaq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lroundv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvta_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lrounduv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtaq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lrounduv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtm_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lfloorv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtmq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lfloorv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtm_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lflooruv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtmq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lflooruv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtn_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lfrintnv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtnq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lfrintnv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtn_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lfrintnuv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtnq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lfrintnuv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtp_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lceilv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtpq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lceilv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtp_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lceiluv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtpq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lceiluv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vneg_f16 (float16x4_t __a)
+{
+  return -__a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vnegq_f16 (float16x8_t __a)
+{
+  return -__a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrecpe_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_frecpev4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrecpeq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_frecpev8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrnd_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_btruncv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_btruncv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrnda_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_roundv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndaq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_roundv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndi_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_nearbyintv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndiq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_nearbyintv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndm_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_floorv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndmq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_floorv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndn_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_frintnv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndnq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_frintnv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndp_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_ceilv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndpq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_ceilv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndx_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_rintv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndxq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_rintv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrsqrte_f16 (float16x4_t a)
+{
+  return __builtin_aarch64_rsqrtev4hf (a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrsqrteq_f16 (float16x8_t a)
+{
+  return __builtin_aarch64_rsqrtev8hf (a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vsqrt_f16 (float16x4_t a)
+{
+  return __builtin_aarch64_sqrtv4hf (a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vsqrtq_f16 (float16x8_t a)
+{
+  return __builtin_aarch64_sqrtv8hf (a);
+}
+
+#pragma GCC pop_options
+
 #undef __aarch64_vget_lane_any

 #undef __aarch64_vdup_lane_any
@ -26084,6 +26443,4 @@ __INTERLEAVE_LIST (zip)
 #undef __aarch64_vdupq_laneq_u32
 #undef __aarch64_vdupq_laneq_u64

-#pragma GCC pop_options
-
 #endif
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@ -88,11 +88,20 @@
 ;; Vector Float modes suitable for moving, loading and storing.
 (define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF])

-;; Vector Float modes, barring HF modes.
+;; Vector Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
+(define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
+			     (V8HF "TARGET_SIMD_F16INST")
+			     V2SF V4SF V2DF])

 ;; Vector Float modes, and DF.
 (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
+(define_mode_iterator VHSDF_DF [(V4HF "TARGET_SIMD_F16INST")
+				(V8HF "TARGET_SIMD_F16INST")
+				V2SF V4SF V2DF DF])
+(define_mode_iterator VHSDF_SDF [(V4HF "TARGET_SIMD_F16INST")
+				 (V8HF "TARGET_SIMD_F16INST")
+				 V2SF V4SF V2DF SF DF])

 ;; Vector single Float modes.
 (define_mode_iterator VDQSF [V2SF V4SF])
@ -366,7 +375,8 @@
 		    (V4HI "") (V8HI "")
 		    (V2SI "") (V4SI  "")
 		    (V2DI "") (V2SF "")
-		    (V4SF "") (V2DF "")])
+		    (V4SF "") (V4HF "")
+		    (V8HF "") (V2DF "")])

 ;; For scalar usage of vector/FP registers, narrowing
 (define_mode_attr vn2 [(QI "") (HI "b") (SI "h") (DI "s")
@ -447,6 +457,16 @@
 			  (QI "b")   (HI "h")
 			  (SI "s")   (DI "d")])

+;; Vetype is used everywhere in scheduling type and assembly output,
+;; sometimes they are not the same, for example HF modes on some
+;; instructions.  stype is defined to represent scheduling type
+;; more accurately.
+(define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
+			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
+			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
+			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
+			 (SI "s") (DI "d")])
+
 ;; Mode-to-bitwise operation type mapping.
 (define_mode_attr Vbtype [(V8QI "8b")  (V16QI "16b")
 			  (V4HI "8b") (V8HI  "16b")
@ -656,10 +676,14 @@

 (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
 			       (V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf")
-			       (SF "si") (DF "di") (SI "sf") (DI "df")])
+			       (SF "si") (DF "di") (SI "sf") (DI "df")
+			       (V4HF "v4hi") (V8HF "v8hi") (V4HI "v4hf")
+			       (V8HI "v8hf")])
 (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
 			       (V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF")
-			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")])
+			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")
+			       (V4HF "V4HI") (V8HF "V8HI") (V4HI "V4HF")
+			       (V8HI "V8HF")])


 ;; for the inequal width integer to fp conversions
@ -687,6 +711,7 @@
 ;; the 'x' constraint.  All other modes may use the 'w' constraint.
 (define_mode_attr h_con [(V2SI "w") (V4SI "w")
 			 (V4HI "x") (V8HI "x")
+			 (V4HF "w") (V8HF "w")
 			 (V2SF "w") (V4SF "w")
 			 (V2DF "w") (DF "w")])