From bd7a34ef5564f4240c3839c89d7e695c9ef4e49d Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 18 Jan 2019 14:09:24 -0800
Subject: [PATCH] AVX512FP16: Add vaddph/vsubph/vdivph/vmulph.

gcc/ChangeLog:

	* config.gcc: Add avx512fp16vlintrin.h.
	* config/i386/avx512fp16intrin.h: (_mm512_add_ph): New intrinsic.
	(_mm512_mask_add_ph): Likewise.
	(_mm512_maskz_add_ph): Likewise.
	(_mm512_sub_ph): Likewise.
	(_mm512_mask_sub_ph): Likewise.
	(_mm512_maskz_sub_ph): Likewise.
	(_mm512_mul_ph): Likewise.
	(_mm512_mask_mul_ph): Likewise.
	(_mm512_maskz_mul_ph): Likewise.
	(_mm512_div_ph): Likewise.
	(_mm512_mask_div_ph): Likewise.
	(_mm512_maskz_div_ph): Likewise.
	(_mm512_add_round_ph): Likewise.
	(_mm512_mask_add_round_ph): Likewise.
	(_mm512_maskz_add_round_ph): Likewise.
	(_mm512_sub_round_ph): Likewise.
	(_mm512_mask_sub_round_ph): Likewise.
	(_mm512_maskz_sub_round_ph): Likewise.
	(_mm512_mul_round_ph): Likewise.
	(_mm512_mask_mul_round_ph): Likewise.
	(_mm512_maskz_mul_round_ph): Likewise.
	(_mm512_div_round_ph): Likewise.
	(_mm512_mask_div_round_ph): Likewise.
	(_mm512_maskz_div_round_ph): Likewise.
	* config/i386/avx512fp16vlintrin.h: New header.
	* config/i386/i386-builtin-types.def (V16HF, V8HF, V32HF):
	Add new builtin types.
	* config/i386/i386-builtin.def: Add corresponding builtins.
	* config/i386/i386-expand.c
	(ix86_expand_args_builtin): Handle new builtin types.
	(ix86_expand_round_builtin): Likewise.
	* config/i386/immintrin.h: Include avx512fp16vlintrin.h
	* config/i386/sse.md (VFH): New mode_iterator.
	(VF2H): Likewise.
	(avx512fmaskmode): Add HF vector modes.
	(avx512fmaskhalfmode): Likewise.
	(<plusminus_insn><mode>3<mask_name><round_name>): Adjust to for
	HF vector modes.
	(*<plusminus_insn><mode>3<mask_name><round_name>): Likewise.
	(mul<mode>3<mask_name><round_name>): Likewise.
	(*mul<mode>3<mask_name><round_name>): Likewise.
	(div<mode>3): Likewise.
	(<sse>_div<mode>3<mask_name><round_name>): Likewise.
	* config/i386/subst.md (SUBST_V): Add HF vector modes.
	(SUBST_A): Likewise.
	(round_mode512bit_condition): Adjust for V32HFmode.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx-1.c: Add -mavx512vl and test for new intrinsics.
	* gcc.target/i386/avx-2.c: Add -mavx512vl.
	* gcc.target/i386/avx512fp16-11a.c: New test.
	* gcc.target/i386/avx512fp16-11b.c: Ditto.
	* gcc.target/i386/avx512vlfp16-11a.c: Ditto.
	* gcc.target/i386/avx512vlfp16-11b.c: Ditto.
	* gcc.target/i386/sse-13.c: Add test for new builtins.
	* gcc.target/i386/sse-23.c: Ditto.
	* gcc.target/i386/sse-14.c: Add test for new intrinsics.
	* gcc.target/i386/sse-22.c: Ditto.
---
 gcc/config.gcc                                |   2 +-
 gcc/config/i386/avx512fp16intrin.h            | 251 ++++++++++++++++++
 gcc/config/i386/avx512fp16vlintrin.h          | 219 +++++++++++++++
 gcc/config/i386/i386-builtin-types.def        |   7 +
 gcc/config/i386/i386-builtin.def              |  20 ++
 gcc/config/i386/i386-expand.c                 |   5 +
 gcc/config/i386/immintrin.h                   |   2 +
 gcc/config/i386/sse.md                        |  61 +++--
 gcc/config/i386/subst.md                      |   6 +-
 gcc/testsuite/gcc.target/i386/avx-1.c         |   8 +-
 gcc/testsuite/gcc.target/i386/avx-2.c         |   2 +-
 .../gcc.target/i386/avx512fp16-11a.c          |  36 +++
 .../gcc.target/i386/avx512fp16-11b.c          |  75 ++++++
 .../gcc.target/i386/avx512vlfp16-11a.c        |  68 +++++
 .../gcc.target/i386/avx512vlfp16-11b.c        |  96 +++++++
 gcc/testsuite/gcc.target/i386/sse-13.c        |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c        |  14 +
 gcc/testsuite/gcc.target/i386/sse-22.c        |  14 +
 gcc/testsuite/gcc.target/i386/sse-23.c        |   6 +
 19 files changed, 871 insertions(+), 27 deletions(-)
 create mode 100644 gcc/config/i386/avx512fp16vlintrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 710f9ce5717..ccf41f66e42 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -416,7 +416,7 @@ i[34567]86-*-* | x86_64-*-*)
 		       tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
 		       amxbf16intrin.h x86gprintrin.h uintrintrin.h
 		       hresetintrin.h keylockerintrin.h avxvnniintrin.h
-		       mwaitintrin.h avx512fp16intrin.h"
+		       mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h"
 	;;
 ia64-*-*)
 	extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index 3fc0770986e..3e9d676dc39 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -217,6 +217,257 @@ _mm_store_sh (void *__P, __m128h __A)
   *(_Float16 *) __P = ((__v8hf)__A)[0];
 }
 
+/* Intrinsics v[add,sub,mul,div]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A + (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vaddph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A - (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vsubph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A * (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vmulph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A / (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vdivph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+#else
+#define _mm512_add_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_add_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_add_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+
+#define _mm512_sub_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_sub_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_sub_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+
+#define _mm512_mul_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_mul_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_mul_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+
+#define _mm512_div_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_div_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_div_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+#endif  /* __OPTIMIZE__  */
+
 #ifdef __DISABLE_AVX512FP16__
 #undef __DISABLE_AVX512FP16__
 #pragma GCC pop_options
diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
new file mode 100644
index 00000000000..75fa9eb29e7
--- /dev/null
+++ b/gcc/config/i386/avx512fp16vlintrin.h
@@ -0,0 +1,219 @@
+/* Copyright (C) 2019 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16VLINTRIN_H_INCLUDED
+#define __AVX512FP16VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512FP16__)
+#pragma GCC push_options
+#pragma GCC target("avx512fp16,avx512vl")
+#define __DISABLE_AVX512FP16VL__
+#endif /* __AVX512FP16VL__ */
+
+/* Intrinsics v[add,sub,mul,div]ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A + (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A + (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vaddph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A - (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A - (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vsubph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A * (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A * (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vmulph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A / (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A / (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vdivph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+#ifdef __DISABLE_AVX512FP16VL__
+#undef __DISABLE_AVX512FP16VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512FP16VL__ */
+
+#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 4df6ee1009d..fdc46bd20b0 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -98,6 +98,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
 # AVX vectors
 DEF_VECTOR_TYPE (V4DF, DOUBLE)
 DEF_VECTOR_TYPE (V8SF, FLOAT)
+DEF_VECTOR_TYPE (V16HF, FLOAT16)
 DEF_VECTOR_TYPE (V4DI, DI)
 DEF_VECTOR_TYPE (V8SI, SI)
 DEF_VECTOR_TYPE (V16HI, HI)
@@ -108,6 +109,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
 
 # AVX512F vectors
 DEF_VECTOR_TYPE (V32SF, FLOAT)
+DEF_VECTOR_TYPE (V32HF, FLOAT16)
 DEF_VECTOR_TYPE (V16SF, FLOAT)
 DEF_VECTOR_TYPE (V8DF, DOUBLE)
 DEF_VECTOR_TYPE (V8DI, DI)
@@ -1302,3 +1304,8 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID)
 
 # FP16 builtins
 DEF_FUNCTION_TYPE (V8HF, V8HI)
+DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI)
+DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 4b1ae0eb84c..2f1520968e2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2774,6 +2774,20 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builti
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
 
+/* AVX512FP16.  */
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+
 /* Builtins with rounding support.  */
 BDESC_END (ARGS, ROUND_ARGS)
 
@@ -2973,6 +2987,12 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fixuns_truncv8dfv8di2_mask_round, "
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "__builtin_ia32_rangeps512_mask", IX86_BUILTIN_RANGEPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT)
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT)
 
+/* AVX512FP16.  */
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+
 BDESC_END (ROUND_ARGS, MULTI_ARG)
 
 /* FMA4 and XOP.  */
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index badbacc19d8..ad9c672919a 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -10038,6 +10038,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
+    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
@@ -10055,6 +10056,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
+    case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
@@ -10062,6 +10064,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
+    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
@@ -10738,6 +10741,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
     case INT_FTYPE_V4SF_INT:
       nargs = 2;
       break;
+    case V32HF_FTYPE_V32HF_V32HF_INT:
     case V4SF_FTYPE_V4SF_UINT_INT:
     case V4SF_FTYPE_V4SF_UINT64_INT:
     case V2DF_FTYPE_V2DF_UINT64_INT:
@@ -10778,6 +10782,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
+    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
     case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index 2421a78637b..1761c75dd65 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -96,6 +96,8 @@
 
 #ifdef __SSE2__
 #include <avx512fp16intrin.h>
+
+#include <avx512fp16vlintrin.h>
 #endif
 
 #include <shaintrin.h>
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9c67750091f..06339163bc5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,6 +298,13 @@
   [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
    (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
+(define_mode_iterator VFH
+  [(V32HF "TARGET_AVX512FP16")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
 ;; 128- and 256-bit float vector modes
 (define_mode_iterator VF_128_256
   [(V8SF "TARGET_AVX") V4SF
@@ -321,6 +328,13 @@
 (define_mode_iterator VF2
   [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
 
+;; All DFmode & HFmode vector float modes
+(define_mode_iterator VF2H
+  [(V32HF "TARGET_AVX512FP16")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
+
 ;; 128- and 256-bit DF vector modes
 (define_mode_iterator VF2_128_256
   [(V4DF "TARGET_AVX") V2DF])
@@ -885,6 +899,7 @@
    (V32HI "HI") (V16HI "QI") (V8HI  "QI") (V4HI "QI")
    (V16SI "QI") (V8SI  "QI") (V4SI  "QI")
    (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
+   (V32HF "HI") (V16HF "QI") (V8HF  "QI")
    (V16SF "QI") (V8SF  "QI") (V4SF  "QI")
    (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
 
@@ -2032,18 +2047,18 @@
 })
 
 (define_expand "<insn><mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand")
-	(plusminus:VF
-	  (match_operand:VF 1 "<round_nimm_predicate>")
-	  (match_operand:VF 2 "<round_nimm_predicate>")))]
+  [(set (match_operand:VFH 0 "register_operand")
+	(plusminus:VFH
+	  (match_operand:VFH 1 "<round_nimm_predicate>")
+	  (match_operand:VFH 2 "<round_nimm_predicate>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_insn "*<insn><mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand" "=x,v")
-	(plusminus:VF
-	  (match_operand:VF 1 "<bcst_round_nimm_predicate>" "<comm>0,v")
-	  (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
+  [(set (match_operand:VFH 0 "register_operand" "=x,v")
+	(plusminus:VFH
+	  (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "<comm>0,v")
+	  (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
   "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
    && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "@
@@ -2121,18 +2136,18 @@
 })
 
 (define_expand "mul<mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand")
-	(mult:VF
-	  (match_operand:VF 1 "<round_nimm_predicate>")
-	  (match_operand:VF 2 "<round_nimm_predicate>")))]
+  [(set (match_operand:VFH 0 "register_operand")
+	(mult:VFH
+	  (match_operand:VFH 1 "<round_nimm_predicate>")
+	  (match_operand:VFH 2 "<round_nimm_predicate>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
 
 (define_insn "*mul<mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand" "=x,v")
-	(mult:VF
-	  (match_operand:VF 1 "<bcst_round_nimm_predicate>" "%0,v")
-	  (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
+  [(set (match_operand:VFH 0 "register_operand" "=x,v")
+	(mult:VFH
+	  (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "%0,v")
+	  (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
   "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands)
    && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "@
@@ -2195,9 +2210,9 @@
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_expand "div<mode>3"
-  [(set (match_operand:VF2 0 "register_operand")
-	(div:VF2 (match_operand:VF2 1 "register_operand")
-		 (match_operand:VF2 2 "vector_operand")))]
+  [(set (match_operand:VF2H 0 "register_operand")
+	(div:VF2H (match_operand:VF2H 1 "register_operand")
+		  (match_operand:VF2H 2 "vector_operand")))]
   "TARGET_SSE2")
 
 (define_expand "div<mode>3"
@@ -2236,10 +2251,10 @@
 })
 
 (define_insn "<sse>_div<mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand" "=x,v")
-	(div:VF
-	  (match_operand:VF 1 "register_operand" "0,v")
-	  (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
+  [(set (match_operand:VFH 0 "register_operand" "=x,v")
+	(div:VFH
+	  (match_operand:VFH 1 "register_operand" "0,v")
+	  (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "@
    div<ssemodesuffix>\t{%2, %0|%0, %2}
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 6614e044857..94426a5972b 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -24,6 +24,7 @@
    V32HI V16HI V8HI
    V16SI V8SI  V4SI
    V8DI  V4DI  V2DI
+   V32HF V16HF V8HF
    V16SF V8SF  V4SF
    V8DF  V4DF  V2DF])
 
@@ -35,6 +36,7 @@
    V32HI V16HI V8HI
    V16SI V8SI  V4SI
    V8DI  V4DI  V2DI
+   V32HF V16HF V8HF
    V16SF V8SF  V4SF
    V8DF  V4DF  V2DF
    QI HI SI DI SF DF])
@@ -161,7 +163,9 @@
 (define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode
 							      || <MODE>mode == V8DFmode
 							      || <MODE>mode == V8DImode
-							      || <MODE>mode == V16SImode)")
+							      || <MODE>mode == V16SImode
+							      || <MODE>mode == V32HFmode)")
+
 (define_subst_attr "round_modev8sf_condition" "round" "1" "(<MODE>mode == V8SFmode)")
 (define_subst_attr "round_modev4sf_condition" "round" "1" "(<MODE>mode == V4SFmode)")
 (define_subst_attr "round_codefor" "round" "*" "")
diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c
index f3676077743..1eaee861141 100644
--- a/gcc/testsuite/gcc.target/i386/avx-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16 -mavx512vl" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
@@ -685,6 +685,12 @@
 #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
 #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
 
+/* avx512fp16intrin.h */
+#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
+
 /* vpclmulqdqintrin.h */
 #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1) 
diff --git a/gcc/testsuite/gcc.target/i386/avx-2.c b/gcc/testsuite/gcc.target/i386/avx-2.c
index 1751c52565c..642ae4d7bfb 100644
--- a/gcc/testsuite/gcc.target/i386/avx-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16 -mavx512vl" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
new file mode 100644
index 00000000000..28492fa3f7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
@@ -0,0 +1,36 @@
+/* { dg-do compile} */
+/* { dg-options "-O2 -mavx512fp16" } */
+
+#include <immintrin.h>
+__m512h
+__attribute__ ((noinline, noclone))
+vadd512 (__m512h a, __m512h b)
+{
+  return a + b;
+}
+
+__m512h
+__attribute__ ((noinline, noclone))
+vsub512 (__m512h a, __m512h b)
+{
+  return a - b;
+}
+
+__m512h
+__attribute__ ((noinline, noclone))
+vmul512 (__m512h a, __m512h b)
+{
+  return a * b;
+}
+
+__m512h
+__attribute__ ((noinline, noclone))
+vdiv512 (__m512h a, __m512h b)
+{
+  return a / b;
+}
+
+/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
new file mode 100644
index 00000000000..5f51a5bb085
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
@@ -0,0 +1,75 @@
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O2 -mavx512fp16 -mfpmath=sse" } */
+
+#include <string.h>
+#include <stdlib.h>
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+#include "avx512fp16-11a.c"
+
+/* Get random float16 between -50.x to 50.x.  */
+_Float16
+get_float16_noround()
+{
+  return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50)
+    + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0));
+}
+
+static void
+do_test (void)
+{
+  _Float16 x[32];
+  _Float16 y[32];
+  _Float16 res_add[32];
+  _Float16 res_sub[32];
+  _Float16 res_mul[32];
+  _Float16 res_div[32];
+  for (int i = 0 ; i != 32; i++)
+    {
+      x[i] = get_float16_noround ();
+      y[i] = get_float16_noround ();
+      if (y[i] == 0)
+	y[i] = 1.0f;
+      res_add[i] = x[i] + y[i];
+      res_sub[i] = x[i] - y[i];
+      res_mul[i] = x[i] * y[i];
+      res_div[i] = x[i] / y[i];
+
+    }
+
+  union512h u512 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+      x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15],
+      x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
+      x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31] };
+  union512h u512_1 = {y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
+      y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15],
+      y[16], y[17], y[18], y[19], y[20], y[21], y[22], y[23],
+      y[24], y[25], y[26], y[27], y[28], y[29], y[30], y[31] };
+
+  __m512h v512;
+  union512h a512;
+
+  memset (&v512, -1, sizeof (v512));
+  v512 = vadd512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_add))
+    abort ();
+  memset (&v512, -1, sizeof (v512));
+  v512 = vsub512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_sub))
+    abort ();
+  memset (&v512, -1, sizeof (v512));
+  v512 = vmul512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_mul))
+    abort ();
+  memset (&v512, -1, sizeof (v512));
+  v512 = vdiv512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_div))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
new file mode 100644
index 00000000000..a8c6296f504
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
@@ -0,0 +1,68 @@
+/* { dg-do compile} */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+#include <immintrin.h>
+__m128h
+__attribute__ ((noinline, noclone))
+vadd128 (__m128h a, __m128h b)
+{
+  return a + b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vadd256 (__m256h a, __m256h b)
+{
+  return a + b;
+}
+
+__m128h
+__attribute__ ((noinline, noclone))
+vsub128 (__m128h a, __m128h b)
+{
+  return a - b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vsub256 (__m256h a, __m256h b)
+{
+  return a - b;
+}
+
+__m128h
+__attribute__ ((noinline, noclone))
+vmul128 (__m128h a, __m128h b)
+{
+  return a * b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vmul256 (__m256h a, __m256h b)
+{
+  return a * b;
+}
+
+__m128h
+__attribute__ ((noinline, noclone))
+vdiv128 (__m128h a, __m128h b)
+{
+  return a / b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vdiv256 (__m256h a, __m256h b)
+{
+  return a / b;
+}
+
+/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
new file mode 100644
index 00000000000..b8d3e8a4e96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
@@ -0,0 +1,96 @@
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+#include <string.h>
+#include <stdlib.h>
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+#include "avx512vlfp16-11a.c"
+
+/* Get random float16 between -50.x to 50.x.  */
+_Float16
+get_float16_noround()
+{
+  return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50)
+    + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0));
+}
+
+static void
+do_test (void)
+{
+  _Float16 x[16];
+  _Float16 y[16];
+  _Float16 res_add[16];
+  _Float16 res_sub[16];
+  _Float16 res_mul[16];
+  _Float16 res_div[16];
+  for (int i = 0 ; i != 16; i++)
+    {
+      x[i] = get_float16_noround ();
+      y[i] = get_float16_noround ();
+      if (y[i] == 0)
+	y[i] = 1.0f;
+      res_add[i] = x[i] + y[i];
+      res_sub[i] = x[i] - y[i];
+      res_mul[i] = x[i] * y[i];
+      res_div[i] = x[i] / y[i];
+
+    }
+
+  union128h u128 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7] };
+  union128h u128_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7] };
+  union256h u256 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+      x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] };
+  union256h u256_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
+      y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15]};
+
+  __m128h v128;
+  __m256h v256;
+  union128h a128;
+  union256h a256;
+
+  memset (&v128, -1, sizeof (v128));
+  v128 = vadd128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_add))
+    abort ();
+  memset (&v128, -1, sizeof (v128));
+  v128 = vsub128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_sub))
+    abort ();
+  memset (&v128, -1, sizeof (v128));
+  v128 = vmul128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_mul))
+    abort ();
+  memset (&v128, -1, sizeof (v128));
+  v128 = vdiv128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_div))
+    abort ();
+
+  memset (&v256, -1, sizeof (v256));
+  v256 = vadd256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_add))
+    abort ();
+  memset (&v256, -1, sizeof (v256));
+  v256 = vsub256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_sub))
+    abort ();
+  memset (&v256, -1, sizeof (v256));
+  v256 = vmul256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_mul))
+    abort ();
+  memset (&v256, -1, sizeof (v256));
+  v256 = vdiv256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_div))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
index f5f5c113612..50ed74cd6d6 100644
--- a/gcc/testsuite/gcc.target/i386/sse-13.c
+++ b/gcc/testsuite/gcc.target/i386/sse-13.c
@@ -702,6 +702,12 @@
 #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
 #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
 
+/* avx512fp16intrin.h */
+#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
+
 /* vpclmulqdqintrin.h */
 #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1) 
diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
index 747d504cedb..26a5e94c7ca 100644
--- a/gcc/testsuite/gcc.target/i386/sse-14.c
+++ b/gcc/testsuite/gcc.target/i386/sse-14.c
@@ -667,6 +667,20 @@ test_3 (_mm512_mask_rcp28_round_ps, __m512, __m512, __mmask16, __m512, 8)
 test_3 (_mm512_mask_rsqrt28_round_pd, __m512d, __m512d, __mmask8, __m512d, 8)
 test_3 (_mm512_mask_rsqrt28_round_ps, __m512, __m512, __mmask16, __m512, 8)
 
+/* avx512fp16intrin.h */
+test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+
 /* shaintrin.h */
 test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1)
 
diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
index 33411969901..8d25effd724 100644
--- a/gcc/testsuite/gcc.target/i386/sse-22.c
+++ b/gcc/testsuite/gcc.target/i386/sse-22.c
@@ -772,6 +772,20 @@ test_2 (_mm_rcp28_round_ss, __m128, __m128, __m128, 8)
 test_2 (_mm_rsqrt28_round_sd, __m128d, __m128d, __m128d, 8)
 test_2 (_mm_rsqrt28_round_ss, __m128, __m128, __m128, 8)
 
+/* avx512fp16intrin.h */
+test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+
 /* shaintrin.h */
 test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1)
 
diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
index 86590ca5ffb..f7dd5d7495c 100644
--- a/gcc/testsuite/gcc.target/i386/sse-23.c
+++ b/gcc/testsuite/gcc.target/i386/sse-23.c
@@ -703,6 +703,12 @@
 #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
 #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
 
+/* avx512fp16intrin.h */
+#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
+
 /* vpclmulqdqintrin.h */
 #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1)