diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 4c65c70e628b..d84456924563 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -567,7 +567,8 @@ static int __init camellia_aesni_init(void)
 		return -ENODEV;
 	}
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index bacaa13acac5..93d8f295784e 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -559,7 +559,8 @@ static int __init camellia_aesni_init(void)
 		return -ENODEV;
 	}
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index be00aa48b2b5..8648158f3916 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -469,7 +469,8 @@ static int __init cast5_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 5dbba7224221..fca459578c35 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -591,7 +591,8 @@ static int __init cast6_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index effe2160b7c5..722bacea040e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -130,7 +130,7 @@ static int __init chacha20_simd_mod_init(void)
 
 #ifdef CONFIG_AS_AVX2
 	chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
-			    cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
 #endif
 	return crypto_register_alg(&alg);
 }
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f7170d764f32..4264a3d59589 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -184,7 +184,7 @@ static int __init poly1305_simd_mod_init(void)
 
 #ifdef CONFIG_AS_AVX2
 	poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
-			    cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
 	alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
 	if (poly1305_use_avx2)
 		alg.descsize += 10 * sizeof(u32);
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 7d838dc4d888..6d198342e2de 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -542,7 +542,8 @@ static int __init init(void)
 		pr_info("AVX2 instructions are not detected.\n");
 		return -ENODEV;
 	}
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index da7dafc9b16d..5dc37026c7ce 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -597,7 +597,8 @@ static int __init serpent_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				&feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7c48e8b20848..00212c32d4db 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -121,7 +121,7 @@ static struct shash_alg alg = {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
 		if (cpu_has_avx)
 			pr_info("AVX detected but unusable.\n");
 		return false;
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index f8097fc0d1d1..0e0e85aea634 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -130,7 +130,7 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
 		if (cpu_has_avx)
 			pr_info("AVX detected but unusable.\n");
 		return false;
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 2edad7b81870..0c8c38c101ac 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -129,7 +129,7 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
 		if (cpu_has_avx)
 			pr_info("AVX detected but unusable.\n");
 		return false;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index c2bd0ce718ee..b7a3904b953c 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -558,7 +558,7 @@ static int __init twofish_init(void)
 {
 	const char *feature_name;
 
-	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) {
 		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index c49c5173158e..1c6f6ac52ad0 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -95,63 +95,122 @@ struct swregs_state {
 /*
  * List of XSAVE features Linux knows about:
  */
-enum xfeature_bit {
-	XSTATE_BIT_FP,
-	XSTATE_BIT_SSE,
-	XSTATE_BIT_YMM,
-	XSTATE_BIT_BNDREGS,
-	XSTATE_BIT_BNDCSR,
-	XSTATE_BIT_OPMASK,
-	XSTATE_BIT_ZMM_Hi256,
-	XSTATE_BIT_Hi16_ZMM,
+enum xfeature {
+	XFEATURE_FP,
+	XFEATURE_SSE,
+	/*
+	 * Values above here are "legacy states".
+	 * Those below are "extended states".
+	 */
+	XFEATURE_YMM,
+	XFEATURE_BNDREGS,
+	XFEATURE_BNDCSR,
+	XFEATURE_OPMASK,
+	XFEATURE_ZMM_Hi256,
+	XFEATURE_Hi16_ZMM,
 
-	XFEATURES_NR_MAX,
+	XFEATURE_MAX,
 };
 
-#define XSTATE_FP		(1 << XSTATE_BIT_FP)
-#define XSTATE_SSE		(1 << XSTATE_BIT_SSE)
-#define XSTATE_YMM		(1 << XSTATE_BIT_YMM)
-#define XSTATE_BNDREGS		(1 << XSTATE_BIT_BNDREGS)
-#define XSTATE_BNDCSR		(1 << XSTATE_BIT_BNDCSR)
-#define XSTATE_OPMASK		(1 << XSTATE_BIT_OPMASK)
-#define XSTATE_ZMM_Hi256	(1 << XSTATE_BIT_ZMM_Hi256)
-#define XSTATE_Hi16_ZMM		(1 << XSTATE_BIT_Hi16_ZMM)
+#define XFEATURE_MASK_FP		(1 << XFEATURE_FP)
+#define XFEATURE_MASK_SSE		(1 << XFEATURE_SSE)
+#define XFEATURE_MASK_YMM		(1 << XFEATURE_YMM)
+#define XFEATURE_MASK_BNDREGS		(1 << XFEATURE_BNDREGS)
+#define XFEATURE_MASK_BNDCSR		(1 << XFEATURE_BNDCSR)
+#define XFEATURE_MASK_OPMASK		(1 << XFEATURE_OPMASK)
+#define XFEATURE_MASK_ZMM_Hi256		(1 << XFEATURE_ZMM_Hi256)
+#define XFEATURE_MASK_Hi16_ZMM		(1 << XFEATURE_Hi16_ZMM)
 
-#define XSTATE_FPSSE		(XSTATE_FP | XSTATE_SSE)
-#define XSTATE_AVX512		(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+#define XFEATURE_MASK_FPSSE		(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
+#define XFEATURE_MASK_AVX512		(XFEATURE_MASK_OPMASK \
+					 | XFEATURE_MASK_ZMM_Hi256 \
+					 | XFEATURE_MASK_Hi16_ZMM)
+
+#define FIRST_EXTENDED_XFEATURE	XFEATURE_YMM
+
+struct reg_128_bit {
+	u8      regbytes[128/8];
+};
+struct reg_256_bit {
+	u8	regbytes[256/8];
+};
+struct reg_512_bit {
+	u8	regbytes[512/8];
+};
 
 /*
+ * State component 2:
+ *
  * There are 16x 256-bit AVX registers named YMM0-YMM15.
  * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
- * and are stored in 'struct fxregs_state::xmm_space[]'.
+ * and are stored in 'struct fxregs_state::xmm_space[]' in the
+ * "legacy" area.
  *
- * The high 128 bits are stored here:
- *    16x 128 bits == 256 bytes.
+ * The high 128 bits are stored here.
  */
 struct ymmh_struct {
-	u8				ymmh_space[256];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
-	u8				reserved[128];
-};
+	struct reg_128_bit              hi_ymm[16];
+} __packed;
 
 /* Intel MPX support: */
-struct bndreg {
+
+struct mpx_bndreg {
 	u64				lower_bound;
 	u64				upper_bound;
 } __packed;
+/*
+ * State component 3 is used for the 4 128-bit bounds registers
+ */
+struct mpx_bndreg_state {
+	struct mpx_bndreg		bndreg[4];
+} __packed;
 
-struct bndcsr {
+/*
+ * State component 4 is used for the 64-bit user-mode MPX
+ * configuration register BNDCFGU and the 64-bit MPX status
+ * register BNDSTATUS.  We call the pair "BNDCSR".
+ */
+struct mpx_bndcsr {
 	u64				bndcfgu;
 	u64				bndstatus;
 } __packed;
 
-struct mpx_struct {
-	struct bndreg			bndreg[4];
-	struct bndcsr			bndcsr;
-};
+/*
+ * The BNDCSR state is padded out to be 64-bytes in size.
+ */
+struct mpx_bndcsr_state {
+	union {
+		struct mpx_bndcsr		bndcsr;
+		u8				pad_to_64_bytes[64];
+	};
+} __packed;
+
+/* AVX-512 Components: */
+
+/*
+ * State component 5 is used for the 8 64-bit opmask registers
+ * k0-k7 (opmask state).
+ */
+struct avx_512_opmask_state {
+	u64				opmask_reg[8];
+} __packed;
+
+/*
+ * State component 6 is used for the upper 256 bits of the
+ * registers ZMM0-ZMM15. These 16 256-bit values are denoted
+ * ZMM0_H-ZMM15_H (ZMM_Hi256 state).
+ */
+struct avx_512_zmm_uppers_state {
+	struct reg_256_bit		zmm_upper[16];
+} __packed;
+
+/*
+ * State component 7 is used for the 16 512-bit registers
+ * ZMM16-ZMM31 (Hi16_ZMM state).
+ */
+struct avx_512_hi16_state {
+	struct reg_512_bit		hi16_zmm[16];
+} __packed;
 
 struct xstate_header {
 	u64				xfeatures;
@@ -159,22 +218,19 @@ struct xstate_header {
 	u64				reserved[6];
 } __attribute__((packed));
 
-/* New processor state extensions should be added here: */
-#define XSTATE_RESERVE			(sizeof(struct ymmh_struct) + \
-					 sizeof(struct lwp_struct)  + \
-					 sizeof(struct mpx_struct)  )
 /*
  * This is our most modern FPU state format, as saved by the XSAVE
  * and restored by the XRSTOR instructions.
  *
  * It consists of a legacy fxregs portion, an xstate header and
- * subsequent fixed size areas as defined by the xstate header.
- * Not all CPUs support all the extensions.
+ * subsequent areas as defined by the xstate header.  Not all CPUs
+ * support all the extensions, so the size of the extended area
+ * can vary quite a bit between CPUs.
  */
 struct xregs_state {
 	struct fxregs_state		i387;
 	struct xstate_header		header;
-	u8				__reserved[XSTATE_RESERVE];
+	u8				extended_state_area[0];
 } __attribute__ ((packed, aligned (64)));
 
 /*
@@ -182,7 +238,9 @@ struct xregs_state {
  * put together, so that we can pick the right one runtime.
  *
  * The size of the structure is determined by the largest
- * member - which is the xsave area:
+ * member - which is the xsave area.  The padding is there
+ * to ensure that statically-allocated task_structs (just
+ * the init_task today) have enough space.
  */
 union fpregs_state {
 	struct fregs_state		fsave;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 4656b25bb9a7..3a6c89b70307 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -6,7 +6,7 @@
 #include <linux/uaccess.h>
 
 /* Bit 63 of XCR0 is reserved for future expansion */
-#define XSTATE_EXTEND_MASK	(~(XSTATE_FPSSE | (1ULL << 63)))
+#define XFEATURE_MASK_EXTEND	(~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
 
 #define XSTATE_CPUID		0x0000000d
 
@@ -19,14 +19,18 @@
 #define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
 /* Supported features which support lazy state saving */
-#define XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM		      \
-			| XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+#define XFEATURE_MASK_LAZY	(XFEATURE_MASK_FP | \
+				 XFEATURE_MASK_SSE | \
+				 XFEATURE_MASK_YMM | \
+				 XFEATURE_MASK_OPMASK |	\
+				 XFEATURE_MASK_ZMM_Hi256 | \
+				 XFEATURE_MASK_Hi16_ZMM)
 
 /* Supported features which require eager state saving */
-#define XSTATE_EAGER	(XSTATE_BNDREGS | XSTATE_BNDCSR)
+#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
 
 /* All currently supported features */
-#define XCNTXT_MASK	(XSTATE_LAZY | XSTATE_EAGER)
+#define XCNTXT_MASK	(XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
 
 #ifdef CONFIG_X86_64
 #define REX_PREFIX	"0x48, "
@@ -40,6 +44,7 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
 
+void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h
index 173dd3ba108c..0f492fc50bce 100644
--- a/arch/x86/include/asm/trace/mpx.h
+++ b/arch/x86/include/asm/trace/mpx.h
@@ -11,7 +11,7 @@
 TRACE_EVENT(mpx_bounds_register_exception,
 
 	TP_PROTO(void *addr_referenced,
-		 const struct bndreg *bndreg),
+		 const struct mpx_bndreg *bndreg),
 	TP_ARGS(addr_referenced, bndreg),
 
 	TP_STRUCT__entry(
@@ -44,7 +44,7 @@ TRACE_EVENT(mpx_bounds_register_exception,
 
 TRACE_EVENT(bounds_exception_mpx,
 
-	TP_PROTO(const struct bndcsr *bndcsr),
+	TP_PROTO(const struct mpx_bndcsr *bndcsr),
 	TP_ARGS(bndcsr),
 
 	TP_STRUCT__entry(
@@ -116,7 +116,8 @@ TRACE_EVENT(mpx_new_bounds_table,
 /*
  * This gets used outside of MPX-specific code, so we need a stub.
  */
-static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr)
+static inline
+void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr)
 {
 }
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index d14e9ac3235a..be39b5fde4b9 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -290,11 +290,11 @@ static void __init fpu__init_system_ctx_switch(void)
 	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
 		eagerfpu = ENABLE;
 
-	if (xfeatures_mask & XSTATE_EAGER) {
+	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
 		if (eagerfpu == DISABLE) {
 			pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
-			       xfeatures_mask & XSTATE_EAGER);
-			xfeatures_mask &= ~XSTATE_EAGER;
+			       xfeatures_mask & XFEATURE_MASK_EAGER);
+			xfeatures_mask &= ~XFEATURE_MASK_EAGER;
 		} else {
 			eagerfpu = ENABLE;
 		}
@@ -354,17 +354,7 @@ static int __init x86_noxsave_setup(char *s)
 	if (strlen(s))
 		return 0;
 
-	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
-	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	fpu__xstate_clear_all_cpu_caps();
 
 	return 1;
 }
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index dc60810c1c74..0bc3490420c5 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -66,7 +66,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP and SSE state.
 	 */
 	if (cpu_has_xsave)
-		fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE;
+		fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
 
 	return ret;
 }
@@ -326,7 +326,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP.
 	 */
 	if (cpu_has_xsave)
-		fpu->state.xsave.header.xfeatures |= XSTATE_FP;
+		fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
 	return ret;
 }
 
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 50ec9af1bd51..eb032677f939 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -107,7 +107,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
 	 * header as well as change any contents in the memory layout.
 	 * xrestore as part of sigreturn will capture all the changes.
 	 */
-	xfeatures |= XSTATE_FPSSE;
+	xfeatures |= XFEATURE_MASK_FPSSE;
 
 	err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
 
@@ -207,7 +207,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
 		 * layout and not enabled by the OS.
 		 */
 		if (fx_only)
-			header->xfeatures = XSTATE_FPSSE;
+			header->xfeatures = XFEATURE_MASK_FPSSE;
 		else
 			header->xfeatures &= (xfeatures_mask & xfeatures);
 	}
@@ -230,7 +230,7 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_
 {
 	if (use_xsave()) {
 		if ((unsigned long)buf % 64 || fx_only) {
-			u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE;
+			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 			return copy_user_to_fxregs(buf);
 		} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 62fc001c7846..6454f2731b56 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -31,12 +31,28 @@ static const char *xfeature_names[] =
  */
 u64 xfeatures_mask __read_mostly;
 
-static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
-static unsigned int xstate_sizes[XFEATURES_NR_MAX]   = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
+static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] = -1};
 static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
 
-/* The number of supported xfeatures in xfeatures_mask: */
-static unsigned int xfeatures_nr;
+/*
+ * Clear all of the X86_FEATURE_* bits that are unavailable
+ * when the CPU has no XSAVE support.
+ */
+void fpu__xstate_clear_all_cpu_caps(void)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+}
 
 /*
  * Return whether the system supports a given xfeature.
@@ -53,7 +69,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
 		/*
 		 * So we use FLS here to be able to print the most advanced
 		 * feature that was requested but is missing. So if a driver
-		 * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the
+		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
 		 * missing AVX feature - this is the most informative message
 		 * to users:
 		 */
@@ -112,7 +128,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 	/*
 	 * FP is in init state
 	 */
-	if (!(xfeatures & XSTATE_FP)) {
+	if (!(xfeatures & XFEATURE_MASK_FP)) {
 		fx->cwd = 0x37f;
 		fx->swd = 0;
 		fx->twd = 0;
@@ -125,7 +141,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 	/*
 	 * SSE is in init state
 	 */
-	if (!(xfeatures & XSTATE_SSE))
+	if (!(xfeatures & XFEATURE_MASK_SSE))
 		memset(&fx->xmm_space[0], 0, 256);
 
 	/*
@@ -168,26 +184,44 @@ void fpu__init_cpu_xstate(void)
 	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
 }
 
+/*
+ * Note that in the future we will likely need a pair of
+ * functions here: one for user xstates and the other for
+ * system xstates.  For now, they are the same.
+ */
+static int xfeature_enabled(enum xfeature xfeature)
+{
+	return !!(xfeatures_mask & (1UL << xfeature));
+}
+
 /*
  * Record the offsets and sizes of various xstates contained
  * in the XSAVE state memory layout.
- *
- * ( Note that certain features might be non-present, for them
- *   we'll have 0 offset and 0 size. )
  */
 static void __init setup_xstate_features(void)
 {
-	u32 eax, ebx, ecx, edx, leaf;
+	u32 eax, ebx, ecx, edx, i;
+	/* start at the beginnning of the "extended state" */
+	unsigned int last_good_offset = offsetof(struct xregs_state,
+						 extended_state_area);
 
-	xfeatures_nr = fls64(xfeatures_mask);
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (!xfeature_enabled(i))
+			continue;
 
-	for (leaf = 2; leaf < xfeatures_nr; leaf++) {
-		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
+		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+		xstate_offsets[i] = ebx;
+		xstate_sizes[i] = eax;
+		/*
+		 * In our xstate size checks, we assume that the
+		 * highest-numbered xstate feature has the
+		 * highest offset in the buffer.  Ensure it does.
+		 */
+		WARN_ONCE(last_good_offset > xstate_offsets[i],
+			"x86/fpu: misordered xstate at %d\n", last_good_offset);
+		last_good_offset = xstate_offsets[i];
 
-		xstate_offsets[leaf] = ebx;
-		xstate_sizes[leaf] = eax;
-
-		printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax);
+		printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax);
 	}
 }
 
@@ -204,14 +238,14 @@ static void __init print_xstate_feature(u64 xstate_mask)
  */
 static void __init print_xstate_features(void)
 {
-	print_xstate_feature(XSTATE_FP);
-	print_xstate_feature(XSTATE_SSE);
-	print_xstate_feature(XSTATE_YMM);
-	print_xstate_feature(XSTATE_BNDREGS);
-	print_xstate_feature(XSTATE_BNDCSR);
-	print_xstate_feature(XSTATE_OPMASK);
-	print_xstate_feature(XSTATE_ZMM_Hi256);
-	print_xstate_feature(XSTATE_Hi16_ZMM);
+	print_xstate_feature(XFEATURE_MASK_FP);
+	print_xstate_feature(XFEATURE_MASK_SSE);
+	print_xstate_feature(XFEATURE_MASK_YMM);
+	print_xstate_feature(XFEATURE_MASK_BNDREGS);
+	print_xstate_feature(XFEATURE_MASK_BNDCSR);
+	print_xstate_feature(XFEATURE_MASK_OPMASK);
+	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
+	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
 }
 
 /*
@@ -233,8 +267,8 @@ static void __init setup_xstate_comp(void)
 	xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
 
 	if (!cpu_has_xsaves) {
-		for (i = 2; i < xfeatures_nr; i++) {
-			if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
+		for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+			if (xfeature_enabled(i)) {
 				xstate_comp_offsets[i] = xstate_offsets[i];
 				xstate_comp_sizes[i] = xstate_sizes[i];
 			}
@@ -242,15 +276,16 @@ static void __init setup_xstate_comp(void)
 		return;
 	}
 
-	xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+	xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
+		FXSAVE_SIZE + XSAVE_HDR_SIZE;
 
-	for (i = 2; i < xfeatures_nr; i++) {
-		if (test_bit(i, (unsigned long *)&xfeatures_mask))
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (xfeature_enabled(i))
 			xstate_comp_sizes[i] = xstate_sizes[i];
 		else
 			xstate_comp_sizes[i] = 0;
 
-		if (i > 2)
+		if (i > FIRST_EXTENDED_XFEATURE)
 			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
 					+ xstate_comp_sizes[i-1];
 
@@ -290,27 +325,280 @@ static void __init setup_init_fpu_buf(void)
 	copy_xregs_to_kernel_booting(&init_fpstate.xsave);
 }
 
-/*
- * Calculate total size of enabled xstates in XCR0/xfeatures_mask.
- */
-static void __init init_xstate_size(void)
+static int xfeature_is_supervisor(int xfeature_nr)
 {
-	unsigned int eax, ebx, ecx, edx;
+	/*
+	 * We currently do not support supervisor states, but if
+	 * we did, we could find out like this.
+	 *
+	 * SDM says: If state component i is a user state component,
+	 * ECX[0] return 0; if state component i is a supervisor
+	 * state component, ECX[0] returns 1.
+	u32 eax, ebx, ecx, edx;
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx;
+	return !!(ecx & 1);
+	*/
+	return 0;
+}
+/*
+static int xfeature_is_user(int xfeature_nr)
+{
+	return !xfeature_is_supervisor(xfeature_nr);
+}
+*/
+
+/*
+ * This check is important because it is easy to get XSTATE_*
+ * confused with XSTATE_BIT_*.
+ */
+#define CHECK_XFEATURE(nr) do {		\
+	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
+	WARN_ON(nr >= XFEATURE_MAX);	\
+} while (0)
+
+/*
+ * We could cache this like xstate_size[], but we only use
+ * it here, so it would be a waste of space.
+ */
+static int xfeature_is_aligned(int xfeature_nr)
+{
+	u32 eax, ebx, ecx, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+	/*
+	 * The value returned by ECX[1] indicates the alignment
+	 * of state component i when the compacted format
+	 * of the extended region of an XSAVE area is used
+	 */
+	return !!(ecx & 2);
+}
+
+static int xfeature_uncompacted_offset(int xfeature_nr)
+{
+	u32 eax, ebx, ecx, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+	return ebx;
+}
+
+static int xfeature_size(int xfeature_nr)
+{
+	u32 eax, ebx, ecx, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+	return eax;
+}
+
+/*
+ * 'XSAVES' implies two different things:
+ * 1. saving of supervisor/system state
+ * 2. using the compacted format
+ *
+ * Use this function when dealing with the compacted format so
+ * that it is obvious which aspect of 'XSAVES' is being handled
+ * by the calling code.
+ */
+static int using_compacted_format(void)
+{
+	return cpu_has_xsaves;
+}
+
+static void __xstate_dump_leaves(void)
+{
+	int i;
+	u32 eax, ebx, ecx, edx;
+	static int should_dump = 1;
+
+	if (!should_dump)
+		return;
+	should_dump = 0;
+	/*
+	 * Dump out a few leaves past the ones that we support
+	 * just in case there are some goodies up there
+	 */
+	for (i = 0; i < XFEATURE_MAX + 10; i++) {
+		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+			XSTATE_CPUID, i, eax, ebx, ecx, edx);
+	}
+}
+
+#define XSTATE_WARN_ON(x) do {							\
+	if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) {	\
+		__xstate_dump_leaves();						\
+	}									\
+} while (0)
+
+#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {			\
+	if ((nr == nr_macro) &&						\
+	    WARN_ONCE(sz != sizeof(__struct),				\
+		"%s: struct is %zu bytes, cpu state %d bytes\n",	\
+		__stringify(nr_macro), sizeof(__struct), sz)) {		\
+		__xstate_dump_leaves();					\
+	}								\
+} while (0)
+
+/*
+ * We have a C struct for each 'xstate'.  We need to ensure
+ * that our software representation matches what the CPU
+ * tells us about the state's size.
+ */
+static void check_xstate_against_struct(int nr)
+{
+	/*
+	 * Ask the CPU for the size of the state.
+	 */
+	int sz = xfeature_size(nr);
+	/*
+	 * Match each CPU state with the corresponding software
+	 * structure.
+	 */
+	XCHECK_SZ(sz, nr, XFEATURE_YMM,       struct ymmh_struct);
+	XCHECK_SZ(sz, nr, XFEATURE_BNDREGS,   struct mpx_bndreg_state);
+	XCHECK_SZ(sz, nr, XFEATURE_BNDCSR,    struct mpx_bndcsr_state);
+	XCHECK_SZ(sz, nr, XFEATURE_OPMASK,    struct avx_512_opmask_state);
+	XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
+	XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
+
+	/*
+	 * Make *SURE* to add any feature numbers in below if
+	 * there are "holes" in the xsave state component
+	 * numbers.
+	 */
+	if ((nr < XFEATURE_YMM) ||
+	    (nr >= XFEATURE_MAX)) {
+		WARN_ONCE(1, "no structure for xstate: %d\n", nr);
+		XSTATE_WARN_ON(1);
+	}
+}
+
+/*
+ * This essentially double-checks what the cpu told us about
+ * how large the XSAVE buffer needs to be.  We are recalculating
+ * it to be safe.
+ */
+static void do_extra_xstate_size_checks(void)
+{
+	int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
 	int i;
 
-	if (!cpu_has_xsaves) {
-		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-		xstate_size = ebx;
-		return;
-	}
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (!xfeature_enabled(i))
+			continue;
 
-	xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-	for (i = 2; i < 64; i++) {
-		if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
-			cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
-			xstate_size += eax;
-		}
+		check_xstate_against_struct(i);
+		/*
+		 * Supervisor state components can be managed only by
+		 * XSAVES, which is compacted-format only.
+		 */
+		if (!using_compacted_format())
+			XSTATE_WARN_ON(xfeature_is_supervisor(i));
+
+		/* Align from the end of the previous feature */
+		if (xfeature_is_aligned(i))
+			paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
+		/*
+		 * The offset of a given state in the non-compacted
+		 * format is given to us in a CPUID leaf.  We check
+		 * them for being ordered (increasing offsets) in
+		 * setup_xstate_features().
+		 */
+		if (!using_compacted_format())
+			paranoid_xstate_size = xfeature_uncompacted_offset(i);
+		/*
+		 * The compacted-format offset always depends on where
+		 * the previous state ended.
+		 */
+		paranoid_xstate_size += xfeature_size(i);
 	}
+	XSTATE_WARN_ON(paranoid_xstate_size != xstate_size);
+}
+
+/*
+ * Calculate total size of enabled xstates in XCR0/xfeatures_mask.
+ *
+ * Note the SDM's wording here.  "sub-function 0" only enumerates
+ * the size of the *user* states.  If we use it to size a buffer
+ * that we use 'XSAVES' on, we could potentially overflow the
+ * buffer because 'XSAVES' saves system states too.
+ *
+ * Note that we do not currently set any bits on IA32_XSS so
+ * 'XCR0 | IA32_XSS == XCR0' for now.
+ */
+static unsigned int __init calculate_xstate_size(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	unsigned int calculated_xstate_size;
+
+	if (!cpu_has_xsaves) {
+		/*
+		 * - CPUID function 0DH, sub-function 0:
+		 *    EBX enumerates the size (in bytes) required by
+		 *    the XSAVE instruction for an XSAVE area
+		 *    containing all the *user* state components
+		 *    corresponding to bits currently set in XCR0.
+		 */
+		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+		calculated_xstate_size = ebx;
+	} else {
+		/*
+		 * - CPUID function 0DH, sub-function 1:
+		 *    EBX enumerates the size (in bytes) required by
+		 *    the XSAVES instruction for an XSAVE area
+		 *    containing all the state components
+		 *    corresponding to bits currently set in
+		 *    XCR0 | IA32_XSS.
+		 */
+		cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+		calculated_xstate_size = ebx;
+	}
+	return calculated_xstate_size;
+}
+
+/*
+ * Will the runtime-enumerated 'xstate_size' fit in the init
+ * task's statically-allocated buffer?
+ */
+static bool is_supported_xstate_size(unsigned int test_xstate_size)
+{
+	if (test_xstate_size <= sizeof(union fpregs_state))
+		return true;
+
+	pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
+			sizeof(union fpregs_state), test_xstate_size);
+	return false;
+}
+
+static int init_xstate_size(void)
+{
+	/* Recompute the context size for enabled features: */
+	unsigned int possible_xstate_size = calculate_xstate_size();
+
+	/* Ensure we have the space to store all enabled: */
+	if (!is_supported_xstate_size(possible_xstate_size))
+		return -EINVAL;
+
+	/*
+	 * The size is OK, we are definitely going to use xsave,
+	 * make it known to the world that we need more space.
+	 */
+	xstate_size = possible_xstate_size;
+	do_extra_xstate_size_checks();
+	return 0;
+}
+
+/*
+ * We enabled the XSAVE hardware, but something went wrong and
+ * we can not use it.  Disable it.
+ */
+static void fpu__init_disable_system_xstate(void)
+{
+	xfeatures_mask = 0;
+	cr4_clear_bits(X86_CR4_OSXSAVE);
+	fpu__xstate_clear_all_cpu_caps();
 }
 
 /*
@@ -321,6 +609,7 @@ void __init fpu__init_system_xstate(void)
 {
 	unsigned int eax, ebx, ecx, edx;
 	static int on_boot_cpu = 1;
+	int err;
 
 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
@@ -338,7 +627,7 @@ void __init fpu__init_system_xstate(void)
 	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
 	xfeatures_mask = eax + ((u64)edx << 32);
 
-	if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+	if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
 		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
 		BUG();
 	}
@@ -348,16 +637,19 @@ void __init fpu__init_system_xstate(void)
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
-
-	/* Recompute the context size for enabled features: */
-	init_xstate_size();
+	err = init_xstate_size();
+	if (err) {
+		/* something went wrong, boot without any XSAVE support */
+		fpu__init_disable_system_xstate();
+		return;
+	}
 
 	update_regset_xstate_info(xstate_size, xfeatures_mask);
 	fpu__init_prepare_fx_sw_frame();
 	setup_init_fpu_buf();
 	setup_xstate_comp();
 
-	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n",
+	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
 		xfeatures_mask,
 		xstate_size,
 		cpu_has_xsaves ? "compacted" : "standard");
@@ -388,7 +680,7 @@ void fpu__resume_cpu(void)
  * Inputs:
  *	xstate: the thread's storage area for all FPU data
  *	xstate_feature: state which is defined in xsave.h (e.g.
- *	XSTATE_FP, XSTATE_SSE, etc...)
+ *	XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
  * Output:
  *	address of the state in the xsave area, or NULL if the
  *	field is not present in the xsave buffer.
@@ -439,8 +731,8 @@ EXPORT_SYMBOL_GPL(get_xsave_addr);
  * Note that this only works on the current task.
  *
  * Inputs:
- *	@xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP,
- *	XSTATE_SSE, etc...)
+ *	@xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
+ *	XFEATURE_MASK_SSE, etc...)
  * Output:
  *	address of the state in the xsave area or NULL if the state
  *	is not present or is in its 'init state'.
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 346eec73f7db..ade185a46b1d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	const struct bndcsr *bndcsr;
+	const struct mpx_bndcsr *bndcsr;
 	siginfo_t *info;
 
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
@@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	 * which is all zeros which indicates MPX was not
 	 * responsible for the exception.
 	 */
-	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 	if (!bndcsr)
 		goto exit_trap;
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2fbea2544f24..156441bcaac8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -30,7 +30,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 	int feature_bit = 0;
 	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 
-	xstate_bv &= XSTATE_EXTEND_MASK;
+	xstate_bv &= XFEATURE_MASK_EXTEND;
 	while (xstate_bv) {
 		if (xstate_bv & 0x1) {
 		        u32 eax, ebx, ecx, edx, offset;
@@ -51,7 +51,7 @@ u64 kvm_supported_xcr0(void)
 	u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
 
 	if (!kvm_x86_ops->mpx_supported())
-		xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
+		xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 
 	return xcr0;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a9a19830321..bda65690788e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -663,9 +663,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
 	if (index != XCR_XFEATURE_ENABLED_MASK)
 		return 1;
-	if (!(xcr0 & XSTATE_FP))
+	if (!(xcr0 & XFEATURE_MASK_FP))
 		return 1;
-	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
 		return 1;
 
 	/*
@@ -673,23 +673,24 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 	 * saving.  However, xcr0 bit 0 is always set, even if the
 	 * emulated CPU does not support XSAVE (see fx_init).
 	 */
-	valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
+	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
 	if (xcr0 & ~valid_bits)
 		return 1;
 
-	if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
 		return 1;
 
-	if (xcr0 & XSTATE_AVX512) {
-		if (!(xcr0 & XSTATE_YMM))
+	if (xcr0 & XFEATURE_MASK_AVX512) {
+		if (!(xcr0 & XFEATURE_MASK_YMM))
 			return 1;
-		if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
+		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
 			return 1;
 	}
 	kvm_put_guest_xcr0(vcpu);
 	vcpu->arch.xcr0 = xcr0;
 
-	if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
 		kvm_update_cpuid(vcpu);
 	return 0;
 }
@@ -2905,7 +2906,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 	 * Copy each region from the possibly compacted offset to the
 	 * non-compacted offset.
 	 */
-	valid = xstate_bv & ~XSTATE_FPSSE;
+	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 	while (valid) {
 		u64 feature = valid & -valid;
 		int index = fls64(feature) - 1;
@@ -2943,7 +2944,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 	 * Copy each region from the non-compacted offset to the
 	 * possibly compacted offset.
 	 */
-	valid = xstate_bv & ~XSTATE_FPSSE;
+	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 	while (valid) {
 		u64 feature = valid & -valid;
 		int index = fls64(feature) - 1;
@@ -2971,7 +2972,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 			&vcpu->arch.guest_fpu.state.fxsave,
 			sizeof(struct fxregs_state));
 		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
-			XSTATE_FPSSE;
+			XFEATURE_MASK_FPSSE;
 	}
 }
 
@@ -2991,7 +2992,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 		load_xsave(vcpu, (u8 *)guest_xsave->region);
 	} else {
-		if (xstate_bv & ~XSTATE_FPSSE)
+		if (xstate_bv & ~XFEATURE_MASK_FPSSE)
 			return -EINVAL;
 		memcpy(&vcpu->arch.guest_fpu.state.fxsave,
 			guest_xsave->region, sizeof(struct fxregs_state));
@@ -7005,7 +7006,7 @@ static void fx_init(struct kvm_vcpu *vcpu)
 	/*
 	 * Ensure guest xcr0 is valid for loading
 	 */
-	vcpu->arch.xcr0 = XSTATE_FP;
+	vcpu->arch.xcr0 = XFEATURE_MASK_FP;
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
 }
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2f822cd886c2..f2afa5fe48a6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -180,9 +180,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 					  int page_num);
 
-#define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
-				| XSTATE_BNDREGS | XSTATE_BNDCSR \
-				| XSTATE_AVX512)
+#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
 extern u64 host_xcr0;
 
 extern u64 kvm_supported_xcr0(void);
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index dd76a05729b0..024f6e971174 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -169,6 +169,76 @@ void fxch_i(void)
 	fpu_tag_word = tag_word;
 }
 
+static void fcmovCC(void)
+{
+	/* fcmovCC st(i) */
+	int i = FPU_rm;
+	FPU_REG *st0_ptr = &st(0);
+	FPU_REG *sti_ptr = &st(i);
+	long tag_word = fpu_tag_word;
+	int regnr = top & 7;
+	int regnri = (top + i) & 7;
+	u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
+
+	if (sti_tag == TAG_Empty) {
+		FPU_stack_underflow();
+		clear_C1();
+		return;
+	}
+	reg_copy(sti_ptr, st0_ptr);
+	tag_word &= ~(3 << (regnr * 2));
+	tag_word |= (sti_tag << (regnr * 2));
+	fpu_tag_word = tag_word;
+}
+
+void fcmovb(void)
+{
+	if (FPU_EFLAGS & X86_EFLAGS_CF)
+		fcmovCC();
+}
+
+void fcmove(void)
+{
+	if (FPU_EFLAGS & X86_EFLAGS_ZF)
+		fcmovCC();
+}
+
+void fcmovbe(void)
+{
+	if (FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF))
+		fcmovCC();
+}
+
+void fcmovu(void)
+{
+	if (FPU_EFLAGS & X86_EFLAGS_PF)
+		fcmovCC();
+}
+
+void fcmovnb(void)
+{
+	if (!(FPU_EFLAGS & X86_EFLAGS_CF))
+		fcmovCC();
+}
+
+void fcmovne(void)
+{
+	if (!(FPU_EFLAGS & X86_EFLAGS_ZF))
+		fcmovCC();
+}
+
+void fcmovnbe(void)
+{
+	if (!(FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF)))
+		fcmovCC();
+}
+
+void fcmovnu(void)
+{
+	if (!(FPU_EFLAGS & X86_EFLAGS_PF))
+		fcmovCC();
+}
+
 void ffree_(void)
 {
 	/* ffree st(i) */
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 3d8f2e421466..e945fedf1de2 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -40,49 +40,33 @@
 
 #define __BAD__ FPU_illegal	/* Illegal on an 80486, causes SIGILL */
 
-#ifndef NO_UNDOC_CODE		/* Un-documented FPU op-codes supported by default. */
+/* fcmovCC and f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */
 
-/* WARNING: These codes are not documented by Intel in their 80486 manual
-   and may not work on FPU clones or later Intel FPUs. */
-
-/* Changes to support the un-doc codes provided by Linus Torvalds. */
-
-#define _d9_d8_ fstp_i		/* unofficial code (19) */
-#define _dc_d0_ fcom_st		/* unofficial code (14) */
-#define _dc_d8_ fcompst		/* unofficial code (1c) */
-#define _dd_c8_ fxch_i		/* unofficial code (0d) */
-#define _de_d0_ fcompst		/* unofficial code (16) */
-#define _df_c0_ ffreep		/* unofficial code (07) ffree + pop */
-#define _df_c8_ fxch_i		/* unofficial code (0f) */
-#define _df_d0_ fstp_i		/* unofficial code (17) */
-#define _df_d8_ fstp_i		/* unofficial code (1f) */
+/* WARNING: "u" entries are not documented by Intel in their 80486 manual
+   and may not work on FPU clones or later Intel FPUs.
+   Changes to support them provided by Linus Torvalds. */
 
 static FUNC const st_instr_table[64] = {
-	fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
-	fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
-	fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
-	fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
-	fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
-	fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
-	fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
-	fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
+/* Opcode:	d8		d9		da		db */
+/*		dc		dd		de		df */
+/* c0..7 */	fadd__,		fld_i_,		fcmovb,		fcmovnb,
+/* c0..7 */	fadd_i,		ffree_,		faddp_,		ffreep,/*u*/
+/* c8..f */	fmul__,		fxch_i,		fcmove,		fcmovne,
+/* c8..f */	fmul_i,		fxch_i,/*u*/	fmulp_,		fxch_i,/*u*/
+/* d0..7 */	fcom_st,	fp_nop,		fcmovbe,	fcmovnbe,
+/* d0..7 */	fcom_st,/*u*/	fst_i_,		fcompst,/*u*/	fstp_i,/*u*/
+/* d8..f */	fcompst,	fstp_i,/*u*/	fcmovu,		fcmovnu,
+/* d8..f */	fcompst,/*u*/	fstp_i,		fcompp,		fstp_i,/*u*/
+/* e0..7 */	fsub__,		FPU_etc,	__BAD__,	finit_,
+/* e0..7 */	fsubri,		fucom_,		fsubrp,		fstsw_,
+/* e8..f */	fsubr_,		fconst,		fucompp,	fucomi_,
+/* e8..f */	fsub_i,		fucomp,		fsubp_,		fucomip,
+/* f0..7 */	fdiv__,		FPU_triga,	__BAD__,	fcomi_,
+/* f0..7 */	fdivri,		__BAD__,	fdivrp,		fcomip,
+/* f8..f */	fdivr_,		FPU_trigb,	__BAD__,	__BAD__,
+/* f8..f */	fdiv_i,		__BAD__,	fdivp_,		__BAD__,
 };
 
-#else /* Support only documented FPU op-codes */
-
-static FUNC const st_instr_table[64] = {
-	fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
-	fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
-	fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
-	fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
-	fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
-	fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
-	fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
-	fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
-};
-
-#endif /* NO_UNDOC_CODE */
-
 #define _NONE_ 0		/* Take no special action */
 #define _REG0_ 1		/* Need to check for not empty st(0) */
 #define _REGI_ 2		/* Need to check for not empty st(0) and st(rm) */
@@ -94,36 +78,18 @@ static FUNC const st_instr_table[64] = {
 #define _REGIc 0		/* Compare st(0) and st(rm) */
 #define _REGIn 0		/* Uses st(0) and st(rm), but handle checks later */
 
-#ifndef NO_UNDOC_CODE
-
-/* Un-documented FPU op-codes supported by default. (see above) */
-
 static u_char const type_table[64] = {
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
-	_REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
-	_REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
-	_REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
-	_REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
-	_REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
+/* Opcode:	d8	d9	da	db	dc	dd	de	df */
+/* c0..7 */	_REGI_, _NONE_, _REGIn, _REGIn, _REGIi, _REGi_, _REGIp, _REGi_,
+/* c8..f */	_REGI_, _REGIn, _REGIn, _REGIn, _REGIi, _REGI_, _REGIp, _REGI_,
+/* d0..7 */	_REGIc, _NONE_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* d8..f */	_REGIc, _REG0_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* e0..7 */	_REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
+/* e8..f */	_REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc,
+/* f0..7 */	_REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc,
+/* f8..f */	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
 };
 
-#else /* Support only documented FPU op-codes */
-
-static u_char const type_table[64] = {
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
-	_REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-	_REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
-	_REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
-	_REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
-	_REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-	_REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
-};
-
-#endif /* NO_UNDOC_CODE */
-
 #ifdef RE_ENTRANT_CHECKING
 u_char emulating = 0;
 #endif /* RE_ENTRANT_CHECKING */
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index 9779df436b7d..caff438b9c1d 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -46,6 +46,14 @@ extern void fstsw_(void);
 extern void fp_nop(void);
 extern void fld_i_(void);
 extern void fxch_i(void);
+extern void fcmovb(void);
+extern void fcmove(void);
+extern void fcmovbe(void);
+extern void fcmovu(void);
+extern void fcmovnb(void);
+extern void fcmovne(void);
+extern void fcmovnbe(void);
+extern void fcmovnu(void);
 extern void ffree_(void);
 extern void ffreep(void);
 extern void fst_i_(void);
@@ -108,6 +116,10 @@ extern void fcompp(void);
 extern void fucom_(void);
 extern void fucomp(void);
 extern void fucompp(void);
+extern void fcomi_(void);
+extern void fcomip(void);
+extern void fucomi_(void);
+extern void fucomip(void);
 /* reg_constant.c */
 extern void fconst(void);
 /* reg_ld_str.c */
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index 2931ff355218..95228ff042c0 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -33,11 +33,12 @@
 
 #define pop_0()	{ FPU_settag0(TAG_Empty); top++; }
 
+/* index is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
 static u_char const type_table[32] = {
-	_PUSH_, _PUSH_, _PUSH_, _PUSH_,
-	_null_, _null_, _null_, _null_,
-	_REG0_, _REG0_, _REG0_, _REG0_,
-	_REG0_, _REG0_, _REG0_, _REG0_,
+	_PUSH_, _PUSH_, _PUSH_, _PUSH_, /* /0: d9:fld f32,  db:fild m32,  dd:fld f64,  df:fild m16 */
+	_null_, _REG0_, _REG0_, _REG0_, /* /1: d9:undef,    db,dd,df:fisttp m32/64/16 */
+	_REG0_, _REG0_, _REG0_, _REG0_, /* /2: d9:fst f32,  db:fist m32,  dd:fst f64,  df:fist m16 */
+	_REG0_, _REG0_, _REG0_, _REG0_, /* /3: d9:fstp f32, db:fistp m32, dd:fstp f64, df:fistp m16 */
 	_NONE_, _null_, _NONE_, _PUSH_,
 	_NONE_, _PUSH_, _null_, _PUSH_,
 	_NONE_, _null_, _NONE_, _REG0_,
@@ -45,15 +46,19 @@ static u_char const type_table[32] = {
 };
 
 u_char const data_sizes_16[32] = {
-	4, 4, 8, 2, 0, 0, 0, 0,
-	4, 4, 8, 2, 4, 4, 8, 2,
+	4, 4, 8, 2,
+	0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+	4, 4, 8, 2,
+	4, 4, 8, 2,
 	14, 0, 94, 10, 2, 10, 0, 8,
 	14, 0, 94, 10, 2, 10, 2, 8
 };
 
 static u_char const data_sizes_32[32] = {
-	4, 4, 8, 2, 0, 0, 0, 0,
-	4, 4, 8, 2, 4, 4, 8, 2,
+	4, 4, 8, 2,
+	0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+	4, 4, 8, 2,
+	4, 4, 8, 2,
 	28, 0, 108, 10, 2, 10, 0, 8,
 	28, 0, 108, 10, 2, 10, 2, 8
 };
@@ -65,6 +70,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 	FPU_REG *st0_ptr;
 	u_char st0_tag = TAG_Empty;	/* This is just to stop a gcc warning. */
 	u_char loaded_tag;
+	int sv_cw;
 
 	st0_ptr = NULL;		/* Initialized just to stop compiler warnings. */
 
@@ -111,7 +117,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 	}
 
 	switch (type) {
-	case 000:		/* fld m32real */
+	/* type is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
+	case 000:		/* fld m32real (d9 /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_single((float __user *)data_address, &loaded_data);
@@ -123,13 +130,13 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 		}
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
-	case 001:		/* fild m32int */
+	case 001:		/* fild m32int (db /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_int32((long __user *)data_address, &loaded_data);
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
-	case 002:		/* fld m64real */
+	case 002:		/* fld m64real (dd /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_double((double __user *)data_address,
@@ -142,12 +149,44 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 		}
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
-	case 003:		/* fild m16int */
+	case 003:		/* fild m16int (df /0) */
 		clear_C1();
 		loaded_tag =
 		    FPU_load_int16((short __user *)data_address, &loaded_data);
 		FPU_copy_to_reg0(&loaded_data, loaded_tag);
 		break;
+	/* case 004: undefined (d9 /1) */
+	/* fisttp are enabled if CPUID(1).ECX(0) "sse3" is set */
+	case 005:		/* fisttp m32int (db /1) */
+		clear_C1();
+		sv_cw = control_word;
+		control_word |= RC_CHOP;
+		if (FPU_store_int32
+		    (st0_ptr, st0_tag, (long __user *)data_address))
+			pop_0();	/* pop only if the number was actually stored
+					   (see the 80486 manual p16-28) */
+		control_word = sv_cw;
+		break;
+	case 006:		/* fisttp m64int (dd /1) */
+		clear_C1();
+		sv_cw = control_word;
+		control_word |= RC_CHOP;
+		if (FPU_store_int64
+		    (st0_ptr, st0_tag, (long long __user *)data_address))
+			pop_0();	/* pop only if the number was actually stored
+					   (see the 80486 manual p16-28) */
+		control_word = sv_cw;
+		break;
+	case 007:		/* fisttp m16int (df /1) */
+		clear_C1();
+		sv_cw = control_word;
+		control_word |= RC_CHOP;
+		if (FPU_store_int16
+		    (st0_ptr, st0_tag, (short __user *)data_address))
+			pop_0();	/* pop only if the number was actually stored
+					   (see the 80486 manual p16-28) */
+		control_word = sv_cw;
+		break;
 	case 010:		/* fst m32real */
 		clear_C1();
 		FPU_store_single(st0_ptr, st0_tag,
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
index ecce55fc2e2e..b77360fdbf4a 100644
--- a/arch/x86/math-emu/reg_compare.c
+++ b/arch/x86/math-emu/reg_compare.c
@@ -249,6 +249,54 @@ static int compare_st_st(int nr)
 	return 0;
 }
 
+static int compare_i_st_st(int nr)
+{
+	int f, c;
+	FPU_REG *st_ptr;
+
+	if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		/* Stack fault */
+		EXCEPTION(EX_StackUnder);
+		return !(control_word & CW_Invalid);
+	}
+
+	partial_status &= ~SW_C0;
+	st_ptr = &st(nr);
+	c = compare(st_ptr, FPU_gettagi(nr));
+	if (c & COMP_NaN) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		EXCEPTION(EX_Invalid);
+		return !(control_word & CW_Invalid);
+	}
+
+	switch (c & 7) {
+	case COMP_A_lt_B:
+		f = X86_EFLAGS_CF;
+		break;
+	case COMP_A_eq_B:
+		f = X86_EFLAGS_ZF;
+		break;
+	case COMP_A_gt_B:
+		f = 0;
+		break;
+	case COMP_No_Comp:
+		f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+		break;
+#ifdef PARANOID
+	default:
+		EXCEPTION(EX_INTERNAL | 0x122);
+		f = 0;
+		break;
+#endif /* PARANOID */
+	}
+	FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+	if (c & COMP_Denormal) {
+		return denormal_operand() < 0;
+	}
+	return 0;
+}
+
 static int compare_u_st_st(int nr)
 {
 	int f = 0, c;
@@ -299,6 +347,58 @@ static int compare_u_st_st(int nr)
 	return 0;
 }
 
+static int compare_ui_st_st(int nr)
+{
+	int f = 0, c;
+	FPU_REG *st_ptr;
+
+	if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		/* Stack fault */
+		EXCEPTION(EX_StackUnder);
+		return !(control_word & CW_Invalid);
+	}
+
+	partial_status &= ~SW_C0;
+	st_ptr = &st(nr);
+	c = compare(st_ptr, FPU_gettagi(nr));
+	if (c & COMP_NaN) {
+		FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+		if (c & COMP_SNaN) {	/* This is the only difference between
+					   un-ordered and ordinary comparisons */
+			EXCEPTION(EX_Invalid);
+			return !(control_word & CW_Invalid);
+		}
+		return 0;
+	}
+
+	switch (c & 7) {
+	case COMP_A_lt_B:
+		f = X86_EFLAGS_CF;
+		break;
+	case COMP_A_eq_B:
+		f = X86_EFLAGS_ZF;
+		break;
+	case COMP_A_gt_B:
+		f = 0;
+		break;
+	case COMP_No_Comp:
+		f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+		break;
+#ifdef PARANOID
+	default:
+		EXCEPTION(EX_INTERNAL | 0x123);
+		f = 0;
+		break;
+#endif /* PARANOID */
+	}
+	FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+	if (c & COMP_Denormal) {
+		return denormal_operand() < 0;
+	}
+	return 0;
+}
+
 /*---------------------------------------------------------------------------*/
 
 void fcom_st(void)
@@ -348,3 +448,31 @@ void fucompp(void)
 	} else
 		FPU_illegal();
 }
+
+/* P6+ compare-to-EFLAGS ops */
+
+void fcomi_(void)
+{
+	/* fcomi st(i) */
+	compare_i_st_st(FPU_rm);
+}
+
+void fcomip(void)
+{
+	/* fcomip st(i) */
+	if (!compare_i_st_st(FPU_rm))
+		FPU_pop();
+}
+
+void fucomi_(void)
+{
+	/* fucomi st(i) */
+	compare_ui_st_st(FPU_rm);
+}
+
+void fucomip(void)
+{
+	/* fucomip st(i) */
+	if (!compare_ui_st_st(FPU_rm))
+		FPU_pop();
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 134948b0926f..b0ae85f90f10 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -237,7 +237,8 @@ bad_opcode:
  */
 siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
-	const struct bndreg *bndregs, *bndreg;
+	const struct mpx_bndreg_state *bndregs;
+	const struct mpx_bndreg *bndreg;
 	siginfo_t *info = NULL;
 	struct insn insn;
 	uint8_t bndregno;
@@ -258,13 +259,13 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 		goto err_out;
 	}
 	/* get bndregs field from current task's xsave area */
-	bndregs = get_xsave_field_ptr(XSTATE_BNDREGS);
+	bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS);
 	if (!bndregs) {
 		err = -EINVAL;
 		goto err_out;
 	}
 	/* now go select the individual register in the set of 4 */
-	bndreg = &bndregs[bndregno];
+	bndreg = &bndregs->bndreg[bndregno];
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
@@ -306,7 +307,7 @@ err_out:
 
 static __user void *mpx_get_bounds_dir(void)
 {
-	const struct bndcsr *bndcsr;
+	const struct mpx_bndcsr *bndcsr;
 
 	if (!cpu_feature_enabled(X86_FEATURE_MPX))
 		return MPX_INVALID_BOUNDS_DIR;
@@ -315,7 +316,7 @@ static __user void *mpx_get_bounds_dir(void)
 	 * The bounds directory pointer is stored in a register
 	 * only accessible if we first do an xsave.
 	 */
-	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 	if (!bndcsr)
 		return MPX_INVALID_BOUNDS_DIR;
 
@@ -489,10 +490,10 @@ out_unmap:
 static int do_mpx_bt_fault(void)
 {
 	unsigned long bd_entry, bd_base;
-	const struct bndcsr *bndcsr;
+	const struct mpx_bndcsr *bndcsr;
 	struct mm_struct *mm = current->mm;
 
-	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 	if (!bndcsr)
 		return -EINVAL;
 	/*
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 389701f59940..eabcff411984 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -5,7 +5,8 @@ include ../lib.mk
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
+			test_FCMOV test_FCOMI test_FISTTP
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
@@ -35,7 +36,7 @@ clean:
 	$(RM) $(BINARIES_32) $(BINARIES_64)
 
 $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
-	$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
+	$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
 
 $(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
 	$(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
diff --git a/tools/testing/selftests/x86/test_FCMOV.c b/tools/testing/selftests/x86/test_FCMOV.c
new file mode 100644
index 000000000000..4adcca0c80c4
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCMOV.c
@@ -0,0 +1,93 @@
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+
+#define TEST(insn) \
+long double __attribute__((noinline)) insn(long flags) \
+{						\
+	long double out;			\
+	asm ("\n"				\
+	"	push	%1""\n"			\
+	"	popf""\n"			\
+	"	fldpi""\n"			\
+	"	fld1""\n"			\
+	"	" #insn " %%st(1), %%st" "\n"	\
+	"	ffree	%%st(1)" "\n"		\
+	: "=t" (out)				\
+	: "r" (flags)				\
+	);					\
+	return out;				\
+}
+
+TEST(fcmovb)
+TEST(fcmove)
+TEST(fcmovbe)
+TEST(fcmovu)
+TEST(fcmovnb)
+TEST(fcmovne)
+TEST(fcmovnbe)
+TEST(fcmovnu)
+
+enum {
+	CF = 1 << 0,
+	PF = 1 << 2,
+	ZF = 1 << 6,
+};
+
+void sighandler(int sig)
+{
+	printf("[FAIL]\tGot signal %d, exiting\n", sig);
+	exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+	int err = 0;
+
+	/* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+	 * when run with "no387 nofxsr". Other signals are caught
+	 * just in case.
+	 */
+	signal(SIGILL, sighandler);
+	signal(SIGFPE, sighandler);
+	signal(SIGSEGV, sighandler);
+
+	printf("[RUN]\tTesting fcmovCC instructions\n");
+	/* If fcmovCC() returns 1.0, the move wasn't done */
+	err |= !(fcmovb(0)   == 1.0); err |= !(fcmovnb(0)  != 1.0);
+	err |= !(fcmove(0)   == 1.0); err |= !(fcmovne(0)  != 1.0);
+	err |= !(fcmovbe(0)  == 1.0); err |= !(fcmovnbe(0) != 1.0);
+	err |= !(fcmovu(0)   == 1.0); err |= !(fcmovnu(0)  != 1.0);
+
+	err |= !(fcmovb(CF)  != 1.0); err |= !(fcmovnb(CF)  == 1.0);
+	err |= !(fcmove(CF)  == 1.0); err |= !(fcmovne(CF)  != 1.0);
+	err |= !(fcmovbe(CF) != 1.0); err |= !(fcmovnbe(CF) == 1.0);
+	err |= !(fcmovu(CF)  == 1.0); err |= !(fcmovnu(CF)  != 1.0);
+
+	err |= !(fcmovb(ZF)  == 1.0); err |= !(fcmovnb(ZF)  != 1.0);
+	err |= !(fcmove(ZF)  != 1.0); err |= !(fcmovne(ZF)  == 1.0);
+	err |= !(fcmovbe(ZF) != 1.0); err |= !(fcmovnbe(ZF) == 1.0);
+	err |= !(fcmovu(ZF)  == 1.0); err |= !(fcmovnu(ZF)  != 1.0);
+
+	err |= !(fcmovb(PF)  == 1.0); err |= !(fcmovnb(PF)  != 1.0);
+	err |= !(fcmove(PF)  == 1.0); err |= !(fcmovne(PF)  != 1.0);
+	err |= !(fcmovbe(PF) == 1.0); err |= !(fcmovnbe(PF) != 1.0);
+	err |= !(fcmovu(PF)  != 1.0); err |= !(fcmovnu(PF)  == 1.0);
+
+        if (!err)
+                printf("[OK]\tfcmovCC\n");
+	else
+		printf("[FAIL]\tfcmovCC errors: %d\n", err);
+
+	return err;
+}
diff --git a/tools/testing/selftests/x86/test_FCOMI.c b/tools/testing/selftests/x86/test_FCOMI.c
new file mode 100644
index 000000000000..db4933e31af9
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCOMI.c
@@ -0,0 +1,331 @@
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+enum {
+	CF = 1 << 0,
+	PF = 1 << 2,
+	ZF = 1 << 6,
+	ARITH = CF | PF | ZF,
+};
+
+long res_fcomi_pi_1;
+long res_fcomi_1_pi;
+long res_fcomi_1_1;
+long res_fcomi_nan_1;
+/* sNaN is s|111 1111 1|1xx xxxx xxxx xxxx xxxx xxxx */
+/* qNaN is s|111 1111 1|0xx xxxx xxxx xxxx xxxx xxxx (some x must be nonzero) */
+int snan = 0x7fc11111;
+int qnan = 0x7f811111;
+unsigned short snan1[5];
+/* sNaN80 is s|111 1111 1111 1111 |10xx xx...xx (some x must be nonzero) */
+unsigned short snan80[5] = { 0x1111, 0x1111, 0x1111, 0x8111, 0x7fff };
+
+int test(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fldpi""\n"
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_pi""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fldpi""\n"
+	"	fld1""\n"
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_pi_1""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fld1""\n"
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_1_pi & ARITH) != (0)) {
+		printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+		return 1;
+	}
+	if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+		printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+		return 1;
+	}
+	if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+		printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int test_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fcomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != FE_INVALID) {
+		printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testu_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fucomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testu_snan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+//	"	flds	snan""\n"	// WRONG, this will convert 32-bit fp snan to a *qnan* in 80-bit fp register!
+//	"	fstpt	snan1""\n"	// if uncommented, it prints "snan1:7fff c111 1100 0000 0000" - c111, not 8111!
+//	"	fnclex""\n"		// flds of a snan raised FE_INVALID, clear it
+	"	fldt	snan80""\n"	// fldt never raise FE_INVALID
+	"	fld1""\n"
+	"	fucomi	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	ffree	%%st(1)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+//	printf("snan:%x snan1:%04x %04x %04x %04x %04x\n", snan, snan1[4], snan1[3], snan1[2], snan1[1], snan1[0]);
+	if (fetestexcept(FE_INVALID) != FE_INVALID) {
+		printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testp(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fldpi""\n"
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_pi""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fldpi""\n"
+	"	fld1""\n"
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_pi_1""\n"
+
+	"	push	%0""\n"
+	"	popf""\n"
+	"	fld1""\n"
+	"	fld1""\n"
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_1_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_1_pi & ARITH) != (0)) {
+		printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+		return 1;
+	}
+	if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+		printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+		return 1;
+	}
+	if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+		printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testp_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fcomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != FE_INVALID) {
+		printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+int testup_qnan(long flags)
+{
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+	asm ("\n"
+	"	push	%0""\n"
+	"	popf""\n"
+	"	flds	qnan""\n"
+	"	fld1""\n"
+	"	fnclex""\n"		// fld of a qnan raised FE_INVALID, clear it
+	"	fucomip	%%st(1), %%st" "\n"
+	"	ffree	%%st(0)" "\n"
+	"	pushf""\n"
+	"	pop	res_fcomi_nan_1""\n"
+	:
+	: "r" (flags)
+	);
+	if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+		printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+		return 1;
+	}
+	if (fetestexcept(FE_INVALID) != 0) {
+		printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+		return 1;
+	}
+	return 0;
+}
+
+void sighandler(int sig)
+{
+	printf("[FAIL]\tGot signal %d, exiting\n", sig);
+	exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+	int err = 0;
+
+	/* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+	 * when run with "no387 nofxsr". Other signals are caught
+	 * just in case.
+	 */
+	signal(SIGILL, sighandler);
+	signal(SIGFPE, sighandler);
+	signal(SIGSEGV, sighandler);
+
+	printf("[RUN]\tTesting f[u]comi[p] instructions\n");
+	err |= test(0);
+	err |= test_qnan(0);
+	err |= testu_qnan(0);
+	err |= testu_snan(0);
+	err |= test(CF|ZF|PF);
+	err |= test_qnan(CF|ZF|PF);
+	err |= testu_qnan(CF|ZF|PF);
+	err |= testu_snan(CF|ZF|PF);
+	err |= testp(0);
+	err |= testp_qnan(0);
+	err |= testup_qnan(0);
+	err |= testp(CF|ZF|PF);
+	err |= testp_qnan(CF|ZF|PF);
+	err |= testup_qnan(CF|ZF|PF);
+	if (!err)
+		printf("[OK]\tf[u]comi[p]\n");
+	else
+		printf("[FAIL]\tf[u]comi[p] errors: %d\n", err);
+
+	return err;
+}
diff --git a/tools/testing/selftests/x86/test_FISTTP.c b/tools/testing/selftests/x86/test_FISTTP.c
new file mode 100644
index 000000000000..b8e61a047f6b
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FISTTP.c
@@ -0,0 +1,137 @@
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+unsigned long long res64 = -1;
+unsigned int res32 = -1;
+unsigned short res16 = -1;
+
+int test(void)
+{
+	int ex;
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fld1""\n"
+	"	fisttp	res16""\n"
+	"	fld1""\n"
+	"	fisttpl	res32""\n"
+	"	fld1""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	if (res16 != 1 || res32 != 1 || res64 != 1) {
+		printf("[BAD]\tfisttp 1\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != 0) {
+		printf("[BAD]\tfisttp 1: wrong exception state\n");
+		return 1;
+	}
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fldpi""\n"
+	"	fisttp	res16""\n"
+	"	fldpi""\n"
+	"	fisttpl	res32""\n"
+	"	fldpi""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	if (res16 != 3 || res32 != 3 || res64 != 3) {
+		printf("[BAD]\tfisttp pi\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != FE_INEXACT) {
+		printf("[BAD]\tfisttp pi: wrong exception state\n");
+		return 1;
+	}
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fldpi""\n"
+	"	fchs""\n"
+	"	fisttp	res16""\n"
+	"	fldpi""\n"
+	"	fchs""\n"
+	"	fisttpl	res32""\n"
+	"	fldpi""\n"
+	"	fchs""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	if (res16 != 0xfffd || res32 != 0xfffffffd || res64 != 0xfffffffffffffffdULL) {
+		printf("[BAD]\tfisttp -pi\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != FE_INEXACT) {
+		printf("[BAD]\tfisttp -pi: wrong exception state\n");
+		return 1;
+	}
+
+	feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	asm volatile ("\n"
+	"	fldln2""\n"
+	"	fisttp	res16""\n"
+	"	fldln2""\n"
+	"	fisttpl	res32""\n"
+	"	fldln2""\n"
+	"	fisttpll res64""\n"
+	: : : "memory"
+	);
+	/* Test truncation to zero (round-to-nearest would give 1 here) */
+	if (res16 != 0 || res32 != 0 || res64 != 0) {
+		printf("[BAD]\tfisttp ln2\n");
+		return 1;
+	}
+	ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+	if (ex != FE_INEXACT) {
+		printf("[BAD]\tfisttp ln2: wrong exception state\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+void sighandler(int sig)
+{
+	printf("[FAIL]\tGot signal %d, exiting\n", sig);
+	exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+	int err = 0;
+
+	/* SIGILL triggers on 32-bit kernels w/o fisttp emulation
+	 * when run with "no387 nofxsr". Other signals are caught
+	 * just in case.
+	 */
+	signal(SIGILL, sighandler);
+	signal(SIGFPE, sighandler);
+	signal(SIGSEGV, sighandler);
+
+	printf("[RUN]\tTesting fisttp instructions\n");
+	err |= test();
+	if (!err)
+		printf("[OK]\tfisttp\n");
+	else
+		printf("[FAIL]\tfisttp errors: %d\n", err);
+
+	return err;
+}