diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 35bc3c3574c6..138f6664b2e2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,8 +2857,6 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - nofsgsbase [X86] Disables FSGSBASE instructions. - no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst index b87c1d816aea..a48b3f6ebbe8 100644 --- a/Documentation/x86/entry_64.rst +++ b/Documentation/x86/entry_64.rst @@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors that absolutely need the more expensive check for the GS base - and we generate all 'normal' entry points with the regular (faster) paranoid=0 variant. - -On a FSGSBASE system, however, user space can set GS without kernel -interaction. It means the value of GS base itself does not imply anything, -whether a kernel value or a user space value. So, there is no longer a safe -way to check whether the exception is entering from user mode or kernel -mode in the paranoid entry code path. So the GSBASE value needs to be read -out, saved and the kernel GSBASE value written. On exit the saved GSBASE -value needs to be restored unconditionally. The non paranoid entry/exit -code still uses SWAPGS unconditionally as the state is known. diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst deleted file mode 100644 index 380c0b5ccca2..000000000000 --- a/Documentation/x86/x86_64/fsgs.rst +++ /dev/null @@ -1,199 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Using FS and GS segments in user space applications -=================================================== - -The x86 architecture supports segmentation. Instructions which access -memory can use segment register based addressing mode. The following -notation is used to address a byte within a segment: - - Segment-register:Byte-address - -The segment base address is added to the Byte-address to compute the -resulting virtual address which is accessed. This allows to access multiple -instances of data with the identical Byte-address, i.e. the same code. The -selection of a particular instance is purely based on the base-address in -the segment register. - -In 32-bit mode the CPU provides 6 segments, which also support segment -limits. The limits can be used to enforce address space protections. - -In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is -always 0 to provide a full 64bit address space. The FS and GS segments are -still functional in 64-bit mode. - -Common FS and GS usage ------------------------------- - -The FS segment is commonly used to address Thread Local Storage (TLS). FS -is usually managed by runtime code or a threading library. Variables -declared with the '__thread' storage class specifier are instantiated per -thread and the compiler emits the FS: address prefix for accesses to these -variables. Each thread has its own FS base address so common code can be -used without complex address offset calculations to access the per thread -instances. Applications should not use FS for other purposes when they use -runtimes or threading libraries which manage the per thread FS. - -The GS segment has no common use and can be used freely by -applications. GCC and Clang support GS based addressing via address space -identifiers. - -Reading and writing the FS/GS base address ------------------------------------------- - -There exist two mechanisms to read and write the FS/FS base address: - - - the arch_prctl() system call - - - the FSGSBASE instruction family - -Accessing FS/GS base with arch_prctl() --------------------------------------- - - The arch_prctl(2) based mechanism is available on all 64bit CPUs and all - kernel versions. - - Reading the base: - - arch_prctl(ARCH_GET_FS, &fsbase); - arch_prctl(ARCH_GET_GS, &gsbase); - - Writing the base: - - arch_prctl(ARCH_SET_FS, fsbase); - arch_prctl(ARCH_SET_GS, gsbase); - - The ARCH_SET_GS prctl may be disabled depending on kernel configuration - and security settings. - -Accessing FS/GS base with the FSGSBASE instructions ---------------------------------------------------- - - With the Ivy Bridge CPU generation Intel introduced a new set of - instructions to access the FS and GS base registers directly from user - space. These instructions are also supported on AMD Family 17H CPUs. The - following instructions are available: - - =============== =========================== - RDFSBASE %reg Read the FS base register - RDGSBASE %reg Read the GS base register - WRFSBASE %reg Write the FS base register - WRGSBASE %reg Write the GS base register - =============== =========================== - - The instructions avoid the overhead of the arch_prctl() syscall and allow - more flexible usage of the FS/GS addressing modes in user space - applications. This does not prevent conflicts between threading libraries - and runtimes which utilize FS and applications which want to use it for - their own purpose. - -FSGSBASE instructions enablement -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If - available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs. - - The availability of the instructions does not enable them - automatically. The kernel has to enable them explicitly in CR4. The - reason for this is that older kernels make assumptions about the values in - the GS register and enforce them when GS base is set via - arch_prctl(). Allowing user space to write arbitrary values to GS base - would violate these assumptions and cause malfunction. - - On kernels which do not enable FSGSBASE the execution of the FSGSBASE - instructions will fault with a #UD exception. - - The kernel provides reliable information about the enabled state in the - ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the - kernel has FSGSBASE instructions enabled and applications can use them. - The following code example shows how this detection works:: - - #include - #include - - /* Will be eventually in asm/hwcap.h */ - #ifndef HWCAP2_FSGSBASE - #define HWCAP2_FSGSBASE (1 << 1) - #endif - - .... - - unsigned val = getauxval(AT_HWCAP2); - - if (val & HWCAP2_FSGSBASE) - printf("FSGSBASE enabled\n"); - -FSGSBASE instructions compiler support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE -instructions. Clang supports them as well. - - =================== =========================== - _readfsbase_u64() Read the FS base register - _readfsbase_u64() Read the GS base register - _writefsbase_u64() Write the FS base register - _writegsbase_u64() Write the GS base register - =================== =========================== - -To utilize these instrinsics must be included in the source -code and the compiler option -mfsgsbase has to be added. - -Compiler support for FS/GS based addressing -------------------------------------------- - -GCC version 6 and newer provide support for FS/GS based addressing via -Named Address Spaces. GCC implements the following address space -identifiers for x86: - - ========= ==================================== - __seg_fs Variable is addressed relative to FS - __seg_gs Variable is addressed relative to GS - ========= ==================================== - -The preprocessor symbols __SEG_FS and __SEG_GS are defined when these -address spaces are supported. Code which implements fallback modes should -check whether these symbols are defined. Usage example:: - - #ifdef __SEG_GS - - long data0 = 0; - long data1 = 1; - - long __seg_gs *ptr; - - /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */ - .... - - /* Set GS to point to data0 */ - _writegsbase_u64(&data0); - - /* Access offset 0 of GS */ - ptr = 0; - printf("data0 = %ld\n", *ptr); - - /* Set GS to point to data1 */ - _writegsbase_u64(&data1); - /* ptr still addresses offset 0! */ - printf("data1 = %ld\n", *ptr); - - -Clang does not provide the GCC address space identifiers, but it provides -address spaces via an attribute based mechanism in Clang 5 and newer -versions: - - ==================================== ===================================== - __attribute__((address_space(256)) Variable is addressed relative to GS - __attribute__((address_space(257)) Variable is addressed relative to FS - ==================================== ===================================== - -FS/GS based addressing with inline assembly -------------------------------------------- - -In case the compiler does not support address spaces, inline assembly can -be used for FS/GS based addressing mode:: - - mov %fs:offset, %reg - mov %gs:offset, %reg - - mov %reg, %fs:offset - mov %reg, %gs:offset diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst index a56070fc8e77..d6eaaa5a35fc 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/x86/x86_64/index.rst @@ -14,4 +14,3 @@ x86_64 Support fake-numa-for-cpusets cpu-hotplug-spec machinecheck - fsgs diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index d3fbe2dc03ea..efb0d1b1f15f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,7 +6,6 @@ #include #include #include -#include /* @@ -338,12 +337,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req - rdgsbase \save_reg - GET_PERCPU_BASE \scratch_reg - wrgsbase \scratch_reg -.endm - #endif /* CONFIG_X86_64 */ .macro STACKLEAK_ERASE @@ -352,39 +345,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -#ifdef CONFIG_SMP - -/* - * CPU/node NR is loaded from the limit (size) field of a special segment - * descriptor entry in GDT. - */ -.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req - movq $__CPUNODE_SEG, \reg - lsl \reg, \reg -.endm - -/* - * Fetch the per-CPU GSBASE value for this processor and put it in @reg. - * We normally use %gs for accessing per-CPU data, but we are setting up - * %gs here and obviously can not use %gs itself to access per-CPU data. - */ -.macro GET_PERCPU_BASE reg:req - ALTERNATIVE \ - "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ - "RDPID \reg", \ - X86_FEATURE_RDPID - andq $VDSO_CPUNODE_MASK, \reg - movq __per_cpu_offset(, \reg, 8), \reg -.endm - -#else - -.macro GET_PERCPU_BASE reg:req - movq pcpu_unit_offsets(%rip), \reg -.endm - -#endif /* CONFIG_SMP */ - /* * This does 'call enter_from_user_mode' unless we can avoid it based on * kernel config or using the static jump infrastructure. diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7f9f5119d6b1..3b7a0e8d3bc0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,7 +38,6 @@ #include #include #include -#include #include #include "calling.h" @@ -948,6 +947,7 @@ ENTRY(\sym) addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid jmp paranoid_exit .else @@ -1164,21 +1164,24 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* - * Save all registers in pt_regs. Return GSBASE related information - * in EBX depending on the availability of the FSGSBASE instructions: - * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y GSBASE value at entry, must be restored in paranoid_exit + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 + movl $1, %ebx + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx +1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -1188,49 +1191,9 @@ ENTRY(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. - * - * Switching CR3 does not depend on kernel GSBASE so it can - * be done before switching to the kernel GSBASE. This is - * required for FSGSBASE because the kernel GSBASE has to - * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 - /* - * Handling GSBASE depends on the availability of FSGSBASE. - * - * Without FSGSBASE the kernel enforces that negative GSBASE - * values indicate kernel GSBASE. With FSGSBASE no assumptions - * can be made about the GSBASE value when entering from user - * space. - */ - ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE - - /* - * Read the current GSBASE and store it in in %rbx unconditionally, - * retrieve and set the current CPUs kernel GSBASE. The stored value - * has to be restored in paranoid_exit unconditionally. - */ - SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx - ret - -.Lparanoid_entry_checkgs: - /* EBX = 1 -> kernel GSBASE active, no restore required */ - movl $1, %ebx - /* - * The kernel-enforced convention is a negative GSBASE indicates - * a kernel value. No SWAPGS needed on entry and exit. - */ - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - jns .Lparanoid_entry_swapgs - ret - -.Lparanoid_entry_swapgs: - SWAPGS - /* EBX = 0 -> SWAPGS required on exit */ - xorl %ebx, %ebx ret END(paranoid_entry) @@ -1241,47 +1204,28 @@ END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, there's no good reason to try - * to handle preemption here. + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. * - * R/EBX contains the GSBASE related information depending on the - * availability of the FSGSBASE instructions: - * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y User space GSBASE, must be restored unconditionally + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG - - /* Handle GS depending on FSGSBASE availability */ - ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE - - /* With FSGSBASE enabled, unconditionally restore GSBASE */ - wrgsbase %rbx - jmp .Lparanoid_exit_no_swapgs; - -.Lparanoid_exit_checkgs: - /* On non-FSGSBASE systems, conditionally do SWAPGS */ - testl %ebx, %ebx + testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore - .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - .Lparanoid_exit_restore: - jmp restore_regs_and_return_to_kernel + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1692,27 +1636,10 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - /* - * The above invocation of paranoid_entry stored the GSBASE - * related information in R/EBX depending on the availability - * of FSGSBASE. - * - * If FSGSBASE is enabled, restore the saved GSBASE value - * unconditionally, otherwise take the conditional SWAPGS path. - */ - ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE - - wrgsbase %rbx - jmp nmi_restore - -nmi_no_fsgsbase: - /* EBX == 0 -> invoke SWAPGS */ - testl %ebx, %ebx + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore - nmi_swapgs: SWAPGS_UNSAFE_STACK - nmi_restore: POP_REGS @@ -1743,11 +1670,17 @@ nmi_restore: iretq END(nmi) +#ifndef CONFIG_IA32_EMULATION +/* + * This handles SYSCALL from 32-bit code. There is no way to program + * MSRs to fully disable 32-bit SYSCALL. + */ ENTRY(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) +#endif ENTRY(rewind_stack_do_exit) UNWIND_HINT_FUNC diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index aefd53767a5d..bca4c743de77 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,62 +19,35 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); -/* Must be protected by X86_FEATURE_FSGSBASE check. */ - -static __always_inline unsigned long rdfsbase(void) -{ - unsigned long fsbase; - - asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); - - return fsbase; -} - -static __always_inline unsigned long rdgsbase(void) -{ - unsigned long gsbase; - - asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); - - return gsbase; -} - -static __always_inline void wrfsbase(unsigned long fsbase) -{ - asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); -} - -static __always_inline void wrgsbase(unsigned long gsbase) -{ - asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); -} - -#include - /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - fsbase = rdfsbase(); - else - rdmsrl(MSR_FS_BASE, fsbase); + rdmsrl(MSR_FS_BASE, fsbase); return fsbase; } -static inline void x86_fsbase_write_cpu(unsigned long fsbase) +static inline unsigned long x86_gsbase_read_cpu_inactive(void) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - wrfsbase(fsbase); - else - wrmsrl(MSR_FS_BASE, fsbase); + unsigned long gsbase; + + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + + return gsbase; } -extern unsigned long x86_gsbase_read_cpu_inactive(void); -extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); +static inline void x86_fsbase_write_cpu(unsigned long fsbase) +{ + wrmsrl(MSR_FS_BASE, fsbase); +} + +static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); +} #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index d063841a17e3..f5a796da07f8 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,21 +306,6 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm - -.macro RDPID opd - REG_TYPE rdpid_opd_type \opd - .if rdpid_opd_type == REG_TYPE_R64 - R64_NUM rdpid_opd \opd - .else - R32_NUM rdpid_opd \opd - .endif - .byte 0xf3 - .if rdpid_opd > 7 - PFX_REX rdpid_opd 0 - .endif - .byte 0x0f, 0xc7 - MODRM 0xc0 rdpid_opd 0x7 -.endm #endif #endif diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index c5ce54e749f6..6ebaae90e207 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,7 +5,4 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) -/* Kernel allows FSGSBASE instructions available in Ring 3 */ -#define HWCAP2_FSGSBASE BIT(1) - #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0948fc25446a..482f74859fb7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,22 +366,6 @@ out: cr4_clear_bits(X86_CR4_UMIP); } -static __init int x86_nofsgsbase_setup(char *arg) -{ - /* Require an exact match without trailing characters. */ - if (strlen(arg)) - return 0; - - /* Do not emit a message if the feature is not present. */ - if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) - return 1; - - setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); - pr_info("FSGSBASE disabled via kernel command line\n"); - return 1; -} -__setup("nofsgsbase", x86_nofsgsbase_setup); - /* * Protection Keys are not available in 32-bit mode. */ @@ -1387,12 +1371,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); - /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) { - cr4_set_bits(X86_CR4_FSGSBASE); - elf_hwcap2 |= HWCAP2_FSGSBASE; - } - /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f17c1a714779..8d6d92ebeb54 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +/* + * Processors which have self-snooping capability can handle conflicting + * memory type across CPUs by snooping its own cache. However, there exists + * CPU models in which having conflicting memory types still leads to + * unpredictable behavior, machine check errors, or hangs. Clear this + * feature to prevent its use on machines with known erratas. + */ +static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) +{ + switch (c->x86_model) { + case INTEL_FAM6_CORE_YONAH: + case INTEL_FAM6_CORE2_MEROM: + case INTEL_FAM6_CORE2_MEROM_L: + case INTEL_FAM6_CORE2_PENRYN: + case INTEL_FAM6_CORE2_DUNNINGTON: + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_G: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_SANDYBRIDGE: + setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); + } +} + static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) @@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) } check_mpx_erratum(c); + check_memory_type_self_snoop_errata(c); /* * Get the number of SMT siblings early from the extended topology diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9356c1c9024d..aa5c064a6a22 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - wbinvd(); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (boot_cpu_has(X86_FEATURE_PGE)) { @@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); - wbinvd(); + + /* Again, only flush caches if we have to. */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); } static void post_set(void) __releases(set_atomicity_lock) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8f239091c15d..250e4c4ac6d9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,40 +161,6 @@ enum which_selector { GS }; -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline unsigned long __rdgsbase_inactive(void) -{ - unsigned long gsbase; - - lockdep_assert_irqs_disabled(); - - native_swapgs(); - gsbase = rdgsbase(); - native_swapgs(); - - return gsbase; -} -NOKPROBE_SYMBOL(__rdgsbase_inactive); - -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline void __wrgsbase_inactive(unsigned long gsbase) -{ - lockdep_assert_irqs_disabled(); - - native_swapgs(); - wrgsbase(gsbase); - native_swapgs(); -} -NOKPROBE_SYMBOL(__wrgsbase_inactive); - /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. @@ -244,22 +210,8 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* - * If FSGSBASE is enabled, we can't make any useful guesses - * about the base, and user code expects us to save the current - * value. Fortunately, reading the base directly is efficient. - */ - task->thread.fsbase = rdfsbase(); - local_irq_save(flags); - task->thread.gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); - } + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); } #if IS_ENABLED(CONFIG_KVM) @@ -338,22 +290,10 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - /* Update the FS and GS selectors if they could have changed. */ - if (unlikely(prev->fsindex || next->fsindex)) - loadseg(FS, next->fsindex); - if (unlikely(prev->gsindex || next->gsindex)) - loadseg(GS, next->gsindex); - - /* Update the bases. */ - wrfsbase(next->fsbase); - __wrgsbase_inactive(next->gsbase); - } else { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); - } + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); } static unsigned long x86_fsgsbase_read_task(struct task_struct *task, @@ -399,46 +339,13 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } -unsigned long x86_gsbase_read_cpu_inactive(void) -{ - unsigned long gsbase; - - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. */ - local_irq_save(flags); - gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); - } - - return gsbase; -} - -void x86_gsbase_write_cpu_inactive(unsigned long gsbase) -{ - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. */ - local_irq_save(flags); - __wrgsbase_inactive(gsbase); - local_irq_restore(flags); - } else { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); - } -} - unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.fsindex == 0)) + else if (task->thread.fsindex == 0) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -452,8 +359,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.gsindex == 0)) + else if (task->thread.gsindex == 0) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); @@ -493,11 +399,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; - save_fsgs(me); - p->thread.fsindex = me->thread.fsindex; - p->thread.fsbase = me->thread.fsbase; - p->thread.gsindex = me->thread.gsindex; - p->thread.gsbase = me->thread.gsbase; + savesegment(gs, p->thread.gsindex); + p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; + savesegment(fs, p->thread.fsindex); + p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 186520198de7..fa07d526fe39 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,8 +12,9 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ + protection_keys test_vdso test_vsyscall mov_ss_trap \ + syscall_arg_fault +TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 21fd4f94b5b0..5ab4c60c100e 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -35,6 +35,8 @@ static volatile sig_atomic_t want_segv; static volatile unsigned long segv_addr; +static unsigned short *shared_scratch; + static int nerrs; static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), @@ -242,16 +244,11 @@ static void do_remote_base() static __thread int set_thread_area_entry_number = -1; -static void do_unexpected_base(void) +static unsigned short load_gs(void) { /* - * The goal here is to try to arrange for GS == 0, GSBASE != - * 0, and for the the kernel the think that GSBASE == 0. - * - * To make the test as reliable as possible, this uses - * explicit descriptors. (This is not the only way. This - * could use ARCH_SET_GS with a low, nonzero base, but the - * relevant side effect of ARCH_SET_GS could change.) + * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think + * that GSBASE == 0 (i.e. thread.gsbase == 0). */ /* Step 1: tell the kernel that we have GSBASE == 0. */ @@ -271,8 +268,9 @@ static void do_unexpected_base(void) .useable = 0 }; if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) { - printf("\tother thread: using LDT slot 0\n"); + printf("\tusing LDT slot 0\n"); asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); + return 0x7; } else { /* No modify_ldt for us (configured out, perhaps) */ @@ -294,20 +292,15 @@ static void do_unexpected_base(void) if (ret != 0) { printf("[NOTE]\tcould not create a segment -- test won't do anything\n"); - return; + return 0; } - printf("\tother thread: using GDT slot %d\n", desc.entry_number); + printf("\tusing GDT slot %d\n", desc.entry_number); set_thread_area_entry_number = desc.entry_number; - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3))); + unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3); + asm volatile ("mov %0, %%gs" : : "rm" (gs)); + return gs; } - - /* - * Step 3: set the selector back to zero. On AMD chips, this will - * preserve GSBASE. - */ - - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); } void test_wrbase(unsigned short index, unsigned long base) @@ -346,12 +339,19 @@ static void *threadproc(void *ctx) if (ftx == 3) return NULL; - if (ftx == 1) + if (ftx == 1) { do_remote_base(); - else if (ftx == 2) - do_unexpected_base(); - else + } else if (ftx == 2) { + /* + * On AMD chips, this causes GSBASE != 0, GS == 0, and + * thread.gsbase == 0. + */ + + load_gs(); + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + } else { errx(1, "helper thread got bad command"); + } ftx = 0; syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); @@ -453,12 +453,7 @@ static void test_ptrace_write_gsbase(void) if (child == 0) { printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n"); - /* - * Use the LDT setup and fetch the GSBASE from the LDT - * by switching to the (nonzero) selector (again) - */ - do_unexpected_base(); - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); + *shared_scratch = load_gs(); if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0) err(1, "PTRACE_TRACEME"); @@ -476,7 +471,7 @@ static void test_ptrace_write_gsbase(void) gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL); - if (gs != 0x7) { + if (gs != *shared_scratch) { nerrs++; printf("[FAIL]\tGS is not prepared with nonzero\n"); goto END; @@ -494,16 +489,24 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs != 0x7) { + if (gs != *shared_scratch) { nerrs++; printf("[FAIL]\tGS changed to %lx\n", gs); + + /* + * On older kernels, poking a nonzero value into the + * base would zero the selector. On newer kernels, + * this behavior has changed -- poking the base + * changes only the base and, if FSGSBASE is not + * available, this may have no effect. + */ + if (gs == 0) + printf("\tNote: this is expected behavior on older kernels.\n"); } else if (have_fsgsbase && (base != 0xFF)) { nerrs++; printf("[FAIL]\tGSBASE changed to %lx\n", base); } else { - printf("[OK]\tGS remained 0x7 %s"); - if (have_fsgsbase) - printf("and GSBASE changed to 0xFF"); + printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : ""); printf("\n"); } } @@ -516,6 +519,9 @@ int main() { pthread_t thread; + shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + /* Probe FSGSBASE */ sethandler(SIGILL, sigill, 0); if (sigsetjmp(jmpbuf, 1) == 0) { diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index 4e25d38c8bbd..bc0ecc2e862e 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -15,9 +15,30 @@ #include #include +#ifdef __x86_64__ +# define WIDTH "q" +#else +# define WIDTH "l" +#endif + /* Our sigaltstack scratch space. */ static unsigned char altstack_data[SIGSTKSZ]; +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH + : : "rm" (eflags) : "flags"); +} + +#define X86_EFLAGS_TF (1UL << 8) + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -35,13 +56,22 @@ static sigjmp_buf jmpbuf; static volatile sig_atomic_t n_errs; +#ifdef __x86_64__ +#define REG_AX REG_RAX +#define REG_IP REG_RIP +#else +#define REG_AX REG_EAX +#define REG_IP REG_EIP +#endif + static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + long ax = (long)ctx->uc_mcontext.gregs[REG_AX]; - if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) { - printf("[FAIL]\tAX had the wrong value: 0x%x\n", - ctx->uc_mcontext.gregs[REG_EAX]); + if (ax != -EFAULT && ax != -ENOSYS) { + printf("[FAIL]\tAX had the wrong value: 0x%lx\n", + (unsigned long)ax); n_errs++; } else { printf("[OK]\tSeems okay\n"); @@ -50,9 +80,42 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) siglongjmp(jmpbuf, 1); } +static volatile sig_atomic_t sigtrap_consecutive_syscalls; + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + /* + * KVM has some bugs that can cause us to stop making progress. + * detect them and complain, but don't infinite loop or fail the + * test. + */ + + ucontext_t *ctx = (ucontext_t*)ctx_void; + unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP]; + + if (*ip == 0x340f || *ip == 0x050f) { + /* The trap was on SYSCALL or SYSENTER */ + sigtrap_consecutive_syscalls++; + if (sigtrap_consecutive_syscalls > 3) { + printf("[WARN]\tGot stuck single-stepping -- you probably have a KVM bug\n"); + siglongjmp(jmpbuf, 1); + } + } else { + sigtrap_consecutive_syscalls = 0; + } +} + static void sigill(int sig, siginfo_t *info, void *ctx_void) { - printf("[SKIP]\tIllegal instruction\n"); + ucontext_t *ctx = (ucontext_t*)ctx_void; + unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP]; + + if (*ip == 0x0b0f) { + /* one of the ud2 instructions faulted */ + printf("[OK]\tSYSCALL returned normally\n"); + } else { + printf("[SKIP]\tIllegal instruction\n"); + } siglongjmp(jmpbuf, 1); } @@ -120,9 +183,48 @@ int main() "movl $-1, %%ebp\n\t" "movl $-1, %%esp\n\t" "syscall\n\t" - "pushl $0" /* make sure we segfault cleanly */ + "ud2" /* make sure we recover cleanly */ : : : "memory", "flags"); } + printf("[RUN]\tSYSENTER with TF and invalid state\n"); + sethandler(SIGTRAP, sigtrap, SA_ONSTACK); + + if (sigsetjmp(jmpbuf, 1) == 0) { + sigtrap_consecutive_syscalls = 0; + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "sysenter" + : : : "memory", "flags"); + } + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + + printf("[RUN]\tSYSCALL with TF and invalid state\n"); + if (sigsetjmp(jmpbuf, 1) == 0) { + sigtrap_consecutive_syscalls = 0; + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "syscall\n\t" + "ud2" /* make sure we recover cleanly */ + : : : "memory", "flags"); + } + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + return 0; }