From 947134d9b00f342415af7eddd42a5fce7262a1b9 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava
Date: Mon, 4 Dec 2017 11:45:21 -0500
Subject: [PATCH 01/11] x86/smpboot: Do not use smp_num_siblings in
 __max_logical_packages calculation

Documentation/x86/topology.txt defines smp_num_siblings as "The number
of threads in a core". Since commit bbb65d2d365e ("x86: use cpuid vector
0xb when available for detecting cpu topology") smp_num_siblings is the
maximum number of threads in a core. If Simultaneous MultiThreading
(SMT) is disabled on a system, smp_num_siblings is 2 and not 1 as
expected.

Use topology_max_smt_threads(), which contains the active number of
threads, in the __max_logical_packages calculation.

On a single socket, single core, single thread system __max_smt_threads
has not been updated when the __max_logical_packages calculation
happens, so it is zero, which makes the package estimate fail.
Initialize it to one, which is the minimum number of threads on a core.

[ tglx: Folded the __max_smt_threads fix in ]

Fixes: b4c0a7326f5d ("x86/smpboot: Fix __max_logical_packages estimate")
Reported-by: Jakub Kicinski
Signed-off-by: Prarit Bhargava
Tested-by: Jakub Kicinski
Cc: netdev@vger.kernel.org
Cc: Clark Williams
Link: https://lkml.kernel.org/r/20171204164521.17870-1-prarit@redhat.com
---
 arch/x86/kernel/smpboot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 05a97d5fe298..35cb20994e32 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(__max_logical_packages);
 static unsigned int logical_packages __read_mostly;

 /* Maximum number of SMT threads on any online core */
-int __max_smt_threads __read_mostly;
+int __read_mostly __max_smt_threads = 1;

 /* Flag to indicate if a complete sched domain rebuild is required */
 bool x86_topology_update;
@@ -1304,7 +1304,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
	 * Today neither Intel nor AMD support heterogenous systems so
	 * extrapolate the boot cpu's data to all packages.
	 */
-	ncpus = cpu_data(0).booted_cores * smp_num_siblings;
+	ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
	__max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
	pr_info("Max logical packages: %u\n", __max_logical_packages);
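
To see the arithmetic of this fix, here is a minimal userspace sketch of
the __max_logical_packages calculation; all topology values are made up
and merely stand in for cpu_data(0), nr_cpu_ids and the two thread
counts. On a hypothetical 2-socket box with SMT disabled, the old
formula undercounts packages by a factor of two:

  #include <stdio.h>

  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
      unsigned int nr_cpu_ids = 8;        /* assumed: 8 possible CPUs, 2 sockets */
      unsigned int booted_cores = 4;      /* assumed: 4 cores per package */
      unsigned int smp_num_siblings = 2;  /* hardware maximum, even with SMT off */
      unsigned int max_smt_threads = 1;   /* active threads; SMT disabled */

      /* Old: 8 / (4 * 2) = 1 package -- wrong on this 2-socket box. */
      printf("old: %u\n", DIV_ROUND_UP(nr_cpu_ids, booted_cores * smp_num_siblings));
      /* New: 8 / (4 * 1) = 2 packages. */
      printf("new: %u\n", DIV_ROUND_UP(nr_cpu_ids, booted_cores * max_smt_threads));
      return 0;
  }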
From 08529078d8d9adf689bf39cc38d53979a0869970 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Mon, 4 Dec 2017 15:40:55 +0300
Subject: [PATCH 02/11] x86/boot/compressed/64: Detect and handle 5-level
 paging at boot-time

Prerequisite for fixing the current problem of instantaneous reboots
when a 5-level paging kernel is booted on 4-level paging hardware.

At the same time this change prepares the decompression code for
boot-time switching between 4- and 5-level paging.

[ tglx: Folded the GCC < 5 fix. ]

Fixes: 77ef56e4f0fb ("x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y")
Signed-off-by: Kirill A. Shutemov
Signed-off-by: Thomas Gleixner
Cc: Andi Kleen
Cc: stable@vger.kernel.org
Cc: Andy Lutomirski
Cc: linux-mm@kvack.org
Cc: Cyrill Gorcunov
Cc: Borislav Petkov
Cc: Linus Torvalds
Link: https://lkml.kernel.org/r/20171204124059.63515-2-kirill.shutemov@linux.intel.com
---
 arch/x86/boot/compressed/Makefile     |  1 +
 arch/x86/boot/compressed/head_64.S    | 16 +++++++++++----
 arch/x86/boot/compressed/pgtable_64.c | 28 +++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/boot/compressed/pgtable_64.c

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1e9c322e973a..f25e1530e064 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -80,6 +80,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
 ifdef CONFIG_X86_64
	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
	vmlinux-objs-y += $(obj)/mem_encrypt.o
+	vmlinux-objs-y += $(obj)/pgtable_64.o
 endif

 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 20919b4f3133..fc313e29fe2c 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -305,10 +305,18 @@ ENTRY(startup_64)
	leaq	boot_stack_end(%rbx), %rsp

 #ifdef CONFIG_X86_5LEVEL
-	/* Check if 5-level paging has already enabled */
-	movq	%cr4, %rax
-	testl	$X86_CR4_LA57, %eax
-	jnz	lvl5
+	/*
+	 * Check if we need to enable 5-level paging.
+	 * RSI holds real mode data and needs to be preserved across
+	 * a function call.
+	 */
+	pushq	%rsi
+	call	l5_paging_required
+	popq	%rsi
+
+	/* If l5_paging_required() returned zero, we're done here. */
+	cmpq	$0, %rax
+	je	lvl5

	/*
	 * At this point we are in long mode with 4-level paging enabled,

diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
new file mode 100644
index 000000000000..b4469a37e9a1
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -0,0 +1,28 @@
+#include <asm/processor.h>
+
+/*
+ * __force_order is used by special_insns.h asm code to force instruction
+ * serialization.
+ *
+ * It is not referenced from the code, but GCC < 5 with -fPIE would fail
+ * due to an undefined symbol. Define it to make these ancient GCCs work.
+ */
+unsigned long __force_order;
+
+int l5_paging_required(void)
+{
+	/* Check if leaf 7 is supported. */
+	if (native_cpuid_eax(0) < 7)
+		return 0;
+
+	/* Check if la57 is supported. */
+	if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+		return 0;
+
+	/* Check if 5-level paging has already been enabled. */
+	if (native_read_cr4() & X86_CR4_LA57)
+		return 0;
+
+	return 1;
+}
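
The same la57 probe can be reproduced in user space with the compiler's
<cpuid.h> helpers instead of the kernel's native_cpuid_*() wrappers. A
rough sketch, assuming GCC or clang; the CR4.LA57 half of the check is
omitted, since CR4 is not readable from user mode:

  #include <stdio.h>
  #include <cpuid.h>

  static int la57_supported(void)
  {
      unsigned int eax, ebx, ecx, edx;

      if (__get_cpuid_max(0, NULL) < 7)   /* is leaf 7 there at all? */
          return 0;
      if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
          return 0;
      return !!(ecx & (1U << 16));        /* leaf 7, ECX bit 16 == la57 */
  }

  int main(void)
  {
      printf("la57 %ssupported\n", la57_supported() ? "" : "not ");
      return 0;
  }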
From 6d7e0ba2d2be9e50cccba213baf07e0e183c1b24 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Mon, 4 Dec 2017 15:40:56 +0300
Subject: [PATCH 03/11] x86/boot/compressed/64: Print error if 5-level paging
 is not supported

If the machine does not support the paging mode for which the kernel
was compiled, the boot process cannot continue.

It's not possible to let the kernel detect the mismatch, as it does not
even reach the point where cpu features can be evaluated due to a
triple fault in the KASLR setup.

Instead of an instantaneous silent reboot, emit an error message which
gives the user the information why the boot fails.

Fixes: 77ef56e4f0fb ("x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y")
Reported-by: Borislav Petkov
Signed-off-by: Kirill A. Shutemov
Signed-off-by: Thomas Gleixner
Tested-by: Borislav Petkov
Cc: Andi Kleen
Cc: stable@vger.kernel.org
Cc: Andy Lutomirski
Cc: linux-mm@kvack.org
Cc: Cyrill Gorcunov
Cc: Linus Torvalds
Link: https://lkml.kernel.org/r/20171204124059.63515-3-kirill.shutemov@linux.intel.com
---
 arch/x86/boot/compressed/misc.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b50c42455e25..98761a1576ce 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -169,6 +169,16 @@ void __puthex(unsigned long value)
	}
 }

+static bool l5_supported(void)
+{
+	/* Check if leaf 7 is supported. */
+	if (native_cpuid_eax(0) < 7)
+		return 0;
+
+	/* Check if la57 is supported. */
+	return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
+}
+
 #if CONFIG_X86_NEED_RELOCS
 static void handle_relocations(void *output, unsigned long output_len,
			       unsigned long virt_addr)
@@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
	console_init();
	debug_putstr("early console in extract_kernel\n");

+	if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
+		error("This linux kernel as configured requires 5-level paging\n"
+		      "This CPU does not support the required 'cr4.la57' feature\n"
+		      "Unable to boot - please use a kernel appropriate for your CPU\n");
+	}
+
	free_mem_ptr     = heap;	/* Heap */
	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
From 6d60ce384d1d5ca32b595244db4077a419acc687 Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Mon, 27 Nov 2017 08:51:39 +0100
Subject: [PATCH 04/11] x86/mm/kmmio: Fix mmiotrace for page unaligned
 addresses

If something calls ioremap() with an address not aligned to PAGE_SIZE,
the returned address might not be aligned either. This led to a probe
being registered on exactly the returned address, while the entire page
was armed for mmiotracing.

On calling iounmap() the address passed to unregister_kmmio_probe() was
PAGE_SIZE aligned by the caller, leading to a complete freeze of the
machine.

We should always page align addresses while (un)registering mappings,
because the mmiotracer works on top of pages, not mappings. We still
keep track of the probes based on their real addresses and lengths
though, because mmiotrace still needs to know which memory regions are
mapped.

Also move the call to mmiotrace_iounmap() prior to page aligning the
address, so that all probes are unregistered properly; otherwise the
kernel ends up failing memory allocations randomly after disabling the
mmiotracer.

Tested-by: Lyude
Signed-off-by: Karol Herbst
Acked-by: Pekka Paalanen
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Thomas Gleixner
Cc: nouveau@lists.freedesktop.org
Link: http://lkml.kernel.org/r/20171127075139.4928-1-kherbst@redhat.com
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/ioremap.c |  4 ++--
 arch/x86/mm/kmmio.c   | 12 +++++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 6e4573b1da34..c45b6ec5357b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr)
		return;
	}

+	mmiotrace_iounmap(addr);
+
	addr = (volatile void __iomem *)
		(PAGE_MASK & (unsigned long __force)addr);

-	mmiotrace_iounmap(addr);
-
	/* Use the vm area unlocked, assuming the caller
	   ensures there isn't another iounmap for the same address
	   in parallel. Reuse of the virtual address is prevented by

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index c21c2ed04612..58477ec3d66d 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p)
	unsigned long flags;
	int ret = 0;
	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
	unsigned int l;
	pte_t *pte;

	spin_lock_irqsave(&kmmio_lock, flags);
-	if (get_kmmio_probe(p->addr)) {
+	if (get_kmmio_probe(addr)) {
		ret = -EEXIST;
		goto out;
	}

-	pte = lookup_address(p->addr, &l);
+	pte = lookup_address(addr, &l);
	if (!pte) {
		ret = -EINVAL;
		goto out;
@@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
	kmmio_count++;
	list_add_rcu(&p->list, &kmmio_probes);
	while (size < size_lim) {
-		if (add_kmmio_fault_page(p->addr + size))
+		if (add_kmmio_fault_page(addr + size))
			pr_err("Unable to set page fault.\n");
		size += page_level_size(l);
	}
@@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
 {
	unsigned long flags;
	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
	struct kmmio_fault_page *release_list = NULL;
	struct kmmio_delayed_release *drelease;
	unsigned int l;
	pte_t *pte;

-	pte = lookup_address(p->addr, &l);
+	pte = lookup_address(addr, &l);
	if (!pte)
		return;

	spin_lock_irqsave(&kmmio_lock, flags);
	while (size < size_lim) {
-		release_kmmio_fault_page(p->addr + size, &release_list);
+		release_kmmio_fault_page(addr + size, &release_list);
		size += page_level_size(l);
	}
	list_del_rcu(&p->list);
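
To make the alignment logic above concrete, a standalone sketch of how
register/unregister now derive the armed page range from an unaligned
probe; the address and length here are invented, and note that the
kernel keeps p->addr and p->len untouched and only aligns a working
copy:

  #include <stdio.h>

  #define PAGE_SIZE 4096UL
  #define PAGE_MASK (~(PAGE_SIZE - 1))

  int main(void)
  {
      unsigned long probe_addr = 0xfd000004UL;  /* unaligned ioremap() result */
      unsigned long probe_len  = 0x20UL;

      unsigned long addr = probe_addr & PAGE_MASK;
      /* Pad the length by the in-page offset so the loop covers every page. */
      unsigned long size_lim = probe_len + (probe_addr & ~PAGE_MASK);
      unsigned long size;

      for (size = 0; size < size_lim; size += PAGE_SIZE)
          printf("arm page at %#lx\n", addr + size);
      return 0;
  }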
From f79ce87fa49da778a1ad54c7d3c6755e13cf8489 Mon Sep 17 00:00:00 2001
From: Changbin Du
Date: Thu, 30 Nov 2017 22:51:20 +0800
Subject: [PATCH 05/11] x86/build: Don't verify mtools configuration file for
 isoimage

If mtools.conf has not been generated yet, 'make isoimage' fails with:

  Kernel: arch/x86/boot/bzImage is ready  (#597)
  GENIMAGE arch/x86/boot/image.iso
  *** Missing file: arch/x86/boot/mtools.conf
  arch/x86/boot/Makefile:144: recipe for target 'isoimage' failed

mtools.conf is not used for isoimage generation, so do not check it
there.

Signed-off-by: Changbin Du
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Fixes: 4366d57af1 ("x86/build: Factor out fdimage/isoimage generation commands to standalone script")
Link: http://lkml.kernel.org/r/1512053480-8083-1-git-send-email-changbin.du@intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/boot/genimage.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index 49f4970f693b..c9e8499fbfe7 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -44,9 +44,9 @@ FDINITRD=$6

 # Make sure the files actually exist
 verify "$FBZIMAGE"
-verify "$MTOOLSRC"

 genbzdisk() {
+	verify "$MTOOLSRC"
	mformat a:
	syslinux $FIMAGE
	echo "$KCMDLINE" | mcopy - a:syslinux.cfg
@@ -57,6 +57,7 @@ genbzdisk() {
 }

 genfdimage144() {
+	verify "$MTOOLSRC"
	dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
	mformat v:
	syslinux $FIMAGE
@@ -68,6 +69,7 @@ genfdimage144() {
 }

 genfdimage288() {
+	verify "$MTOOLSRC"
	dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
	mformat w:
	syslinux $FIMAGE
From 0a373d4fc248cb707821d7dad54ce6d5bcb0cdfe Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin
Date: Thu, 30 Nov 2017 15:35:54 +0300
Subject: [PATCH 06/11] x86/unwinder/guess: Prevent using
 CONFIG_UNWINDER_GUESS=y with CONFIG_STACKDEPOT=y

Stackdepot doesn't work well with CONFIG_UNWINDER_GUESS=y. The 'guess'
unwinder generates awfully large and inaccurate stacktraces, so
stackdepot can't deduplicate them because they all look unique.
Eventually stackdepot reaches its capacity limit:

  WARNING: CPU: 0 PID: 545 at lib/stackdepot.c:119 depot_save_stack+0x28e/0x550
  Call Trace:
   ? kasan_kmalloc+0x144/0x160
   ? depot_save_stack+0x1f5/0x550
   ? do_raw_spin_unlock+0xda/0xf0
   ? preempt_count_sub+0x13/0xc0
   <...90 lines...>
   ? do_raw_spin_unlock+0xda/0xf0

Add a STACKDEPOT=n dependency to UNWINDER_GUESS to avoid the problem.

Reported-by: kernel test robot
Reported-by: Fengguang Wu
Signed-off-by: Andrey Ryabinin
Acked-by: Dmitry Vyukov
Acked-by: Josh Poimboeuf
Cc: Alexander Potapenko
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20171130123554.4330-1-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar
---
 arch/x86/Kconfig.debug | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 6293a8768a91..672441c008c7 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER
 config UNWINDER_GUESS
	bool "Guess unwinder"
	depends on EXPERT
+	depends on !STACKDEPOT
	---help---
	  This option enables the "guess" unwinder for unwinding kernel stack
	  traces.  It scans the stack and reports every kernel text address it
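
Why the guess unwinder defeats deduplication can be shown with a toy
model; only the dedup-by-hash idea mirrors lib/stackdepot.c, everything
else here is invented. Traces full of effectively random stale text
addresses never repeat, so the depot only ever grows:

  #include <stdio.h>
  #include <stdlib.h>

  static unsigned int hash_trace(const unsigned long *e, int n)
  {
      unsigned int h = 2166136261u;  /* FNV-1a over the trace entries */
      int i;

      for (i = 0; i < n; i++)
          h = (h ^ e[i]) * 16777619u;
      return h;
  }

  int main(void)
  {
      enum { SLOTS = 1024 };
      static char seen[SLOTS];
      int stored = 0, t, i;

      for (t = 0; t < 2000; t++) {
          unsigned long trace[8];
          unsigned int slot;

          for (i = 0; i < 8; i++)
              trace[i] = rand();  /* "guessed" stack entries */
          slot = hash_trace(trace, 8) % SLOTS;
          if (!seen[slot]) {
              seen[slot] = 1;
              stored++;   /* nothing dedups; the table fills up */
          }
      }
      printf("stored %d of 2000 traces\n", stored);
      return 0;
  }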
From 090edbe23ff57940fca7f57d9165ce57a826bd7a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 14 Dec 2017 13:19:05 -0800
Subject: [PATCH 07/11] x86/power/64: Use struct desc_ptr for the IDT in
 struct saved_context

x86_64's saved_context nonsensically used separate idt_limit and
idt_base fields and then cast &idt_limit to struct desc_ptr *.

This was correct (with -fno-strict-aliasing), but it was confusing,
served no purpose, and required #ifdeffery. Simplify this by using
struct desc_ptr directly.

No change in functionality.

Tested-by: Jarkko Nikula
Signed-off-by: Andy Lutomirski
Acked-by: Rafael J. Wysocki
Acked-by: Thomas Gleixner
Cc: Borislav Petkov
Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Pavel Machek
Cc: Peter Zijlstra
Cc: Rafael J. Wysocki
Cc: Zhang Rui
Link: http://lkml.kernel.org/r/967909ce38d341b01d45eff53e278e2728a3a93a.1513286253.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/suspend_64.h |  3 +--
 arch/x86/power/cpu.c              | 11 +----------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 7306e911faee..600e9e0aea51 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -30,8 +30,7 @@ struct saved_context {
	u16 gdt_pad;			/* Unused */
	struct desc_ptr gdt_desc;
	u16 idt_pad;
-	u16 idt_limit;
-	unsigned long idt_base;
+	struct desc_ptr idt;
	u16 ldt;
	u16 tss;
	unsigned long tr;

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 5191de14f4df..472bc8c8212b 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -82,12 +82,8 @@ static void __save_processor_state(struct saved_context *ctxt)
	/*
	 * descriptor tables
	 */
-#ifdef CONFIG_X86_32
	store_idt(&ctxt->idt);
-#else
-/* CONFIG_X86_64 */
-	store_idt((struct desc_ptr *)&ctxt->idt_limit);
-#endif
+
	/*
	 * We save it here, but restore it only in the hibernate case.
	 * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit
@@ -219,12 +215,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
	 * now restore the descriptor tables to their proper values
	 * ltr is done i fix_processor_context().
	 */
-#ifdef CONFIG_X86_32
	load_idt(&ctxt->idt);
-#else
-/* CONFIG_X86_64 */
-	load_idt((const struct desc_ptr *)&ctxt->idt_limit);
-#endif

 #ifdef CONFIG_X86_64
	/*
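
The layout being relied on here can be demonstrated outside the kernel
too. A sketch with a local redefinition of desc_ptr (the real one lives
in arch/x86/include/asm/desc_defs.h), assuming UMIP is not enabled,
since sidt from user space is otherwise trapped:

  #include <stdint.h>
  #include <stdio.h>

  /* Same shape as the kernel's struct desc_ptr: 2-byte limit + 8-byte base. */
  struct desc_ptr {
      uint16_t size;
      uint64_t address;
  } __attribute__((packed));

  static void store_idt(struct desc_ptr *dtr)
  {
      /* sidt writes limit and base in one shot -- no separate fields needed. */
      asm volatile("sidt %0" : "=m" (*dtr));
  }

  int main(void)
  {
      struct desc_ptr idt;

      store_idt(&idt);
      printf("IDT base=%#llx limit=%u\n",
             (unsigned long long)idt.address, idt.size);
      return 0;
  }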
From 896c80bef4d3b357814a476663158aaf669d0fb3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 14 Dec 2017 13:19:06 -0800
Subject: [PATCH 08/11] x86/power/32: Move SYSENTER MSR restoration to
 fix_processor_context()

x86_64 restores system call MSRs in fix_processor_context(), and x86_32
restored them along with segment registers. The 64-bit variant makes
more sense, so move the 32-bit code to match the 64-bit code.

No change in runtime behavior is expected.

Tested-by: Jarkko Nikula
Signed-off-by: Andy Lutomirski
Acked-by: Rafael J. Wysocki
Acked-by: Thomas Gleixner
Cc: Borislav Petkov
Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Pavel Machek
Cc: Peter Zijlstra
Cc: Rafael J. Wysocki
Cc: Zhang Rui
Link: http://lkml.kernel.org/r/65158f8d7ee64dd6bbc6c1c83b3b34aaa854e3ae.1513286253.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/power/cpu.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 472bc8c8212b..033c61e6891b 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -174,6 +174,9 @@ static void fix_processor_context(void)
	write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);

	syscall_init();				/* This sets MSR_*STAR and related */
+#else
+	if (boot_cpu_has(X86_FEATURE_SEP))
+		enable_sep_cpu();
 #endif
	load_TR_desc();				/* This does ltr */
	load_mm_ldt(current->active_mm);	/* This does lldt */
@@ -237,12 +240,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
	loadsegment(fs, ctxt->fs);
	loadsegment(gs, ctxt->gs);
	loadsegment(ss, ctxt->ss);
-
-	/*
-	 * sysenter MSRs
-	 */
-	if (boot_cpu_has(X86_FEATURE_SEP))
-		enable_sep_cpu();
 #else
 /* CONFIG_X86_64 */
	asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
From 7ee18d677989e99635027cee04c878950e0752b9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 14 Dec 2017 13:19:07 -0800
Subject: [PATCH 09/11] x86/power: Make restore_processor_context() sane

My previous attempt to fix a couple of bugs in __restore_processor_context():

  5b06bbcfc2c6 ("x86/power: Fix some ordering bugs in __restore_processor_context()")

... introduced yet another bug, breaking suspend-resume.

Rather than trying to come up with a minimal fix, let's try to clean it
up for real. This patch fixes quite a few things:

 - The old code saved a nonsensical subset of segment registers.
   The only registers that need to be saved are those that contain
   userspace state or those that can't be trivially restored without
   percpu access working. (On x86_32, we can restore percpu access by
   writing __KERNEL_PERCPU to %fs. On x86_64, it's easier to save and
   restore the kernel's GSBASE.) With this patch, we restore hardcoded
   values to the kernel state where applicable and explicitly restore
   the user state after fixing all the descriptor tables.

 - We used to use an unholy mix of inline asm and C helpers for segment
   register access. Let's get rid of the inline asm.

This fixes the reported s2ram hangs and makes the code more logical all
around.

Analyzed-by: Linus Torvalds
Reported-by: Jarkko Nikula
Reported-by: Pavel Machek
Tested-by: Jarkko Nikula
Tested-by: Pavel Machek
Signed-off-by: Andy Lutomirski
Acked-by: Rafael J. Wysocki
Acked-by: Thomas Gleixner
Cc: Borislav Petkov
Cc: Josh Poimboeuf
Cc: Peter Zijlstra
Cc: Rafael J. Wysocki
Cc: Zhang Rui
Fixes: 5b06bbcfc2c6 ("x86/power: Fix some ordering bugs in __restore_processor_context()")
Link: http://lkml.kernel.org/r/398ee68e5c0f766425a7b746becfc810840770ff.1513286253.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/suspend_32.h |  8 ++-
 arch/x86/include/asm/suspend_64.h | 16 +++++-
 arch/x86/power/cpu.c              | 81 ++++++++++++++++---------------
 3 files changed, 63 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 982c325dad33..8be6afb58471 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -12,7 +12,13 @@

 /* image of the saved processor state */
 struct saved_context {
-	u16 es, fs, gs, ss;
+	/*
+	 * On x86_32, all segment registers, with the possible exception of
+	 * gs, are saved at kernel entry in pt_regs.
+	 */
+#ifdef CONFIG_X86_32_LAZY_GS
+	u16 gs;
+#endif
	unsigned long cr0, cr2, cr3, cr4;
	u64 misc_enable;
	bool misc_enable_saved;

diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 600e9e0aea51..a7af9f53c0cb 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -20,8 +20,20 @@
  */
 struct saved_context {
	struct pt_regs regs;
-	u16 ds, es, fs, gs, ss;
-	unsigned long gs_base, gs_kernel_base, fs_base;
+
+	/*
+	 * User CS and SS are saved in current_pt_regs(). The rest of the
+	 * segment selectors need to be saved and restored here.
+	 */
+	u16 ds, es, fs, gs;
+
+	/*
+	 * Usermode FSBASE and GSBASE may not match the fs and gs selectors,
+	 * so we save them separately. We save the kernelmode GSBASE to
+	 * restore percpu access after resume.
+	 */
+	unsigned long kernelmode_gs_base, usermode_gs_base, fs_base;
+
	unsigned long cr0, cr2, cr3, cr4, cr8;
	u64 misc_enable;
	bool misc_enable_saved;

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 033c61e6891b..36a28eddb435 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -99,22 +99,18 @@ static void __save_processor_state(struct saved_context *ctxt)
	/*
	 * segment registers
	 */
-#ifdef CONFIG_X86_32
-	savesegment(es, ctxt->es);
-	savesegment(fs, ctxt->fs);
+#ifdef CONFIG_X86_32_LAZY_GS
	savesegment(gs, ctxt->gs);
-	savesegment(ss, ctxt->ss);
-#else
-/* CONFIG_X86_64 */
-	asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
-	asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
-	asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
-	asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
-	asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
+#endif
+#ifdef CONFIG_X86_64
+	savesegment(gs, ctxt->gs);
+	savesegment(fs, ctxt->fs);
+	savesegment(ds, ctxt->ds);
+	savesegment(es, ctxt->es);

	rdmsrl(MSR_FS_BASE, ctxt->fs_base);
-	rdmsrl(MSR_GS_BASE, ctxt->gs_base);
-	rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+	rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+	rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
	mtrr_save_fixed_ranges(NULL);

	rdmsrl(MSR_EFER, ctxt->efer);
@@ -189,9 +185,12 @@ static void fix_processor_context(void)
 }

 /**
- * __restore_processor_state - restore the contents of CPU registers saved
- * by __save_processor_state()
- * @ctxt - structure to load the registers contents from
+ * __restore_processor_state - restore the contents of CPU registers saved
+ *                             by __save_processor_state()
+ * @ctxt - structure to load the registers contents from
+ *
+ * The asm code that gets us here will have restored a usable GDT, although
+ * it will be pointing to the wrong alias.
  */
 static void notrace __restore_processor_state(struct saved_context *ctxt)
 {
@@ -214,46 +213,50 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
	write_cr2(ctxt->cr2);
	write_cr0(ctxt->cr0);

-	/*
-	 * now restore the descriptor tables to their proper values
-	 * ltr is done i fix_processor_context().
-	 */
+	/* Restore the IDT. */
	load_idt(&ctxt->idt);

-#ifdef CONFIG_X86_64
	/*
-	 * We need GSBASE restored before percpu access can work.
-	 * percpu access can happen in exception handlers or in complicated
-	 * helpers like load_gs_index().
+	 * Just in case the asm code got us here with the SS, DS, or ES
+	 * out of sync with the GDT, update them.
	 */
-	wrmsrl(MSR_GS_BASE, ctxt->gs_base);
+	loadsegment(ss, __KERNEL_DS);
+	loadsegment(ds, __USER_DS);
+	loadsegment(es, __USER_DS);
+
+	/*
+	 * Restore percpu access. Percpu access can happen in exception
+	 * handlers or in complicated helpers like load_gs_index().
+	 */
+#ifdef CONFIG_X86_64
+	wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+#else
+	loadsegment(fs, __KERNEL_PERCPU);
+	loadsegment(gs, __KERNEL_STACK_CANARY);
 #endif

+	/* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */
	fix_processor_context();

	/*
-	 * Restore segment registers. This happens after restoring the GDT
-	 * and LDT, which happen in fix_processor_context().
+	 * Now that we have descriptor tables fully restored and working
+	 * exception handling, restore the usermode segments.
	 */
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_64
+	loadsegment(ds, ctxt->es);
	loadsegment(es, ctxt->es);
	loadsegment(fs, ctxt->fs);
-	loadsegment(gs, ctxt->gs);
-	loadsegment(ss, ctxt->ss);
-#else
-/* CONFIG_X86_64 */
-	asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
-	asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
-	asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
	load_gs_index(ctxt->gs);
-	asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));

	/*
-	 * Restore FSBASE and user GSBASE after reloading the respective
-	 * segment selectors.
+	 * Restore FSBASE and GSBASE after restoring the selectors, since
+	 * restoring the selectors clobbers the bases. Keep in mind
+	 * that MSR_KERNEL_GS_BASE is horribly misnamed.
	 */
	wrmsrl(MSR_FS_BASE, ctxt->fs_base);
-	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
+#elif defined(CONFIG_X86_32_LAZY_GS)
+	loadsegment(gs, ctxt->gs);
 #endif

	do_fpu_end();
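
The ordering constraints are the real content of this patch. A runnable
skeleton that just prints them, with stub steps standing in for the
register accessors; the step names paraphrase the patched
__restore_processor_state(), nothing here is kernel API:

  #include <stdio.h>

  static void step(const char *what)
  {
      printf("%s\n", what);
  }

  int main(void)
  {
      step("1. control registers (CR4/CR3/CR2/CR0)");
      step("2. IDT");
      step("3. sane kernel SS/DS/ES, in case the asm left them stale");
      step("4. percpu access (kernel GSBASE on 64-bit, fs/gs on 32-bit)");
      step("5. fix_processor_context(): TSS, GDT, LDT, syscall MSRs");
      step("6. usermode segment selectors");
      step("7. FSBASE and user GSBASE, after the selectors that clobber them");
      return 0;
  }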
From f5b5fab1780c98b74526dbac527574bd02dc16f8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Mon, 11 Dec 2017 10:38:36 -0800
Subject: [PATCH 10/11] x86/decoder: Fix and update the opcodes map

Update x86-opcode-map.txt based on the October 2017 Intel SDM
publication. Fix INVPID to INVVPID. Add UD0 and UD1 instruction
opcodes.

Also sync the objtool and perf tooling copies of this file.

Signed-off-by: Randy Dunlap
Acked-by: Masami Hiramatsu
Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Masami Hiramatsu
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/aac062d7-c0f6-96e3-5c92-ed299e2bd3da@infradead.org
Signed-off-by: Ingo Molnar
---
 arch/x86/lib/x86-opcode-map.txt                     | 13 +++++++++++--
 tools/objtool/arch/x86/insn/x86-opcode-map.txt      | 15 ++++++++++++---
 .../perf/util/intel-pt-decoder/x86-opcode-map.txt   | 15 ++++++++++++---
 3 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index c4d55919fac1..e0b85930dd77 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
 fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
 fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
 fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
-ff:
+ff: UD0
 EndTable

 Table: 3-byte opcode 1 (0x0f 0x38)
@@ -717,7 +717,7 @@ AVXcode: 2
 7e: vpermt2d/q Vx,Hx,Wx (66),(ev)
 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev)
 80: INVEPT Gy,Mdq (66)
-81: INVPID Gy,Mdq (66)
+81: INVVPID Gy,Mdq (66)
 82: INVPCID Gy,Mdq (66)
 83: vpmultishiftqb Vx,Hx,Wx (66),(ev)
 88: vexpandps/d Vpd,Wpd (66),(ev)
@@ -970,6 +970,15 @@ GrpTable: Grp9
 EndTable

 GrpTable: Grp10
+# all are UD1
+0: UD1
+1: UD1
+2: UD1
+3: UD1
+4: UD1
+5: UD1
+6: UD1
+7: UD1
 EndTable

 # Grp11A and Grp11B are expressed as Grp11 in Intel SDM

diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/insn/x86-opcode-map.txt
index 12e377184ee4..e0b85930dd77 100644
--- a/tools/objtool/arch/x86/insn/x86-opcode-map.txt
+++ b/tools/objtool/arch/x86/insn/x86-opcode-map.txt
@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
 fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
 fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
 fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
-ff:
+ff: UD0
 EndTable

 Table: 3-byte opcode 1 (0x0f 0x38)
@@ -717,7 +717,7 @@ AVXcode: 2
 7e: vpermt2d/q Vx,Hx,Wx (66),(ev)
 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev)
 80: INVEPT Gy,Mdq (66)
-81: INVPID Gy,Mdq (66)
+81: INVVPID Gy,Mdq (66)
 82: INVPCID Gy,Mdq (66)
 83: vpmultishiftqb Vx,Hx,Wx (66),(ev)
 88: vexpandps/d Vpd,Wpd (66),(ev)
@@ -896,7 +896,7 @@ EndTable

 GrpTable: Grp3_1
 0: TEST Eb,Ib
-1:
+1: TEST Eb,Ib
 2: NOT Eb
 3: NEG Eb
 4: MUL AL,Eb
@@ -970,6 +970,15 @@ GrpTable: Grp9
 EndTable

 GrpTable: Grp10
+# all are UD1
+0: UD1
+1: UD1
+2: UD1
+3: UD1
+4: UD1
+5: UD1
+6: UD1
+7: UD1
 EndTable

 # Grp11A and Grp11B are expressed as Grp11 in Intel SDM

diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
index 12e377184ee4..e0b85930dd77 100644
--- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
+++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
 fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
 fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
 fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
-ff:
+ff: UD0
 EndTable

 Table: 3-byte opcode 1 (0x0f 0x38)
@@ -717,7 +717,7 @@ AVXcode: 2
 7e: vpermt2d/q Vx,Hx,Wx (66),(ev)
 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev)
 80: INVEPT Gy,Mdq (66)
-81: INVPID Gy,Mdq (66)
+81: INVVPID Gy,Mdq (66)
 82: INVPCID Gy,Mdq (66)
 83: vpmultishiftqb Vx,Hx,Wx (66),(ev)
 88: vexpandps/d Vpd,Wpd (66),(ev)
@@ -896,7 +896,7 @@ EndTable

 GrpTable: Grp3_1
 0: TEST Eb,Ib
-1:
+1: TEST Eb,Ib
 2: NOT Eb
 3: NEG Eb
 4: MUL AL,Eb
@@ -970,6 +970,15 @@ GrpTable: Grp9
 EndTable

 GrpTable: Grp10
+# all are UD1
+0: UD1
+1: UD1
+2: UD1
+3: UD1
+4: UD1
+5: UD1
+6: UD1
+7: UD1
 EndTable

 # Grp11A and Grp11B are expressed as Grp11 in Intel SDM
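
For reference, the two opcodes the map now names: UD0 is 0F FF and UD1
is 0F B9 /r, which is why every Grp10 ModRM extension decodes as UD1. A
small matcher over raw bytes; it only recognizes the patterns and never
executes them:

  #include <stdio.h>

  static const char *decode_ud(const unsigned char *insn)
  {
      if (insn[0] != 0x0f)
          return NULL;
      if (insn[1] == 0xff)
          return "UD0";
      if (insn[1] == 0xb9)    /* Grp10: /0../7 are all UD1 */
          return "UD1";
      return NULL;
  }

  int main(void)
  {
      const unsigned char ud0[] = { 0x0f, 0xff };
      const unsigned char ud1[] = { 0x0f, 0xb9, 0xc0 };  /* with a ModRM byte */
      const char *a = decode_ud(ud0), *b = decode_ud(ud1);

      printf("%s %s\n", a ? a : "?", b ? b : "?");
      return 0;
  }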
From 215eada73e77ede7e15531d99f712481ddd429be Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 15 Dec 2017 13:36:56 +0100
Subject: [PATCH 11/11] objtool: Resync objtool's instruction decoder source
 code copy with the kernel's latest version

This fixes the following warning:

  warning: objtool: x86 instruction decoder differs from kernel

Note that there are cleanups queued up for v4.16 that will make this
warning more informative and will make the syncing easier as well.

Cc: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 tools/objtool/arch/x86/insn/inat.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/objtool/arch/x86/insn/inat.h b/tools/objtool/arch/x86/insn/inat.h
index 125ecd2a300d..52dc8d911173 100644
--- a/tools/objtool/arch/x86/insn/inat.h
+++ b/tools/objtool/arch/x86/insn/inat.h
@@ -97,6 +97,16 @@
 #define INAT_MAKE_GROUP(grp)	((grp << INAT_GRP_OFFS) | INAT_MODRM)
 #define INAT_MAKE_IMM(imm)	(imm << INAT_IMM_OFFS)

+/* Identifiers for segment registers */
+#define INAT_SEG_REG_IGNORE	0
+#define INAT_SEG_REG_DEFAULT	1
+#define INAT_SEG_REG_CS		2
+#define INAT_SEG_REG_SS		3
+#define INAT_SEG_REG_DS		4
+#define INAT_SEG_REG_ES		5
+#define INAT_SEG_REG_FS		6
+#define INAT_SEG_REG_GS		7
+
 /* Attribute search APIs */
 extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
 extern int inat_get_last_prefix_id(insn_byte_t last_pfx);