From 2d5be37d686c4dae8e60d20283d6f44ac2c44f65 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Nov 2015 12:24:00 +0100 Subject: [PATCH 1/6] x86/microcode: Initialize the driver late when facilities are up Running microcode_init() from setup_arch() is a bad idea because not even kmalloc() is ready at that point and the loader does all kinds of allocations and init/registration with various subsystems. Make it a late initcall when required facilities are initialized so that the microcode driver initialization can succeed too. Reported-and-tested-by: Markus Trippelsdorf Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151120112400.GC4028@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 1 + arch/x86/kernel/setup.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7fc27f1cca58..b3e94ef461fd 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -698,3 +698,4 @@ int __init microcode_init(void) return error; } +late_initcall(microcode_init); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 29db25f9a745..d2bbe343fda7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1250,8 +1250,6 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif - - microcode_init(); } #ifdef CONFIG_X86_32 From f10750536fa783cafb2653f6fa349d6e62337e42 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:00 -0800 Subject: [PATCH 2/6] x86/entry/64: Fix irqflag tracing wrt context tracking Paolo pointed out that enter_from_user_mode could be called while irqflags were traced as though IRQs were on. In principle, this could confuse lockdep. It doesn't cause any problems that I've seen in any configuration, but if I build with CONFIG_DEBUG_LOCKDEP=y, enable a nohz_full CPU, and add code like: if (irqs_disabled()) { spin_lock(&something); spin_unlock(&something); } to the top of enter_from_user_mode, then lockdep will complain without this fix. It seems that lockdep's irqflags sanity checks are too weak to detect this bug without forcing the issue. This patch adds one byte to normal kernels, and it's IMO a bit ugly. I haven't spotted a better way to do this yet, though. The issue is that we can't do TRACE_IRQS_OFF until after SWAPGS (if needed), but we're also supposed to do it before calling C code. An alternative approach would be to call trace_hardirqs_off in enter_from_user_mode. That would be less code and would not bloat normal kernels at all, but it would be harder to see how the code worked. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/86237e362390dfa6fec12de4d75a238acb0ae787.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 53616ca03244..a55697d19824 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -509,6 +509,17 @@ END(irq_entries_start) * tracking that we're in kernel mode. */ SWAPGS + + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF idempotent, + * the simplest way to handle it is to just call it twice if + * we enter from user mode. There's no reason to optimize this since + * TRACE_IRQS_OFF is a no-op if lockdep is off. + */ + TRACE_IRQS_OFF + #ifdef CONFIG_CONTEXT_TRACKING call enter_from_user_mode #endif @@ -1049,12 +1060,18 @@ ENTRY(error_entry) SWAPGS .Lerror_entry_from_usermode_after_swapgs: + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). + */ + TRACE_IRQS_OFF #ifdef CONFIG_CONTEXT_TRACKING call enter_from_user_mode #endif + ret .Lerror_entry_done: - TRACE_IRQS_OFF ret From 656279a1f3b210cf48ccc572fd7c6b8e2250be77 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 22 Nov 2015 18:16:15 -0500 Subject: [PATCH 3/6] x86 smpboot: Re-enable init_udelay=0 by default on modern CPUs commit f1ccd249319e allowed the cmdline "cpu_init_udelay=" to work with all values, including the default of 10000. But in setting the default of 10000, it over-rode the code that sets the delay 0 on modern processors. Also, tidy up use of INT/UINT. Fixes: f1ccd249319e "x86/smpboot: Fix cpu_init_udelay=10000 corner case boot parameter misbehavior" Reported-by: Shane Signed-off-by: Len Brown Cc: dparsons@brightdsl.net Cc: stable@kernel.org Link: http://lkml.kernel.org/r/9082eb809ef40dad02db714759c7aaf618c518d4.1448232494.git.len.brown@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 892ee2e5ecbc..fbabe4fcc7fb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = INT_MAX; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != INT_MAX) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; - + return; + } /* else, use legacy delay */ init_udelay = UDELAY_10MS_DEFAULT; } From 70f1528747651b20c7769d3516ade369f9963237 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 30 Nov 2015 11:10:33 +0100 Subject: [PATCH 4/6] x86/mm: Fix regression with huge pages on PAE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent PAT patchset has caused issue on 32-bit PAE machines: page:eea45000 count:0 mapcount:-128 mapping: (null) index:0x0 flags: 0x40000000() page dumped because: VM_BUG_ON_PAGE(page_mapcount(page) < 0) ------------[ cut here ]------------ kernel BUG at /home/build/linux-boris/mm/huge_memory.c:1485! invalid opcode: 0000 [#1] SMP [...] Call Trace: unmap_single_vma ? __wake_up unmap_vmas unmap_region do_munmap vm_munmap SyS_munmap do_fast_syscall_32 ? __do_page_fault sysenter_past_esp Code: ... EIP: [] zap_huge_pmd+0x240/0x260 SS:ESP 0068:f6459d98 The problem is in pmd_pfn_mask() and pmd_flags_mask(). These helpers use PMD_PAGE_MASK to calculate resulting mask. PMD_PAGE_MASK is 'unsigned long', not 'unsigned long long' as phys_addr_t is on 32-bit PAE (ARCH_PHYS_ADDR_T_64BIT). As a result, the upper bits of resulting mask get truncated. pud_pfn_mask() and pud_flags_mask() aren't problematic since we don't have PUD page table level on 32-bit systems, but it's reasonable to keep them consistent with PMD counterpart. Introduce PHYSICAL_PMD_PAGE_MASK and PHYSICAL_PUD_PAGE_MASK in addition to existing PHYSICAL_PAGE_MASK and reworks helpers to use them. Reported-and-Tested-by: Boris Ostrovsky Signed-off-by: Kirill A. Shutemov [ Fix -Woverflow warnings from the realmode code. ] Signed-off-by: Borislav Petkov Reviewed-by: Toshi Kani Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jürgen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: linux-mm Fixes: f70abb0fc3da ("x86/asm: Fix pud/pmd interfaces to handle large PAT bit") Link: http://lkml.kernel.org/r/1448878233-11390-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- arch/x86/boot/boot.h | 1 - arch/x86/boot/video-mode.c | 2 ++ arch/x86/boot/video.c | 2 ++ arch/x86/include/asm/page_types.h | 18 ++++++++++-------- arch/x86/include/asm/pgtable_types.h | 14 ++++---------- arch/x86/include/asm/x86_init.h | 1 - 6 files changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 0033e96c3f09..9011a88353de 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "bitops.h" #include "ctype.h" diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c index aa8a96b052e3..95c7a818c0ed 100644 --- a/arch/x86/boot/video-mode.c +++ b/arch/x86/boot/video-mode.c @@ -19,6 +19,8 @@ #include "video.h" #include "vesa.h" +#include + /* * Common variables */ diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 05111bb8d018..77780e386e9b 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -13,6 +13,8 @@ * Select video mode */ +#include + #include "boot.h" #include "video.h" #include "vesa.h" diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index c5b7fb2774d0..cc071c6f7d4d 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -9,20 +9,22 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) - -/* Cast PAGE_MASK to a signed type so that it is sign-extended if - virtual addresses are 32-bits but physical addresses are larger - (ie, 32-bit PAE). */ -#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) - #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) +#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) + +/* Cast *PAGE_MASK to a signed type so that it is sign-extended if + virtual addresses are 32-bits but physical addresses are larger + (ie, 32-bit PAE). */ +#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) + #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index dd5b0aa9dd2f..a471cadb9630 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -279,17 +279,14 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) - return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK; + return PHYSICAL_PUD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pudval_t pud_flags_mask(pud_t pud) { - if (native_pud_val(pud) & _PAGE_PSE) - return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK); - else - return ~PTE_PFN_MASK; + return ~pud_pfn_mask(pud); } static inline pudval_t pud_flags(pud_t pud) @@ -300,17 +297,14 @@ static inline pudval_t pud_flags(pud_t pud) static inline pmdval_t pmd_pfn_mask(pmd_t pmd) { if (native_pmd_val(pmd) & _PAGE_PSE) - return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK; + return PHYSICAL_PMD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pmdval_t pmd_flags_mask(pmd_t pmd) { - if (native_pmd_val(pmd) & _PAGE_PSE) - return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK); - else - return ~PTE_PFN_MASK; + return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 48d34d28f5a6..cd0fc0cc78bc 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include #include struct mpc_bus; From 8e8efe0379bd93e8219ca0fc6fa80b5dd85b09cb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 30 Nov 2015 16:31:13 -0800 Subject: [PATCH 5/6] x86/mpx: Fix instruction decoder condition MPX decodes instructions in order to tell which bounds register was violated. Part of this decoding involves looking at the "REX prefix" which is a special instrucion prefix used to retrofit support for new registers in to old instructions. The X86_REX_*() macros are defined to return actual bit values: #define X86_REX_R(rex) ((rex) & 4) *not* boolean values. However, the MPX code was checking for them like they were booleans. This might have led to us mis-decoding the "REX prefix" and giving false information out to userspace about bounds violations. X86_REX_B() actually is bit 1, so this is really only broken for the X86_REX_X() case. Fix the conditionals up to tolerate the non-boolean values. Fixes: fcc7ffd67991 "x86, mpx: Decode MPX instruction to get bound violation information" Reported-by: Dan Carpenter Signed-off-by: Dave Hansen Cc: x86@kernel.org Cc: Dave Hansen Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151201003113.D800C1E0@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/mpx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1202d5ca2fb5..b2fd67da1701 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, switch (type) { case REG_TYPE_RM: regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value) == 1) + if (X86_REX_X(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_BASE: regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; From 22eab1108781eff09961ae7001704f7bd8fb1dce Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 1 Dec 2015 00:54:36 +0300 Subject: [PATCH 6/6] x86/signal: Fix restart_syscall number for x32 tasks When restarting a syscall with regs->ax == -ERESTART_RESTARTBLOCK, regs->ax is assigned to a restart_syscall number. For x32 tasks, this syscall number must have __X32_SYSCALL_BIT set, otherwise it will be an x86_64 syscall number instead of a valid x32 syscall number. This issue has been there since the introduction of x32. Reported-by: strace/tests/restart_syscall.test Reported-and-tested-by: Elvira Khabirova Signed-off-by: Dmitry V. Levin Cc: Elvira Khabirova Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151130215436.GA25996@altlinux.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/signal.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b7ffb7c00075..cb6282c3638f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -690,12 +690,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(failed, ksig, stepping); } -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) + return __NR_restart_syscall; +#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ + return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : + __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +} /* * Note that 'init' is a special process: it doesn't get signals it doesn't @@ -724,7 +727,7 @@ void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; + regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; }