From 21279157efffe5e7258483809942d576cb802768 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Mon, 16 Jul 2018 12:03:38 +0800 Subject: [PATCH 01/60] x86/pti: Make pti_set_kernel_image_nonglobal() static pti_set_kernel_image_nonglobal() is only used in pti.c, make it static. Signed-off-by: Jiang Biao Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Cc: luto@kernel.org Cc: hpa@zytor.com Cc: albcamus@gmail.com Cc: zhong.weidong@zte.com.cn Link: https://lkml.kernel.org/r/1531713820-24544-4-git-send-email-jiang.biao2@zte.com.cn --- arch/x86/mm/pti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 4d418e705878..9277e9ba92b5 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -466,7 +466,7 @@ void pti_clone_kernel_text(void) * the other set_memory.h functions. Just extern it. */ extern int set_memory_nonglobal(unsigned long addr, int numpages); -void pti_set_kernel_image_nonglobal(void) +static void pti_set_kernel_image_nonglobal(void) { /* * The identity map is created with PMDs, regardless of the From d9f4426c73002957be5dd39936f44a09498f7560 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Wed, 18 Jul 2018 08:03:14 +0800 Subject: [PATCH 02/60] x86/speculation: Remove SPECTRE_V2_IBRS in enum spectre_v2_mitigation SPECTRE_V2_IBRS in enum spectre_v2_mitigation is never used. Remove it. Signed-off-by: Jiang Biao Signed-off-by: Thomas Gleixner Cc: hpa@zytor.com Cc: dwmw2@amazon.co.uk Cc: konrad.wilk@oracle.com Cc: bp@suse.de Cc: zhong.weidong@zte.com.cn Link: https://lkml.kernel.org/r/1531872194-39207-1-git-send-email-jiang.biao2@zte.com.cn --- arch/x86/include/asm/nospec-branch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f6f6c63da62f..c99082e2ef13 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -214,7 +214,6 @@ enum spectre_v2_mitigation { SPECTRE_V2_RETPOLINE_MINIMAL_AMD, SPECTRE_V2_RETPOLINE_GENERIC, SPECTRE_V2_RETPOLINE_AMD, - SPECTRE_V2_IBRS, }; /* The Speculative Store Bypass disable variants */ From 9e97b73fdb235345a826519862a52a7398c89eb8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:38 +0200 Subject: [PATCH 03/60] x86/asm-offsets: Move TSS_sp0 and TSS_sp1 to asm-offsets.c These offsets will be used in 32 bit assembly code as well, so make them available for all of x86 code. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Reviewed-by: Andy Lutomirski Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-2-git-send-email-joro@8bytes.org --- arch/x86/kernel/asm-offsets.c | 4 ++++ arch/x86/kernel/asm-offsets_64.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index dcb008c320fe..a1e16286a832 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -103,4 +103,8 @@ void common(void) { OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + + /* Offset for sp0 and sp1 into the tss_struct */ + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index b2dcd161f514..3b9405e7ba2b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -65,8 +65,6 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); - OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); - OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_STACKPROTECTOR From ae2e565bc6aaee3f3db420fec5fdd39755c9813e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:39 +0200 Subject: [PATCH 04/60] x86/entry/32: Rename TSS_sysenter_sp0 to TSS_entry2task_stack The stack address doesn't need to be stored in tss.sp0 if the stack is switched manually like on sysenter. Rename the offset so that it still makes sense when its location is changed in later patches. This stackk will also be used for all kernel-entry points, not just sysenter. Reflect that and the fact that it is the offset to the task-stack location in the name as well. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-3-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 2 +- arch/x86/kernel/asm-offsets_32.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c371bfee137a..39f711ad0fc9 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -412,7 +412,7 @@ ENTRY(xen_sysenter_target) * 0(%ebp) arg6 */ ENTRY(entry_SYSENTER_32) - movl TSS_sysenter_sp0(%esp), %esp + movl TSS_entry2task_stack(%esp), %esp .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a4a3be399f4b..15b3f45b69cd 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -46,8 +46,9 @@ void foo(void) OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); - /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + /* Offset from the entry stack to task stack stored in TSS */ + DEFINE(TSS_entry2task_stack, + offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_STACKPROTECTOR From a6b744f3ce9d017dd86b28355de2d8e0d36496d4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:40 +0200 Subject: [PATCH 05/60] x86/entry/32: Load task stack from x86_tss.sp1 in SYSENTER handler x86_tss.sp0 will be used to point to the entry stack later to use it as a trampoline stack for other kernel entry points besides SYSENTER. So store the real task stack pointer in x86_tss.sp1, which is otherwise unused by the hardware, as Linux doesn't make use of Ring 1. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-4-git-send-email-joro@8bytes.org --- arch/x86/kernel/asm-offsets_32.c | 9 +++++++-- arch/x86/kernel/process_32.c | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 15b3f45b69cd..82826f2275cc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -46,9 +46,14 @@ void foo(void) OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); - /* Offset from the entry stack to task stack stored in TSS */ + /* + * Offset from the entry stack to task stack stored in TSS. Kernel entry + * happens on the per-cpu entry-stack, and the asm code switches to the + * task-stack pointer stored in x86_tss.sp1, which is a copy of + * task->thread.sp0 where entry code can find it. 
+ */ DEFINE(TSS_entry2task_stack, - offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0ae659de21eb..ec62cc77388d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,6 +290,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); + /* SYSENTER reads the task-stack from tss.sp1 */ + this_cpu_write(cpu_tss_rw.x86_tss.sp1, next_p->thread.sp0); /* * Restore %gs if needed (which is common) From 46eabca284f9f27fe94698a068a4a21ba93b4975 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:41 +0200 Subject: [PATCH 06/60] x86/entry/32: Put ESPFIX code into a macro This makes it easier to split up the shared iret code path. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-5-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 97 ++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 39f711ad0fc9..ef7d65315e8e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -221,6 +221,54 @@ POP_GS_EX .endm +.macro CHECK_AND_APPLY_ESPFIX +#ifdef CONFIG_X86_ESPFIX32 +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + + ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX + + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. + */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + jne .Lend_\@ # returning to user-space with LDT SS + + /* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. 
+ */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the IRET: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + lss (%esp), %esp /* switch to espfix segment */ +.Lend_\@: +#endif /* CONFIG_X86_ESPFIX32 */ +.endm /* * %eax: prev task * %edx: next task @@ -547,21 +595,7 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET .Lrestore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX - - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - /* - * Warning: PT_OLDSS(%esp) contains the wrong/random values if we - * are returning to the kernel. - * See comments in process.c:copy_thread() for details. - */ - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je .Lldt_ss # returning to user-space with LDT SS -#endif + CHECK_AND_APPLY_ESPFIX .Lrestore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code .Lirq_return: @@ -579,39 +613,6 @@ ENTRY(iret_exc ) jmp common_exception .previous _ASM_EXTABLE(.Lirq_return, iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 -.Lldt_ss: -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. - */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ - /* - * Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the IRET: - */ - DISABLE_INTERRUPTS(CLBR_ANY) - lss (%esp), %esp /* switch to espfix segment */ - jmp .Lrestore_nocheck -#endif ENDPROC(entry_INT80_32) .macro FIXUP_ESPFIX_STACK From 8e676ced31e9d1448d3ffc4159586a259cc67f30 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:42 +0200 Subject: [PATCH 07/60] x86/entry/32: Unshare NMI return path NMI will no longer use most of the shared return path, because NMI needs special handling when the CR3 switches for PTI are added. Prepare for that change. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-6-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ef7d65315e8e..43641310b6e3 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1017,7 +1017,7 @@ ENTRY(nmi) /* Not on SYSENTER stack. */ call do_nmi - jmp .Lrestore_all_notrace + jmp .Lnmi_return .Lnmi_from_sysenter_stack: /* @@ -1028,7 +1028,11 @@ ENTRY(nmi) movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi movl %ebx, %esp - jmp .Lrestore_all_notrace + +.Lnmi_return: + CHECK_AND_APPLY_ESPFIX + RESTORE_REGS 4 + jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 .Lnmi_espfix_stack: From 0d2eb73b29996684d5bbb72f85c74b47b4c359f7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:43 +0200 Subject: [PATCH 08/60] x86/entry/32: Split off return-to-kernel path Use a separate return path when returning to the kernel. This allows to put the PTI cr3-switch and the switch to the entry-stack into the return-to-user path without further checking. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-7-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 43641310b6e3..7251c4f3e99e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -65,7 +65,7 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all +# define resume_kernel restore_all_kernel #endif .macro TRACE_IRQS_IRET @@ -399,9 +399,9 @@ ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) .Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all + jnz restore_all_kernel testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all + jz restore_all_kernel call preempt_schedule_irq jmp .Lneed_resched END(resume_kernel) @@ -606,6 +606,11 @@ restore_all: */ INTERRUPT_RETURN +restore_all_kernel: + TRACE_IRQS_IRET + RESTORE_REGS 4 + jmp .Lirq_return + .section .fixup, "ax" ENTRY(iret_exc ) pushl $0 # no error code From 45d7b255747c21fc4b1f5043bee0754d39c3bdbf Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:44 +0200 Subject: [PATCH 09/60] x86/entry/32: Enter the kernel via trampoline stack Use the entry-stack as a trampoline to enter the kernel. The entry-stack is already in the cpu_entry_area and will be mapped to userspace when PTI is enabled. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-8-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 119 +++++++++++++++++++++++++------ arch/x86/include/asm/switch_to.h | 14 +++- arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/cpu/common.c | 5 +- arch/x86/kernel/process.c | 2 - arch/x86/kernel/process_32.c | 2 - 6 files changed, 115 insertions(+), 28 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7251c4f3e99e..fea49ec345ba 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -154,7 +154,7 @@ #endif /* CONFIG_X86_32_LAZY_GS */ -.macro SAVE_ALL pt_regs_ax=%eax +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 cld PUSH_GS pushl %fs @@ -173,6 +173,12 @@ movl $(__KERNEL_PERCPU), %edx movl %edx, %fs SET_KERNEL_GS %edx + + /* Switch to kernel stack if necessary */ +.if \switch_stacks > 0 + SWITCH_TO_KERNEL_STACK +.endif + .endm /* @@ -269,6 +275,73 @@ .Lend_\@: #endif /* CONFIG_X86_ESPFIX32 */ .endm + + +/* + * Called with pt_regs fully populated and kernel segments loaded, + * so we can access PER_CPU and use the integer registers. + * + * We need to be very careful here with the %esp switch, because an NMI + * can happen everywhere. If the NMI handler finds itself on the + * entry-stack, it will overwrite the task-stack and everything we + * copied there. So allocate the stack-frame on the task-stack and + * switch to it before we do any copying. + */ +.macro SWITCH_TO_KERNEL_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + /* Are we on the entry stack? Bail out if not! 
*/ + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %esp, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx + jae .Lend_\@ + + /* Load stack pointer into %esi and %edi */ + movl %esp, %esi + movl %esi, %edi + + /* Move %edi to the top of the entry stack */ + andl $(MASK_entry_stack), %edi + addl $(SIZEOF_entry_stack), %edi + + /* Load top of task-stack into %edi */ + movl TSS_entry2task_stack(%edi), %edi + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esi) + jz .Lcopy_pt_regs_\@ + + /* + * Stack-frame contains 4 additional segment registers when + * coming from VM86 mode + */ + addl $(4 * 4), %ecx + +.Lcopy_pt_regs_\@: +#endif + + /* Allocate frame on task-stack */ + subl %ecx, %edi + + /* Switch to task-stack */ + movl %edi, %esp + + /* + * We are now on the task-stack and can safely copy over the + * stack-frame + */ + shrl $2, %ecx + cld + rep movsl + +.Lend_\@: +.endm + /* * %eax: prev task * %edx: next task @@ -469,7 +542,7 @@ ENTRY(entry_SYSENTER_32) pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */ /* * SYSENTER doesn't filter flags, so we need to clear NT, AC @@ -580,7 +653,8 @@ ENDPROC(entry_SYSENTER_32) ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + + SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */ /* * User mode is traced as though IRQs are on, and the interrupt gate @@ -677,7 +751,8 @@ END(irq_entries_start) common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ - SAVE_ALL + + SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax @@ -685,16 +760,16 @@ common_interrupt: jmp ret_from_intr ENDPROC(common_interrupt) -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL; \ - ENCODE_FRAME_POINTER; \ - TRACE_IRQS_OFF \ - movl %esp, %eax; \ - call fn; \ - jmp ret_from_intr; \ +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL switch_stacks=1; \ + ENCODE_FRAME_POINTER; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ ENDPROC(name) #define BUILD_INTERRUPT(name, nr) \ @@ -926,16 +1001,20 @@ common_exception: pushl %es pushl %ds pushl %eax + movl $(__USER_DS), %eax + movl %eax, %ds + movl %eax, %es + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs pushl %ebp pushl %edi pushl %esi pushl %edx pushl %ecx pushl %ebx + SWITCH_TO_KERNEL_STACK ENCODE_FRAME_POINTER cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs UNWIND_ESPFIX_STACK GS_TO_REG %ecx movl PT_GS(%esp), %edi # get the function address @@ -943,9 +1022,6 @@ common_exception: movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi @@ -964,6 +1040,7 @@ ENTRY(debug) */ ASM_CLAC pushl $-1 # mark this as an int + SAVE_ALL ENCODE_FRAME_POINTER xorl %edx, %edx # error code 0 @@ -999,6 +1076,7 @@ END(debug) */ ENTRY(nmi) ASM_CLAC + #ifdef CONFIG_X86_ESPFIX32 pushl %eax movl %ss, %eax @@ -1066,7 +1144,8 @@ END(nmi) ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int - SAVE_ALL + + SAVE_ALL switch_stacks=1 
ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index eb5f7999a893..8bc2f70b452e 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -89,13 +89,23 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. */ static inline void update_sp0(struct task_struct *task) { - /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ + /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 - load_sp0(task->thread.sp0); + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task->thread.sp0); + else + this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else + /* + * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That + * doesn't work on x86-32 because sp1 and + * cpu_current_top_of_stack have different values (because of + * the non-zero stack-padding on 32bit). + */ if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif + } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a1e16286a832..01de31db300d 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -103,6 +103,7 @@ void common(void) { OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); /* Offset for sp0 and sp1 into the tss_struct */ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index eb4cb3efd20e..43a927eb9c09 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1804,11 +1804,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, curr); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 30ca2d1a9231..c93fcfdf1673 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,14 +57,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, -#ifdef CONFIG_X86_64 /* * .sp1 is cpu_current_top_of_stack. The init task never * runs user code, but cpu_current_top_of_stack should still * be well defined before the first context switch. 
*/ .sp1 = TOP_OF_INIT_STACK, -#endif #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ec62cc77388d..0ae659de21eb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,8 +290,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* SYSENTER reads the task-stack from tss.sp1 */ - this_cpu_write(cpu_tss_rw.x86_tss.sp1, next_p->thread.sp0); /* * Restore %gs if needed (which is common) From e5862d0515ad970ccec6208ecf5bb0cffe291ea3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:45 +0200 Subject: [PATCH 10/60] x86/entry/32: Leave the kernel via trampoline stack Switch back to the trampoline stack before returning to userspace. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-9-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 79 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index fea49ec345ba..a905e6215ea9 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -342,6 +342,60 @@ .Lend_\@: .endm +/* + * Switch back from the kernel stack to the entry stack. + * + * The %esp register must point to pt_regs on the task stack. It will + * first calculate the size of the stack-frame to copy, depending on + * whether we return to VM86 mode or not. With that it uses 'rep movsl' + * to copy the contents of the stack over to the entry stack. + * + * We must be very careful here, as we can't trust the contents of the + * task-stack once we switched to the entry-stack. When an NMI happens + * while on the entry-stack, the NMI handler will switch back to the top + * of the task stack, overwriting our stack-frame we are about to copy. + * Therefore we switch the stack only after everything is copied over. 
+ */ +.macro SWITCH_TO_ENTRY_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp) + jz .Lcopy_pt_regs_\@ + + /* Additional 4 registers to copy when returning to VM86 mode */ + addl $(4 * 4), %ecx + +.Lcopy_pt_regs_\@: +#endif + + /* Initialize source and destination for movsl */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + subl %ecx, %edi + movl %esp, %esi + + /* Save future stack pointer in %ebx */ + movl %edi, %ebx + + /* Copy over the stack-frame */ + shrl $2, %ecx + cld + rep movsl + + /* + * Switch to entry-stack - needs to happen after everything is + * copied because the NMI handler will overwrite the task-stack + * when on entry-stack + */ + movl %ebx, %esp + +.Lend_\@: +.endm + /* * %eax: prev task * %edx: next task @@ -581,25 +635,45 @@ ENTRY(entry_SYSENTER_32) /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ + + /* + * Setup entry stack - we keep the pointer in %eax and do the + * switch after almost all user-state is restored. + */ + + /* Load entry stack pointer and allocate frame for eflags/eax */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax + subl $(2*4), %eax + + /* Copy eflags and eax to entry stack */ + movl PT_EFLAGS(%esp), %edi + movl PT_EAX(%esp), %esi + movl %edi, (%eax) + movl %esi, 4(%eax) + + /* Restore user registers and segments */ movl PT_EIP(%esp), %edx /* pt_regs->ip */ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ 1: mov PT_FS(%esp), %fs PTGS_TO_GS + popl %ebx /* pt_regs->bx */ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ popl %esi /* pt_regs->si */ popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ - popl %eax /* pt_regs->ax */ + + /* Switch to entry stack */ + movl %eax, %esp /* * Restore all flags except IF. (We restore IF separately because * STI gives a one-instruction window in which we won't be interrupted, * whereas POPF does not.) */ - addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ btrl $X86_EFLAGS_IF_BIT, (%esp) popfl + popl %eax /* * Return back to the vDSO, which will pop ecx and edx. @@ -668,6 +742,7 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET + SWITCH_TO_ENTRY_STACK .Lrestore_all_notrace: CHECK_AND_APPLY_ESPFIX .Lrestore_nocheck: From 8b376fae0514dc7ee04786e2327169e39d12e51b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:46 +0200 Subject: [PATCH 11/60] x86/entry/32: Introduce SAVE_ALL_NMI and RESTORE_ALL_NMI These macros will be used in the NMI handler code and replace plain SAVE_ALL and RESTORE_REGS there. The NMI-specific CR3-switch will be added to these macros later. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-10-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a905e6215ea9..763592596727 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -181,6 +181,9 @@ .endm +.macro SAVE_ALL_NMI + SAVE_ALL +.endm /* * This is a sneaky trick to help the unwinder find pt_regs on the stack. The * frame pointer is replaced with an encoded pointer to pt_regs. The encoding @@ -227,6 +230,10 @@ POP_GS_EX .endm +.macro RESTORE_ALL_NMI pop=0 + RESTORE_REGS pop=\pop +.endm + .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) @@ -1161,7 +1168,7 @@ ENTRY(nmi) #endif pushl %eax # pt_regs->orig_ax - SAVE_ALL + SAVE_ALL_NMI ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1189,7 +1196,7 @@ ENTRY(nmi) .Lnmi_return: CHECK_AND_APPLY_ESPFIX - RESTORE_REGS 4 + RESTORE_ALL_NMI pop=4 jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 @@ -1205,12 +1212,12 @@ ENTRY(nmi) pushl 16(%esp) .endr pushl %eax - SAVE_ALL + SAVE_ALL_NMI ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi - RESTORE_REGS + RESTORE_ALL_NMI lss 12+4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif From b92a165df17ee6e616e43107730f06bf6ecf5d8d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:47 +0200 Subject: [PATCH 12/60] x86/entry/32: Handle Entry from Kernel-Mode on Entry-Stack It is possible that the kernel is entered from kernel-mode and on the entry-stack. The most common way this happens is when an exception is triggered while loading the user-space segment registers on the kernel-to-userspace exit path. The segment loading needs to be done after the entry-stack switch, because the stack-switch needs kernel %fs for per_cpu access. When this happens, make sure to leave the kernel with the entry-stack again, so that the interrupted code-path runs on the right stack when switching to the user-cr3. Detect this condition on kernel-entry by checking CS.RPL and %esp, and if it happens, copy over the complete content of the entry stack to the task-stack. This needs to be done because once the exception handler is entereed, the task might be scheduled out or even migrated to a different CPU, so this cannot rely on the entry-stack contents. Leave a marker in the stack-frame to detect this condition on the exit path. On the exit path the copy is reversed, copy all of the remaining task-stack back to the entry-stack and switch to it. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-11-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 116 +++++++++++++++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 763592596727..9d6eceba0461 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,6 +294,9 @@ * copied there. So allocate the stack-frame on the task-stack and * switch to it before we do any copying. */ + +#define CS_FROM_ENTRY_STACK (1 << 31) + .macro SWITCH_TO_KERNEL_STACK ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV @@ -316,6 +319,16 @@ /* Load top of task-stack into %edi */ movl TSS_entry2task_stack(%edi), %edi + /* + * Clear unused upper bits of the dword containing the word-sized CS + * slot in pt_regs in case hardware didn't clear it for us. + */ + andl $(0x0000ffff), PT_CS(%esp) + + /* Special case - entry from kernel mode via entry stack */ + testl $SEGMENT_RPL_MASK, PT_CS(%esp) + jz .Lentry_from_kernel_\@ + /* Bytes to copy */ movl $PTREGS_SIZE, %ecx @@ -329,8 +342,8 @@ */ addl $(4 * 4), %ecx -.Lcopy_pt_regs_\@: #endif +.Lcopy_pt_regs_\@: /* Allocate frame on task-stack */ subl %ecx, %edi @@ -346,6 +359,56 @@ cld rep movsl + jmp .Lend_\@ + +.Lentry_from_kernel_\@: + + /* + * This handles the case when we enter the kernel from + * kernel-mode and %esp points to the entry-stack. When this + * happens we need to switch to the task-stack to run C code, + * but switch back to the entry-stack again when we approach + * iret and return to the interrupted code-path. This usually + * happens when we hit an exception while restoring user-space + * segment registers on the way back to user-space. + * + * When we switch to the task-stack here, we can't trust the + * contents of the entry-stack anymore, as the exception handler + * might be scheduled out or moved to another CPU. Therefore we + * copy the complete entry-stack to the task-stack and set a + * marker in the iret-frame (bit 31 of the CS dword) to detect + * what we've done on the iret path. + * + * On the iret path we copy everything back and switch to the + * entry-stack, so that the interrupted kernel code-path + * continues on the same stack it was interrupted with. + * + * Be aware that an NMI can happen anytime in this code. + * + * %esi: Entry-Stack pointer (same as %esp) + * %edi: Top of the task stack + */ + + /* Calculate number of bytes on the entry stack in %ecx */ + movl %esi, %ecx + + /* %ecx to the top of entry-stack */ + andl $(MASK_entry_stack), %ecx + addl $(SIZEOF_entry_stack), %ecx + + /* Number of bytes on the entry stack to %ecx */ + sub %esi, %ecx + + /* Mark stackframe as coming from entry stack */ + orl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + + /* + * %esi and %edi are unchanged, %ecx contains the number of + * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate + * the stack-frame on task-stack and copy everything over + */ + jmp .Lcopy_pt_regs_\@ + .Lend_\@: .endm @@ -403,6 +466,56 @@ .Lend_\@: .endm +/* + * This macro handles the case when we return to kernel-mode on the iret + * path and have to switch back to the entry stack. + * + * See the comments below the .Lentry_from_kernel_\@ label in the + * SWITCH_TO_KERNEL_STACK macro for more details. + */ +.macro PARANOID_EXIT_TO_KERNEL_MODE + + /* + * Test if we entered the kernel with the entry-stack. Most + * likely we did not, because this code only runs on the + * return-to-kernel path. 
+ */ + testl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + jz .Lend_\@ + + /* Unlikely slow-path */ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp) + + /* Copy the remaining task-stack contents to entry-stack */ + movl %esp, %esi + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + + /* Bytes on the task-stack to ecx */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx + subl %esi, %ecx + + /* Allocate stack-frame on entry-stack */ + subl %ecx, %edi + + /* + * Save future stack-pointer, we must not switch until the + * copy is done, otherwise the NMI handler could destroy the + * contents of the task-stack we are about to copy. + */ + movl %edi, %ebx + + /* Do the copy */ + shrl $2, %ecx + cld + rep movsl + + /* Safe to switch to entry-stack now */ + movl %ebx, %esp + +.Lend_\@: +.endm /* * %eax: prev task * %edx: next task @@ -764,6 +877,7 @@ restore_all: restore_all_kernel: TRACE_IRQS_IRET + PARANOID_EXIT_TO_KERNEL_MODE RESTORE_REGS 4 jmp .Lirq_return From 929b44eb5739bf11d4a9bce85d7346bd955fc24d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:48 +0200 Subject: [PATCH 13/60] x86/entry/32: Simplify debug entry point The common exception entry code now handles the entry-from-sysenter stack situation and makes sure to leave with the same stack as it entered the kernel. So there is no need anymore for the special handling in the debug entry code. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-12-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 9d6eceba0461..dbf7d619dcd6 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1226,41 +1226,12 @@ END(common_exception) ENTRY(debug) /* - * #DB can happen at the first instruction of - * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this - * happens, then we will be running on a very small stack. We - * need to detect this condition and switch to the thread - * stack before calling any C code at all. - * - * If you edit this code, keep in mind that NMIs can happen in here. + * Entry from sysenter is now handled in common_exception */ ASM_CLAC pushl $-1 # mark this as an int - - SAVE_ALL - ENCODE_FRAME_POINTER - xorl %edx, %edx # error code 0 - movl %esp, %eax # pt_regs pointer - - /* Are we currently on the SYSENTER stack? */ - movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx - subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ - cmpl $SIZEOF_entry_stack, %ecx - jb .Ldebug_from_sysenter_stack - - TRACE_IRQS_OFF - call do_debug - jmp ret_from_exception - -.Ldebug_from_sysenter_stack: - /* We're on the SYSENTER stack. Switch off. 
*/ - movl %esp, %ebx - movl PER_CPU_VAR(cpu_current_top_of_stack), %esp - TRACE_IRQS_OFF - call do_debug - movl %ebx, %esp - jmp ret_from_exception + pushl $do_debug + jmp common_exception END(debug) /* From e464fb9f241ddf46815b31ca594af96f2699a78e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:49 +0200 Subject: [PATCH 14/60] x86/entry/32: Add PTI cr3 switch to non-NMI entry/exit points Add unconditional cr3 switches between user and kernel cr3 to all non-NMI entry and exit points. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-13-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 86 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index dbf7d619dcd6..60b28dfa00dc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -77,6 +77,8 @@ #endif .endm +#define PTI_SWITCH_MASK (1 << PAGE_SHIFT) + /* * User gs save/restore * @@ -154,6 +156,33 @@ #endif /* CONFIG_X86_32_LAZY_GS */ +/* Unconditionally switch to user cr3 */ +.macro SWITCH_TO_USER_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + movl %cr3, \scratch_reg + orl $PTI_SWITCH_MASK, \scratch_reg + movl \scratch_reg, %cr3 +.Lend_\@: +.endm + +/* + * Switch to kernel cr3 if not already loaded and return current cr3 in + * \scratch_reg + */ +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + movl %cr3, \scratch_reg + /* Test if we are already on kernel CR3 */ + testl $PTI_SWITCH_MASK, \scratch_reg + jz .Lend_\@ + andl $(~PTI_SWITCH_MASK), \scratch_reg + movl \scratch_reg, %cr3 + /* Return original CR3 in \scratch_reg */ + orl $PTI_SWITCH_MASK, \scratch_reg +.Lend_\@: +.endm + .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 cld PUSH_GS @@ -283,7 +312,6 @@ #endif /* CONFIG_X86_ESPFIX32 */ .endm - /* * Called with pt_regs fully populated and kernel segments loaded, * so we can access PER_CPU and use the integer registers. @@ -296,11 +324,19 @@ */ #define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) .macro SWITCH_TO_KERNEL_STACK ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + + /* + * %eax now contains the entry cr3 and we carry it forward in + * that register for the time this macro runs + */ + /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -370,7 +406,8 @@ * but switch back to the entry-stack again when we approach * iret and return to the interrupted code-path. This usually * happens when we hit an exception while restoring user-space - * segment registers on the way back to user-space. + * segment registers on the way back to user-space or when the + * sysenter handler runs with eflags.tf set. 
* * When we switch to the task-stack here, we can't trust the * contents of the entry-stack anymore, as the exception handler @@ -387,6 +424,7 @@ * * %esi: Entry-Stack pointer (same as %esp) * %edi: Top of the task stack + * %eax: CR3 on kernel entry */ /* Calculate number of bytes on the entry stack in %ecx */ @@ -402,6 +440,14 @@ /* Mark stackframe as coming from entry stack */ orl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + /* + * Test the cr3 used to enter the kernel and add a marker + * so that we can switch back to it before iret. + */ + testl $PTI_SWITCH_MASK, %eax + jz .Lcopy_pt_regs_\@ + orl $CS_FROM_USER_CR3, PT_CS(%esp) + /* * %esi and %edi are unchanged, %ecx contains the number of * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate @@ -468,7 +514,7 @@ /* * This macro handles the case when we return to kernel-mode on the iret - * path and have to switch back to the entry stack. + * path and have to switch back to the entry stack and/or user-cr3 * * See the comments below the .Lentry_from_kernel_\@ label in the * SWITCH_TO_KERNEL_STACK macro for more details. @@ -514,6 +560,18 @@ /* Safe to switch to entry-stack now */ movl %ebx, %esp + /* + * We came from entry-stack and need to check if we also need to + * switch back to user cr3. + */ + testl $CS_FROM_USER_CR3, PT_CS(%esp) + jz .Lend_\@ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_USER_CR3), PT_CS(%esp) + + SWITCH_TO_USER_CR3 scratch_reg=%eax + .Lend_\@: .endm /* @@ -707,7 +765,20 @@ ENTRY(xen_sysenter_target) * 0(%ebp) arg6 */ ENTRY(entry_SYSENTER_32) + /* + * On entry-stack with all userspace-regs live - save and + * restore eflags and %eax to use it as scratch-reg for the cr3 + * switch. + */ + pushfl + pushl %eax + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + popl %eax + popfl + + /* Stack empty again, switch to task stack */ movl TSS_entry2task_stack(%esp), %esp + .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ @@ -786,6 +857,9 @@ ENTRY(entry_SYSENTER_32) /* Switch to entry stack */ movl %eax, %esp + /* Now ready to switch the cr3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax + /* * Restore all flags except IF. (We restore IF separately because * STI gives a one-instruction window in which we won't be interrupted, @@ -866,7 +940,11 @@ restore_all: .Lrestore_all_notrace: CHECK_AND_APPLY_ESPFIX .Lrestore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code + /* Switch back to user CR3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax + + /* Restore user state */ + RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization From b65bef400689ceee7108c2d47fb97ae91f4d1440 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:50 +0200 Subject: [PATCH 15/60] x86/entry/32: Add PTI CR3 switches to NMI handler code The NMI handler is special, as it needs to leave with the same CR3 as it was entered with. This is required because the NMI can happen within kernel context but with user CR3 already loaded, i.e. after switching to user CR3 but before returning to user space. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-14-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 60b28dfa00dc..b1541c74c71a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -210,8 +210,19 @@ .endm -.macro SAVE_ALL_NMI +.macro SAVE_ALL_NMI cr3_reg:req SAVE_ALL + + /* + * Now switch the CR3 when PTI is enabled. + * + * We can enter with either user or kernel cr3, the code will + * store the old cr3 in \cr3_reg and switches to the kernel cr3 + * if necessary. + */ + SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg + +.Lend_\@: .endm /* * This is a sneaky trick to help the unwinder find pt_regs on the stack. The @@ -259,7 +270,23 @@ POP_GS_EX .endm -.macro RESTORE_ALL_NMI pop=0 +.macro RESTORE_ALL_NMI cr3_reg:req pop=0 + /* + * Now switch the CR3 when PTI is enabled. + * + * We enter with kernel cr3 and switch the cr3 to the value + * stored on \cr3_reg, which is either a user or a kernel cr3. + */ + ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI + + testl $PTI_SWITCH_MASK, \cr3_reg + jz .Lswitched_\@ + + /* User cr3 in \cr3_reg - write it to hardware cr3 */ + movl \cr3_reg, %cr3 + +.Lswitched_\@: + RESTORE_REGS pop=\pop .endm @@ -1331,7 +1358,7 @@ ENTRY(nmi) #endif pushl %eax # pt_regs->orig_ax - SAVE_ALL_NMI + SAVE_ALL_NMI cr3_reg=%edi ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1359,7 +1386,7 @@ ENTRY(nmi) .Lnmi_return: CHECK_AND_APPLY_ESPFIX - RESTORE_ALL_NMI pop=4 + RESTORE_ALL_NMI cr3_reg=%edi pop=4 jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 @@ -1375,12 +1402,12 @@ ENTRY(nmi) pushl 16(%esp) .endr pushl %eax - SAVE_ALL_NMI + SAVE_ALL_NMI cr3_reg=%edi ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi - RESTORE_ALL_NMI + RESTORE_ALL_NMI cr3_reg=%edi lss 12+4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif From 252e1a0526304f0f3f6888fc09e81cb220f957f3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:51 +0200 Subject: [PATCH 16/60] x86/entry: Rename update_sp0 to update_task_stack The function does not update sp0 anymore but updates makes the task-stack visible for entry code. This is by either writing it to sp1 or by doing a hypercall. Rename the function to get rid of the misleading name. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-15-git-send-email-joro@8bytes.org --- arch/x86/include/asm/switch_to.h | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8bc2f70b452e..36bd243843d6 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -87,7 +87,7 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) #endif /* This is used when switching tasks or entering/exiting vm86 mode. */ -static inline void update_sp0(struct task_struct *task) +static inline void update_task_stack(struct task_struct *task) { /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0ae659de21eb..2924fd447e61 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -285,7 +285,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - update_sp0(next_p); + update_task_stack(next_p); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 12bb445fb98d..476e3ddf8890 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -478,7 +478,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ - update_sp0(next_p); + update_task_stack(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9d0b5af7db91..1c03e4aa6474 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - update_sp0(tsk); + update_task_stack(tsk); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; preempt_enable(); @@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - update_sp0(tsk); + update_task_stack(tsk); preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) From 23b772883d1ddcf7fdf883614b88b2a6205db4da Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:52 +0200 Subject: [PATCH 17/60] x86/pgtable: Rename pti_set_user_pgd() to pti_set_user_pgtbl() The way page-table folding is implemented on 32 bit, these functions are not only setting, but also PUDs and even PMDs. Give the function a more generic name to reflect that. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-16-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable_64.h | 12 ++++++------ arch/x86/mm/pti.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 3c5385f9a88f..9406c4fe1d38 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -196,21 +196,21 @@ static inline bool pgdp_maps_userspace(void *__ptr) } #ifdef CONFIG_PAGE_TABLE_ISOLATION -pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* * Take a PGD location (pgdp) and a pgd value that needs to be set there. * Populates the user and returns the resulting PGD that must be set in * the kernel copy of the page tables. */ -static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { if (!static_cpu_has(X86_FEATURE_PTI)) return pgd; - return __pti_set_user_pgd(pgdp, pgd); + return __pti_set_user_pgtbl(pgdp, pgd); } #else -static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } @@ -226,7 +226,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) } pgd = native_make_pgd(native_p4d_val(p4d)); - pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); + pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); *p4dp = native_make_p4d(native_pgd_val(pgd)); } @@ -237,7 +237,7 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pti_set_user_pgd(pgdp, pgd); + *pgdp = pti_set_user_pgtbl(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 9277e9ba92b5..71fba17c9d7c 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -117,7 +117,7 @@ enable: setup_force_cpu_cap(X86_FEATURE_PTI); } -pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* * Changes to the high (kernel) portion of the kernelmode page From 7ffcf1497c8ab59a705bfafb7401876fd2f6f71e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:53 +0200 Subject: [PATCH 18/60] x86/pgtable/pae: Unshare kernel PMDs when PTI is enabled With PTI the per-process LDT must be mapped into the kernel address-space for each process, which requires separate kernel PMDs per PGD. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-17-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable-3level_types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 6a59a6d0cc50..78038e057876 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -21,9 +21,10 @@ typedef union { #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_PARAVIRT -#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) +#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \ + (pv_info.shared_kernel_pmd))) #else -#define SHARED_KERNEL_PMD 1 +#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif /* From e3238faf20fb1b51a814497751398ab525a2c884 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:54 +0200 Subject: [PATCH 19/60] x86/pgtable/32: Allocate 8k page-tables when PTI is enabled Allocate a kernel and a user page-table root when PTI is enabled. Also allocate a full page per root for PAE because otherwise the bit to flip in CR3 to switch between them would be non-constant, which creates a lot of hassle. Keep that for a later optimization. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-18-git-send-email-joro@8bytes.org --- arch/x86/kernel/head_32.S | 20 +++++++++++++++----- arch/x86/mm/pgtable.c | 5 +++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index abe6df15a8fb..30f9cb2c0b55 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -512,11 +512,18 @@ ENTRY(initial_code) ENTRY(setup_once_ref) .long setup_once +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#define PGD_ALIGN (2 * PAGE_SIZE) +#define PTI_USER_PGD_FILL 1024 +#else +#define PGD_ALIGN (PAGE_SIZE) +#define PTI_USER_PGD_FILL 0 +#endif /* * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE + .align PGD_ALIGN #ifdef CONFIG_X86_PAE .globl initial_pg_pmd initial_pg_pmd: @@ -526,14 +533,17 @@ initial_pg_pmd: initial_page_table: .fill 1024,4,0 #endif + .align PGD_ALIGN initial_pg_fixmap: .fill 1024,4,0 +.globl swapper_pg_dir + .align PGD_ALIGN +swapper_pg_dir: + .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 .globl empty_zero_page empty_zero_page: .fill 4096,1,0 -.globl swapper_pg_dir -swapper_pg_dir: - .fill 1024,4,0 EXPORT_SYMBOL(empty_zero_page) /* @@ -542,7 +552,7 @@ EXPORT_SYMBOL(empty_zero_page) #ifdef CONFIG_X86_PAE __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? 
*/ - .align PAGE_SIZE + .align PGD_ALIGN ENTRY(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 47b5951e592b..db6fb7740bf7 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -343,7 +343,8 @@ static inline pgd_t *_pgd_alloc(void) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, + PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate @@ -355,7 +356,7 @@ static inline pgd_t *_pgd_alloc(void) static inline void _pgd_free(pgd_t *pgd) { if (!SHARED_KERNEL_PMD) - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); else kmem_cache_free(pgd_cache, pgd); } From 8372d66865deb45ee3ec21401a9c80f231b728c8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:55 +0200 Subject: [PATCH 20/60] x86/pgtable: Move pgdp kernel/user conversion functions to pgtable.h Make them available on 32 bit and clone_pgd_range() happy. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-19-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable.h | 49 +++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 49 ------------------------------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5715647fc4fe..eb474329d751 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1155,6 +1155,55 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). 
+ */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr |= BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr &= ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 9406c4fe1d38..4adba19c7bf6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -132,55 +132,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } -#ifdef CONFIG_PAGE_TABLE_ISOLATION -/* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages - * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and - * the user one is in the last 4k. To switch between them, you - * just need to flip the 12th bit in their addresses. - */ -#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT - -/* - * This generates better code than the inline assembly in - * __set_bit(). - */ -static inline void *ptr_set_bit(void *ptr, int bit) -{ - unsigned long __ptr = (unsigned long)ptr; - - __ptr |= BIT(bit); - return (void *)__ptr; -} -static inline void *ptr_clear_bit(void *ptr, int bit) -{ - unsigned long __ptr = (unsigned long)ptr; - - __ptr &= ~BIT(bit); - return (void *)__ptr; -} - -static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) -{ - return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) -{ - return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) -{ - return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) -{ - return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -} -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ - /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. From fcbbd977572cfe5a3dcc97d663bf7480431a07ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:56 +0200 Subject: [PATCH 21/60] x86/pgtable: Move pti_set_user_pgtbl() to pgtable.h There it is also usable from 32 bit code. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-20-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index eb474329d751..cc117161f13d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -640,8 +640,31 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgtbl(pgdp, pgd); +} +#else /* CONFIG_PAGE_TABLE_ISOLATION */ +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + #endif /* __ASSEMBLY__ */ + #ifdef CONFIG_X86_32 # include #else From 76e258add7b653b60037ee4b25ebc40da6a35c4a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:57 +0200 Subject: [PATCH 22/60] x86/pgtable: Move two more functions from pgtable_64.h to pgtable.h These two functions are required for PTI on 32 bit: * pgdp_maps_userspace() * pgd_large() Also re-implement pgdp_maps_userspace() so that it will work on 64 and 32 bit kernels. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-21-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable-2level_types.h | 3 ++ arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable.h | 15 +++++++++ arch/x86/include/asm/pgtable_32.h | 2 -- arch/x86/include/asm/pgtable_64.h | 36 --------------------- arch/x86/include/asm/pgtable_64_types.h | 2 ++ 6 files changed, 21 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index f982ef808e7e..6deb6cd236e3 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -35,4 +35,7 @@ typedef union { #define PTRS_PER_PTE 1024 +/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */ +#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT) + #endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 78038e057876..858358a82b14 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -46,5 +46,6 @@ typedef union { #define PTRS_PER_PTE 512 #define MAX_POSSIBLE_PHYSMEM_BITS 36 +#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT) #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index cc117161f13d..e39088cb59ab 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1177,6 +1177,21 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } } #endif +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. + */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); +} + +static inline int pgd_large(pgd_t pgd) { return 0; } #ifdef CONFIG_PAGE_TABLE_ISOLATION /* diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 88a056b01db4..b3ec519e3982 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -34,8 +34,6 @@ static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); -static inline int pgd_large(pgd_t pgd) { return 0; } - /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4adba19c7bf6..acb6970e7bcf 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -132,41 +132,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } -/* - * Page table pages are page-aligned. The lower half of the top - * level is used for userspace and the top half for the kernel. - * - * Returns true for parts of the PGD that map userspace and - * false for the parts that map the kernel. 
- */ -static inline bool pgdp_maps_userspace(void *__ptr) -{ - unsigned long ptr = (unsigned long)__ptr; - - return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); -} - -#ifdef CONFIG_PAGE_TABLE_ISOLATION -pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); - -/* - * Take a PGD location (pgdp) and a pgd value that needs to be set there. - * Populates the user and returns the resulting PGD that must be set in - * the kernel copy of the page tables. - */ -static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) -{ - if (!static_cpu_has(X86_FEATURE_PTI)) - return pgd; - return __pti_set_user_pgtbl(pgdp, pgd); -} -#else -static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) -{ - return pgd; -} -#endif - static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; @@ -206,7 +171,6 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); /* * Level 4 access. */ -static inline int pgd_large(pgd_t pgd) { return 0; } #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) /* PUD - Level3 access */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 054765ab2da2..066e0ab9ffab 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -153,4 +153,6 @@ extern unsigned int ptrs_per_p4d; #define EARLY_DYNAMIC_PAGE_TABLES 64 +#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ From 6c0df8689494e1fefa685377676fa8192291a0eb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:58 +0200 Subject: [PATCH 23/60] x86/mm/pae: Populate valid user PGD entries Generic page-table code populates all non-leaf entries with _KERNPG_TABLE bits set. This is fine for all paging modes except PAE. In PAE mode only a subset of the bits is allowed to be set. Make sure to only set allowed bits by masking out the reserved bits. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-22-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable_types.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 99fff853c944..b64acb08a62b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -50,6 +50,7 @@ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) +#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) @@ -266,14 +267,37 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +#ifdef CONFIG_X86_PAE + +/* + * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't + * use it here. + */ + +#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK) +#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK) + +/* + * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries. + * All other bits are Reserved MBZ + */ +#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \ + _PAGE_PWT | _PAGE_PCD | \ + _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3) + +#else +/* No need to mask any bits for !PAE */ +#define PGD_ALLOWED_BITS (~0ULL) +#endif + static inline pgd_t native_make_pgd(pgdval_t val) { - return (pgd_t) { val }; + return (pgd_t) { val & PGD_ALLOWED_BITS }; } static inline pgdval_t native_pgd_val(pgd_t pgd) { - return pgd.pgd; + return pgd.pgd & PGD_ALLOWED_BITS; } static inline pgdval_t pgd_flags(pgd_t pgd) From 9b7b8bbd7f6ba4ef7caa5a078ead70237e12d045 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:59 +0200 Subject: [PATCH 24/60] x86/mm/pae: Populate the user page-table with user pgd's When a PGD entry is populated, make sure to populate it in the user page-table too. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-23-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable-3level.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f24df59c40b2..f2ca3139ca22 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -98,6 +98,9 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void native_set_pud(pud_t *pudp, pud_t pud) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); +#endif set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); } @@ -229,6 +232,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) { union split_pud res, *orig = (union split_pud *)pudp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0)); +#endif + /* xchg acts as a barrier before setting of the high bits */ res.pud_low = xchg(&orig->pud_low, 0); res.pud_high = orig->pud_high; From 1f40a46cf47c12d93a5ad9dccd82bd36ff8f956a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:00 +0200 Subject: [PATCH 25/60] x86/mm/legacy: Populate the user page-table with user pgd's Also populate the user-spage pgd's in the user page-table. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-24-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable-2level.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 685ffe8a0eaf..c399ea5eea41 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -19,6 +19,9 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd); +#endif *pmdp = pmd; } @@ -58,6 +61,9 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0)); +#endif return __pmd(xchg((pmdval_t *)xp, 0)); } #else @@ -67,6 +73,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #ifdef CONFIG_SMP static inline pud_t native_pudp_get_and_clear(pud_t *xp) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0)); +#endif return __pud(xchg((pudval_t *)xp, 0)); } #else From 935232ce28dfabff1171e5a7113b2d865fa9ee63 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:01 +0200 Subject: [PATCH 26/60] x86/mm/pti: Add an overflow check to pti_clone_pmds() The addr counter will overflow if the last PMD of the address space is cloned, resulting in an endless loop. 
Check for that and bail out of the loop when it happens. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-25-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 71fba17c9d7c..79217868dd13 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -297,6 +297,10 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) p4d_t *p4d; pud_t *pud; + /* Overflow check */ + if (addr < start) + break; + pgd = pgd_offset_k(addr); if (WARN_ON(pgd_none(*pgd))) return; From 2c1b9fbe83412598d2dccdd448147336b085e0c6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:02 +0200 Subject: [PATCH 27/60] x86/mm/pti: Define X86_CR3_PTI_PCID_USER_BIT on x86_32 Move it out of the X86_64 specific processor defines so that its visible for 32bit too. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Reviewed-by: Andy Lutomirski Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-26-git-send-email-joro@8bytes.org --- arch/x86/include/asm/processor-flags.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 625a52a5594f..02c2cbda4a74 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -39,10 +39,6 @@ #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) -#ifdef CONFIG_PAGE_TABLE_ISOLATION -# define X86_CR3_PTI_PCID_USER_BIT 11 -#endif - #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save @@ -53,4 +49,8 @@ #define CR3_NOFLUSH 0 #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_PCID_USER_BIT 11 +#endif + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ From f94560cd6b5117f8913f4c42f4d9a405c26ddc1c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:03 +0200 Subject: [PATCH 28/60] x86/mm/pti: Clone CPU_ENTRY_AREA on PMD level on x86_32 Cloning on the P4D level would clone the complete kernel address space into the user-space page-tables for PAE kernels. Cloning on PMD level is fine for PAE and legacy paging. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-27-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 79217868dd13..a594e3b6401a 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -348,6 +348,7 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) } } +#ifdef CONFIG_X86_64 /* * Clone a single p4d (i.e. a top-level entry on 4-level systems and a * next-level entry on 5-level systems. @@ -371,6 +372,25 @@ static void __init pti_clone_user_shared(void) pti_clone_p4d(CPU_ENTRY_AREA_BASE); } +#else /* CONFIG_X86_64 */ + +/* + * On 32 bit PAE systems with 1GB of Kernel address space there is only + * one pgd/p4d for the whole kernel. Cloning that would map the whole + * address space into the user page-tables, making PTI useless. So clone + * the page-table on the PMD level to prevent that. + */ +static void __init pti_clone_user_shared(void) +{ + unsigned long start, end; + + start = CPU_ENTRY_AREA_BASE; + end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); + + pti_clone_pmds(start, end, 0); +} +#endif /* CONFIG_X86_64 */ + /* * Clone the ESPFIX P4D into the user space visible page table */ From 39d668e04edad25abe184fb329ce35a131146ee5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:04 +0200 Subject: [PATCH 29/60] x86/mm/pti: Make pti_clone_kernel_text() compile on 32 bit The pti_clone_kernel_text() function references __end_rodata_hpage_align, which is only present on x86-64. This makes sense as the end of the rodata section is not huge-page aligned on 32 bit. Nevertheless a symbol is required for the function that points at the right address for both 32 and 64 bit. Introduce __end_rodata_aligned for that purpose and use it in pti_clone_kernel_text(). Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-28-git-send-email-joro@8bytes.org --- arch/x86/include/asm/sections.h | 1 + arch/x86/kernel/vmlinux.lds.S | 17 ++++++++++------- arch/x86/mm/pti.c | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 5c019d23d06b..4a911a382ade 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,6 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; +extern char __end_rodata_aligned[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5e1458f609a1..8bde0a419f86 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -55,19 +55,22 @@ jiffies_64 = jiffies; * so we can enable protection checks as well as retain 2MB large page * mappings for kernel text. */ -#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_RODATA_END \ +#define X86_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ - __end_rodata_hpage_align = .; + __end_rodata_hpage_align = .; \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); #else -#define X64_ALIGN_RODATA_BEGIN -#define X64_ALIGN_RODATA_END +#define X86_ALIGN_RODATA_BEGIN +#define X86_ALIGN_RODATA_END \ + . = ALIGN(PAGE_SIZE); \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END @@ -141,9 +144,9 @@ SECTIONS /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); - X64_ALIGN_RODATA_BEGIN + X86_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_RODATA_END + X86_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index a594e3b6401a..453d23760941 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -470,7 +470,7 @@ void pti_clone_kernel_text(void) * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); - unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long end = (unsigned long)__end_rodata_aligned; if (!pti_kernel_image_global_ok()) return; From 1ac228a7c87f697d1d01eb6362a6b5246705b0dd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:05 +0200 Subject: [PATCH 30/60] x86/mm/pti: Keep permissions when cloning kernel text in pti_clone_kernel_text() Mapping the kernel text area to user-space makes only sense if it has the same permissions as in the kernel page-table. If permissions are different this will cause a TLB reload when using the kernel page-table, which is as good as not mapping it at all. On 64-bit kernels this patch makes no difference, as the whole range cloned by pti_clone_kernel_text() is mapped RO anyway. On 32 bit there are writeable mappings in the range, so just keep the permissions as they are. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-29-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 453d23760941..e41ee93c430d 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -482,7 +482,7 @@ void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pmds(start, end, _PAGE_RW); + pti_clone_pmds(start, end, 0); } /* From b976690f5db26fbc7c2be413bfa0fbd270547a94 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:06 +0200 Subject: [PATCH 31/60] x86/mm/pti: Introduce pti_finalize() Introduce a new function to finalize the kernel mappings for the userspace page-table after all ro/nx protections have been applied to the kernel mappings. Also move the call to pti_clone_kernel_text() to that function so that it will run on 32 bit kernels too. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-30-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pti.h | 3 +-- arch/x86/mm/init_64.c | 6 ------ arch/x86/mm/pti.c | 14 +++++++++++++- include/linux/pti.h | 1 + init/main.c | 7 +++++++ 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 38a17f1d5c9d..5df09a0b80b8 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -6,10 +6,9 @@ #ifdef CONFIG_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); -extern void pti_clone_kernel_text(void); +extern void pti_finalize(void); #else static inline void pti_check_boottime_disable(void) { } -static inline void pti_clone_kernel_text(void) { } #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a688617c727e..9b19f9a8948e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1291,12 +1291,6 @@ void mark_rodata_ro(void) (unsigned long) __va(__pa_symbol(_sdata))); debug_checkwx(); - - /* - * Do this after all of the manipulation of the - * kernel text page tables are complete. 
- */ - pti_clone_kernel_text(); } int kern_addr_valid(unsigned long addr) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index e41ee93c430d..fcfb815d420f 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -462,7 +462,7 @@ static inline bool pti_kernel_image_global_ok(void) * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ -void pti_clone_kernel_text(void) +static void pti_clone_kernel_text(void) { /* * rodata is part of the kernel image and is normally @@ -526,3 +526,15 @@ void __init pti_init(void) pti_setup_espfix64(); pti_setup_vsyscall(); } + +/* + * Finalize the kernel mappings in the userspace page-table. + */ +void pti_finalize(void) +{ + /* + * Do this after all of the manipulation of the + * kernel text page tables are complete. + */ + pti_clone_kernel_text(); +} diff --git a/include/linux/pti.h b/include/linux/pti.h index 0174883a935a..1a941efcaa62 100644 --- a/include/linux/pti.h +++ b/include/linux/pti.h @@ -6,6 +6,7 @@ #include #else static inline void pti_init(void) { } +static inline void pti_finalize(void) { } #endif #endif diff --git a/init/main.c b/init/main.c index 3b4ada11ed52..fcfef46c935a 100644 --- a/init/main.c +++ b/init/main.c @@ -1065,6 +1065,13 @@ static int __ref kernel_init(void *unused) jump_label_invalidate_initmem(); free_initmem(); mark_readonly(); + + /* + * Kernel mappings are now finalized - update the userspace page-table + * to finalize PTI. + */ + pti_finalize(); + system_state = SYSTEM_RUNNING; numa_default_policy(); From ba0364e260ab37c02975557dbecc014a26072236 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:07 +0200 Subject: [PATCH 32/60] x86/mm/pti: Clone entry-text again in pti_finalize() The mapping for entry-text might have changed in the kernel after it was cloned to the user page-table. Clone again to update the user page-table to bring the mapping in sync with the kernel again. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-31-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index fcfb815d420f..a536ecc91847 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -404,7 +404,7 @@ static void __init pti_setup_espfix64(void) /* * Clone the populated PMDs of the entry and irqentry text and force it RO. */ -static void __init pti_clone_entry_text(void) +static void pti_clone_entry_text(void) { pti_clone_pmds((unsigned long) __entry_text_start, (unsigned long) __irqentry_text_end, @@ -528,13 +528,18 @@ void __init pti_init(void) } /* - * Finalize the kernel mappings in the userspace page-table. + * Finalize the kernel mappings in the userspace page-table. Some of the + * mappings for the kernel image might have changed since pti_init() + * cloned them. This is because parts of the kernel image have been + * mapped RO and/or NX. 
These changes need to be cloned again to the + * userspace page-table. */ void pti_finalize(void) { /* - * Do this after all of the manipulation of the - * kernel text page tables are complete. + * We need to clone everything (again) that maps parts of the + * kernel image. */ + pti_clone_entry_text(); pti_clone_kernel_text(); } From 4e8537e4a7a15402b87c424b22c25c9e59681d16 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:08 +0200 Subject: [PATCH 33/60] x86/mm/dump_pagetables: Define INIT_PGD Define INIT_PGD to point to the correct initial page-table for 32 and 64 bit and use it where needed. This fixes the build on 32 bit with CONFIG_PAGE_TABLE_ISOLATION enabled. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-32-git-send-email-joro@8bytes.org --- arch/x86/mm/dump_pagetables.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2f3c9196b834..e6fd0cdef9ad 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -111,6 +111,8 @@ static struct addr_marker address_markers[] = { [END_OF_SPACE_NR] = { -1, NULL } }; +#define INIT_PGD ((pgd_t *) &init_top_pgt) + #else /* CONFIG_X86_64 */ enum address_markers_idx { @@ -139,6 +141,8 @@ static struct addr_marker address_markers[] = { [END_OF_SPACE_NR] = { -1, NULL } }; +#define INIT_PGD (swapper_pg_dir) + #endif /* !CONFIG_X86_64 */ /* Multipliers for offsets within the PTEs */ @@ -496,11 +500,7 @@ static inline bool is_hypervisor_range(int idx) static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx, bool dmesg) { -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_top_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif + pgd_t *start = INIT_PGD; pgprotval_t prot, eff; int i; struct pg_state st = {}; @@ -566,7 +566,7 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); static void ptdump_walk_user_pgd_level_checkwx(void) { #ifdef CONFIG_PAGE_TABLE_ISOLATION - pgd_t *pgd = (pgd_t *) &init_top_pgt; + pgd_t *pgd = INIT_PGD; if (!static_cpu_has(X86_FEATURE_PTI)) return; From f59dbe9ca6707eb7ffd0e24359085651c2d7df48 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:09 +0200 Subject: [PATCH 34/60] x86/pgtable/pae: Use separate kernel PMDs for user page-table When PTI is enabled, separate kernel PMDs in the user page-table are required to map the per-process LDT for user-space. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-33-git-send-email-joro@8bytes.org --- arch/x86/mm/pgtable.c | 100 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index db6fb7740bf7..8e4e63d46d81 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -182,6 +182,14 @@ static void pgd_dtor(pgd_t *pgd) */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +/* + * We allocate separate PMDs for the kernel part of the user page-table + * when PTI is enabled. We need them to map the per-process LDT into the + * user-space page-table. + */ +#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ + KERNEL_PGD_PTRS : 0) + void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); @@ -202,14 +210,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 - +#define PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ -static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; - for(i = 0; i < PREALLOCATED_PMDS; i++) + for (i = 0; i < count; i++) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); @@ -217,7 +225,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) } } -static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; @@ -226,7 +234,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - for(i = 0; i < PREALLOCATED_PMDS; i++) { + for (i = 0; i < count; i++) { pmd_t *pmd = (pmd_t *)__get_free_page(gfp); if (!pmd) failed = true; @@ -241,7 +249,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) } if (failed) { - free_pmds(mm, pmds); + free_pmds(mm, pmds, count); return -ENOMEM; } @@ -254,23 +262,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) * preallocate which never got a corresponding vma will need to be * freed manually. 
*/ +static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = *pgdp; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + *pgdp = native_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); + } +} + static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; - for(i = 0; i < PREALLOCATED_PMDS; i++) { - pgd_t pgd = pgdp[i]; + for (i = 0; i < PREALLOCATED_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i]); - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); +#ifdef CONFIG_PAGE_TABLE_ISOLATION - pgdp[i] = native_make_pgd(0); + if (!static_cpu_has(X86_FEATURE_PTI)) + return; - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - mm_dec_nr_pmds(mm); - } - } + pgdp = kernel_to_user_pgdp(pgdp); + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); +#endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) @@ -296,6 +319,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ + pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + p4d_t *u_p4d; + pud_t *u_pud; + int i; + + u_p4d = p4d_offset(u_pgd, 0); + u_pud = pud_offset(u_p4d, 0); + + s_pgd += KERNEL_PGD_BOUNDARY; + u_pud += KERNEL_PGD_BOUNDARY; + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { + pmd_t *pmd = pmds[i]; + + memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, u_pud, pmd); + } + +} +#else +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ +} +#endif /* * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also * assumes that pgd should be in one page. @@ -376,6 +431,7 @@ static inline void _pgd_free(pgd_t *pgd) pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; + pmd_t *u_pmds[PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(); @@ -385,12 +441,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(mm, pmds) != 0) + if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; - if (paravirt_pgd_alloc(mm) != 0) + if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_user_pmds; + /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they @@ -400,13 +459,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; +out_free_user_pmds: + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: - free_pmds(mm, pmds); + free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: From f3e48e546c42e31c0c095a6f917a4ad64668608c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:10 +0200 Subject: [PATCH 35/60] x86/ldt: Reserve address-space range on 32 bit for the LDT Reserve 2MB/4MB of address-space for mapping the LDT to user-space on 32 bit PTI kernels. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-34-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable_32_types.h | 7 +++++-- arch/x86/mm/dump_pagetables.c | 9 +++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index d9a001a4a872..7297810747cf 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -50,13 +50,16 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ & PMD_MASK) -#define PKMAP_BASE \ +#define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) +#define PKMAP_BASE \ + ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK) + #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) #else -# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) +# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE) #endif #define MODULES_VADDR VMALLOC_START diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e6fd0cdef9ad..ccd92c4da57c 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -122,6 +122,9 @@ enum address_markers_idx { VMALLOC_END_NR, #ifdef CONFIG_HIGHMEM PKMAP_BASE_NR, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + LDT_NR, #endif CPU_ENTRY_AREA_NR, FIXADDR_START_NR, @@ -135,6 +138,9 @@ static struct addr_marker address_markers[] = { [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, #ifdef CONFIG_HIGHMEM [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { 0UL, "LDT remap" }, #endif [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, @@ -609,6 +615,9 @@ static int __init pt_dump_init(void) # endif address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; +# ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +# endif #endif return 0; } From 8195d869d118bc30bf0be8d0c5d8849d6f58529b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:11 +0200 Subject: [PATCH 36/60] x86/ldt: Define LDT_END_ADDR It marks the end of the address-space range reserved for the LDT. The LDT-code will use it when unmapping the LDT for user-space. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-35-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable_32_types.h | 2 ++ arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/kernel/ldt.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 7297810747cf..b0bc0fff5f1f 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -53,6 +53,8 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) +#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE) + #define PKMAP_BASE \ ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 066e0ab9ffab..04edd2d58211 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -115,6 +115,7 @@ extern unsigned int ptrs_per_p4d; #define LDT_PGD_ENTRY_L5 -112UL #define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) +#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE) #define __VMALLOC_BASE_L4 0xffffc90000000000UL #define __VMALLOC_BASE_L5 0xffa0000000000000UL diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c9b14020f4dd..e921b3d494d5 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -206,7 +206,7 @@ static void free_ldt_pgtables(struct mm_struct *mm) #ifdef CONFIG_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; - unsigned long end = start + (1UL << PGDIR_SHIFT); + unsigned long end = LDT_END_ADDR; if (!static_cpu_has(X86_FEATURE_PTI)) return; From 9bae3197e15dd5e03ce8e237db6fe4486b08a775 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:12 +0200 Subject: [PATCH 37/60] x86/ldt: Split out sanity check in map_ldt_struct() This splits out the mapping sanity check and the actual mapping of the LDT to user-space from the map_ldt_struct() function in a way so that it is re-usable for PAE paging. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-36-git-send-email-joro@8bytes.org --- arch/x86/kernel/ldt.c | 82 ++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index e921b3d494d5..69af9a0d57b7 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -100,6 +100,49 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return new_ldt; } +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +static void do_sanity_check(struct mm_struct *mm, + bool had_kernel_mapping, + bool had_user_mapping) +{ + if (mm->context.ldt) { + /* + * We already had an LDT. 
The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!had_user_mapping); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(had_user_mapping); + } +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pgd(kernel_to_user_pgdp(pgd), *pgd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + bool had_kernel = (pgd->pgd != 0); + bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. @@ -115,9 +158,8 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - bool is_vmalloc, had_top_level_entry; unsigned long va; + bool is_vmalloc; spinlock_t *ptl; pgd_t *pgd; int i; @@ -131,13 +173,15 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) */ WARN_ON(ldt->slot != -1); + /* Check if the current mappings are sane */ + sanity_check_ldt_mapping(mm); + /* * Did we already have the top level entry allocated? We can't * use pgd_none() for this because it doens't do anything on * 4-level page table kernels. */ pgd = pgd_offset(mm, LDT_BASE_ADDR); - had_top_level_entry = (pgd->pgd != 0); is_vmalloc = is_vmalloc_addr(ldt->entries); @@ -172,35 +216,25 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) pte_unmap_unlock(ptep, ptl); } - if (mm->context.ldt) { - /* - * We already had an LDT. The top-level entry should already - * have been allocated and synchronized with the usermode - * tables. - */ - WARN_ON(!had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) - WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); - } else { - /* - * This is the first time we're mapping an LDT for this process. - * Sync the pgd to the usermode tables. - */ - WARN_ON(had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) { - WARN_ON(kernel_to_user_pgdp(pgd)->pgd); - set_pgd(kernel_to_user_pgdp(pgd), *pgd); - } - } + /* Propagate LDT mapping to the user page-table */ + map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); ldt->slot = slot; -#endif return 0; } +#else /* !CONFIG_PAGE_TABLE_ISOLATION */ + +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + return 0; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + static void free_ldt_pgtables(struct mm_struct *mm) { #ifdef CONFIG_PAGE_TABLE_ISOLATION From 6df934b92a549cb3badb6d576f71aeb133e2f110 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:13 +0200 Subject: [PATCH 38/60] x86/ldt: Enable LDT user-mapping for PAE This adds the needed special case for PAE to get the LDT mapped into the user page-table when PTI is enabled. The big difference to the other paging modes is that on PAE there is no full top-level PGD entry available for the LDT, but only a PMD entry. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-37-git-send-email-joro@8bytes.org --- arch/x86/include/asm/mmu_context.h | 5 --- arch/x86/kernel/ldt.c | 53 ++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index bbc796eb0a3b..eeeb9289c764 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -71,12 +71,7 @@ struct ldt_struct { static inline void *ldt_slot_va(int slot) { -#ifdef CONFIG_X86_64 return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); -#else - BUG(); - return (void *)fix_to_virt(FIX_HOLE); -#endif } /* diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 69af9a0d57b7..733e6ace0fa4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -126,6 +126,57 @@ static void do_sanity_check(struct mm_struct *mm, } } +#ifdef CONFIG_X86_PAE + +static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) +{ + p4d_t *p4d; + pud_t *pud; + + if (pgd->pgd == 0) + return NULL; + + p4d = p4d_offset(pgd, va); + if (p4d_none(*p4d)) + return NULL; + + pud = pud_offset(p4d, va); + if (pud_none(*pud)) + return NULL; + + return pmd_offset(pud, va); +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pmd(u_pmd, *k_pmd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + bool had_kernel, had_user; + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + had_kernel = (k_pmd->pmd != 0); + had_user = (u_pmd->pmd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#else /* !CONFIG_X86_PAE */ + static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); @@ -143,6 +194,8 @@ static void sanity_check_ldt_mapping(struct mm_struct *mm) do_sanity_check(mm, had_kernel, had_user); } +#endif /* CONFIG_X86_PAE */ + /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. From 7757d607c6b31867777de42e1fb0210b9c5d8b70 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:14 +0200 Subject: [PATCH 39/60] x86/pti: Allow CONFIG_PAGE_TABLE_ISOLATION for x86_32 Allow PTI to be compiled on x86_32. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-38-git-send-email-joro@8bytes.org --- security/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/Kconfig b/security/Kconfig index c4302067a3ad..afa91c6f06bb 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -57,7 +57,7 @@ config SECURITY_NETWORK config PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" default y - depends on X86_64 && !UML + depends on X86 && !UML help This feature reduces the number of hardware side channels by ensuring that the majority of kernel addresses are not mapped From 5e8105950a8b3e03e805299b4d05020ee4eda31a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:15 +0200 Subject: [PATCH 40/60] x86/mm/pti: Add Warning when booting on a PCID capable CPU Warn the user in case the performance can be significantly improved by switching to a 64-bit kernel. Suggested-by: Andy Lutomirski Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-39-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index a536ecc91847..7b1c85759005 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -517,6 +517,28 @@ void __init pti_init(void) pr_info("enabled\n"); +#ifdef CONFIG_X86_32 + /* + * We check for X86_FEATURE_PCID here. But the init-code will + * clear the feature flag on 32 bit because the feature is not + * supported on 32 bit anyway. To print the warning we need to + * check with cpuid directly again. + */ + if (cpuid_ecx(0x1) && BIT(17)) { + /* Use printk to work around pr_fmt() */ + printk(KERN_WARNING "\n"); + printk(KERN_WARNING "************************************************************\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); + printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); + printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! 
**\n"); + printk(KERN_WARNING "************************************************************\n"); + } +#endif + pti_clone_user_shared(); /* Undo all global bits from the init pagetables in head_64.S: */ From 97193702c6d353e12aefc0fb2f73a98ca421cd56 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:16 +0200 Subject: [PATCH 41/60] x86/entry/32: Add debug code to check entry/exit CR3 Add code to check whether the kernel is entered and left with the correct CR3 and make it depend on CONFIG_DEBUG_ENTRY. This is needed because there is no NX protection of user-addresses in the kernel-CR3 on x86-32 and that type of bug would not be detected otherwise. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-40-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b1541c74c71a..010cdb41e3c7 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -166,6 +166,24 @@ .Lend_\@: .endm +.macro BUG_IF_WRONG_CR3 no_user_check=0 +#ifdef CONFIG_DEBUG_ENTRY + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + .if \no_user_check == 0 + /* coming from usermode? */ + testl $SEGMENT_RPL_MASK, PT_CS(%esp) + jz .Lend_\@ + .endif + /* On user-cr3? */ + movl %cr3, %eax + testl $PTI_SWITCH_MASK, %eax + jnz .Lend_\@ + /* From userspace with kernel cr3 - BUG */ + ud2 +.Lend_\@: +#endif +.endm + /* * Switch to kernel cr3 if not already loaded and return current cr3 in * \scratch_reg @@ -213,6 +231,8 @@ .macro SAVE_ALL_NMI cr3_reg:req SAVE_ALL + BUG_IF_WRONG_CR3 + /* * Now switch the CR3 when PTI is enabled. * @@ -224,6 +244,7 @@ .Lend_\@: .endm + /* * This is a sneaky trick to help the unwinder find pt_regs on the stack. The * frame pointer is replaced with an encoded pointer to pt_regs. The encoding @@ -287,6 +308,8 @@ .Lswitched_\@: + BUG_IF_WRONG_CR3 + RESTORE_REGS pop=\pop .endm @@ -357,6 +380,8 @@ ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + BUG_IF_WRONG_CR3 + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax /* @@ -799,6 +824,7 @@ ENTRY(entry_SYSENTER_32) */ pushfl pushl %eax + BUG_IF_WRONG_CR3 no_user_check=1 SWITCH_TO_KERNEL_CR3 scratch_reg=%eax popl %eax popfl @@ -893,6 +919,7 @@ ENTRY(entry_SYSENTER_32) * whereas POPF does not.) 
*/ btrl $X86_EFLAGS_IF_BIT, (%esp) + BUG_IF_WRONG_CR3 no_user_check=1 popfl popl %eax @@ -970,6 +997,8 @@ restore_all: /* Switch back to user CR3 */ SWITCH_TO_USER_CR3 scratch_reg=%eax + BUG_IF_WRONG_CR3 + /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: @@ -983,6 +1012,7 @@ restore_all: restore_all_kernel: TRACE_IRQS_IRET PARANOID_EXIT_TO_KERNEL_MODE + BUG_IF_WRONG_CR3 RESTORE_REGS 4 jmp .Lirq_return @@ -990,6 +1020,19 @@ restore_all_kernel: ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error + +#ifdef CONFIG_DEBUG_ENTRY + /* + * The stack-frame here is the one that iret faulted on, so its a + * return-to-user frame. We are on kernel-cr3 because we come here from + * the fixup code. This confuses the CR3 checker, so switch to user-cr3 + * as the checker expects it. + */ + pushl %eax + SWITCH_TO_USER_CR3 scratch_reg=%eax + popl %eax +#endif + jmp common_exception .previous _ASM_EXTABLE(.Lirq_return, iret_exc) From b2b7d986a89b6c94b1331a909de1217214fb08c1 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Fri, 20 Jul 2018 08:06:31 +0800 Subject: [PATCH 42/60] x86/pti: Check the return value of pti_user_pagetable_walk_p4d() pti_user_pagetable_walk_p4d() can return NULL, so the return value should be checked to prevent a NULL pointer dereference. Add the check and a warning when the P4D allocation fails. Signed-off-by: Jiang Biao Signed-off-by: Thomas Gleixner Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: hpa@zytor.com Cc: albcamus@gmail.com Cc: zhong.weidong@zte.com.cn Link: https://lkml.kernel.org/r/1532045192-49622-1-git-send-email-jiang.biao2@zte.com.cn --- arch/x86/mm/pti.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 7b1c85759005..001ee6b0619e 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) if (pgd_none(*pgd)) { unsigned long new_p4d_page = __get_free_page(gfp); - if (!new_p4d_page) + if (WARN_ON_ONCE(!new_p4d_page)) return NULL; set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); @@ -195,9 +195,13 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - p4d_t *p4d = pti_user_pagetable_walk_p4d(address); + p4d_t *p4d; pud_t *pud; + p4d = pti_user_pagetable_walk_p4d(address); + if (!p4d) + return NULL; + BUILD_BUG_ON(p4d_large(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); @@ -359,6 +363,9 @@ static void __init pti_clone_p4d(unsigned long addr) pgd_t *kernel_pgd; user_p4d = pti_user_pagetable_walk_p4d(addr); + if (!user_p4d) + return; + kernel_pgd = pgd_offset_k(addr); kernel_p4d = p4d_offset(kernel_pgd, addr); *user_p4d = *kernel_p4d; From 8c934e01a7ce685d98e970880f5941d79272c654 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Fri, 20 Jul 2018 08:06:32 +0800 Subject: [PATCH 43/60] x86/pti: Check the return value of pti_user_pagetable_walk_pmd() pti_user_pagetable_walk_pmd() can return NULL, so the return value should be checked to prevent a NULL pointer dereference. Add the check and a warning when the PMD allocation fails. 
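[ Note: illustrative sketch, not part of the patch; 'gfp', 'address' and 'pmd' are assumed to be in scope as in the PTI page-table walkers. WARN_ON_ONCE() evaluates to the condition it tests, so the allocation-failure path warns exactly once and bails out in a single statement, and callers then only need a plain NULL check: ]

	/* Inside the walker: warn once and report the failure to the caller. */
	unsigned long new_pmd_page = __get_free_page(gfp);

	if (WARN_ON_ONCE(!new_pmd_page))
		return NULL;

	/* In a caller such as pti_user_pagetable_walk_pte(): just test for NULL. */
	pmd = pti_user_pagetable_walk_pmd(address);
	if (!pmd)
		return NULL;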
Signed-off-by: Jiang Biao Signed-off-by: Thomas Gleixner Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: hpa@zytor.com Cc: albcamus@gmail.com Cc: zhong.weidong@zte.com.cn Link: https://lkml.kernel.org/r/1532045192-49622-2-git-send-email-jiang.biao2@zte.com.cn --- arch/x86/mm/pti.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 001ee6b0619e..bcf35dac1920 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -205,7 +205,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) BUILD_BUG_ON(p4d_large(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); - if (!new_pud_page) + if (WARN_ON_ONCE(!new_pud_page)) return NULL; set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); @@ -219,7 +219,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) } if (pud_none(*pud)) { unsigned long new_pmd_page = __get_free_page(gfp); - if (!new_pmd_page) + if (WARN_ON_ONCE(!new_pmd_page)) return NULL; set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); @@ -241,9 +241,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - pmd_t *pmd = pti_user_pagetable_walk_pmd(address); + pmd_t *pmd; pte_t *pte; + pmd = pti_user_pagetable_walk_pmd(address); + if (!pmd) + return NULL; + /* We can't do anything sensible if we hit a large mapping. */ if (pmd_large(*pmd)) { WARN_ON(1); From 77754cfa09a6c528c38cbca9ee4cc4f7cf6ad6f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Jul 2018 18:22:22 +0200 Subject: [PATCH 44/60] perf/core: Make sure the ring-buffer is mapped in all page-tables The ring-buffer is accessed in the NMI handler, so it's better to avoid faulting on it. Sync the vmalloc range with all page-tables in system to make sure everyone has it mapped. This fixes a WARN_ON_ONCE() that can be triggered with PTI enabled on x86-32: WARNING: CPU: 4 PID: 0 at arch/x86/mm/fault.c:320 vmalloc_fault+0x220/0x230 This triggers because with PTI enabled on an PAE kernel the PMDs are no longer shared between the page-tables, so the vmalloc changes do not propagate automatically. Note: Andy said rightfully that we should try to fix the vmalloc code for that case, but that's not a hot fix for the issue at hand. Fixes: 7757d607c6b3 ("x86/pti: Allow CONFIG_PAGE_TABLE_ISOLATION for x86_32") Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: "David H . 
Gutteridge" Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1532103744-31902-2-git-send-email-joro@8bytes.org --- kernel/events/ring_buffer.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5d3cf407e374..df2d8cf0072c 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -814,6 +814,13 @@ static void rb_free_work(struct work_struct *work) vfree(base); kfree(rb); + + /* + * FIXME: PAE workaround for vmalloc_fault(): Make sure buffer is + * unmapped in all page-tables. + */ + if (IS_ENABLED(CONFIG_X86_PAE)) + vmalloc_sync_all(); } void rb_free(struct ring_buffer *rb) @@ -840,6 +847,15 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) if (!all_buf) goto fail_all_buf; + /* + * FIXME: PAE workaround for vmalloc_fault(): The buffer is + * accessed in NMI handlers, make sure it is mapped in all + * page-tables in the system so that we don't fault on the range in + * an NMI handler. + */ + if (IS_ENABLED(CONFIG_X86_PAE)) + vmalloc_sync_all(); + rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; if (nr_pages) { From d5e84c21dbf5ea458897f88346dc979909eed913 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Jul 2018 18:22:23 +0200 Subject: [PATCH 45/60] x86/entry/32: Check for VM86 mode in slow-path check The SWITCH_TO_KERNEL_STACK macro only checks for CPL == 0 to go down the slow and paranoid entry path. The problem is that this check also returns true when coming from VM86 mode. This is not a problem by itself, as the paranoid path handles VM86 stack-frames just fine, but it is not necessary as the normal code path handles VM86 mode as well (and faster). Extend the check to include VM86 mode. This also makes an optimization of the paranoid path possible. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: "David H . 
Gutteridge" Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1532103744-31902-3-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 010cdb41e3c7..2767c625a52c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -414,8 +414,16 @@ andl $(0x0000ffff), PT_CS(%esp) /* Special case - entry from kernel mode via entry stack */ - testl $SEGMENT_RPL_MASK, PT_CS(%esp) - jz .Lentry_from_kernel_\@ +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS + movb PT_CS(%esp), %cl + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx +#else + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx +#endif + cmpl $USER_RPL, %ecx + jb .Lentry_from_kernel_\@ /* Bytes to copy */ movl $PTREGS_SIZE, %ecx From 6863ea0cda8725072522cd78bda332d9a0b73150 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Jul 2018 17:48:01 +0200 Subject: [PATCH 46/60] x86/mm: Remove in_nmi() warning from vmalloc_fault() It is perfectly okay to take page-faults, especially on the vmalloc area while executing an NMI handler. Remove the warning. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: David H. Gutteridge Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1532533683-5988-2-git-send-email-joro@8bytes.org --- arch/x86/mm/fault.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2aafa6ab6103..db1c042e9853 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -317,8 +317,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Synchronize this task's top level page-table * with the 'reference' page table. From 0e664eee65337082be49fbbd2ee24aa0d111d0f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Jul 2018 17:48:02 +0200 Subject: [PATCH 47/60] Revert "perf/core: Make sure the ring-buffer is mapped in all page-tables" This reverts commit 77754cfa09a6c528c38cbca9ee4cc4f7cf6ad6f2. The patch was necessary to silence a WARN_ON_ONCE(in_nmi()) that triggered in the vmalloc_fault() function when PTI was enabled on x86-32. Faulting in an NMI handler turned out to be safe and the warning in vmalloc_fault() is gone now. So the above patch can be reverted. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: David H. Gutteridge Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1532533683-5988-3-git-send-email-joro@8bytes.org --- kernel/events/ring_buffer.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index df2d8cf0072c..5d3cf407e374 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -814,13 +814,6 @@ static void rb_free_work(struct work_struct *work) vfree(base); kfree(rb); - - /* - * FIXME: PAE workaround for vmalloc_fault(): Make sure buffer is - * unmapped in all page-tables. - */ - if (IS_ENABLED(CONFIG_X86_PAE)) - vmalloc_sync_all(); } void rb_free(struct ring_buffer *rb) @@ -847,15 +840,6 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) if (!all_buf) goto fail_all_buf; - /* - * FIXME: PAE workaround for vmalloc_fault(): The buffer is - * accessed in NMI handlers, make sure it is mapped in all - * page-tables in the system so that we don't fault on the range in - * an NMI handler. - */ - if (IS_ENABLED(CONFIG_X86_PAE)) - vmalloc_sync_all(); - rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; if (nr_pages) { From ca38dc8f2724d101038b1205122c93a1c7f38f11 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Jul 2018 17:48:03 +0200 Subject: [PATCH 48/60] x86/kexec: Allocate 8k PGDs for PTI Fuzzing the PTI-x86-32 code with trinity showed unhandled kernel paging request oops-messages that looked a lot like silent data corruption. Lot's of debugging and testing lead to the kexec-32bit code, which is still allocating 4k PGDs when PTI is enabled. But since it uses native_set_pud() to build the page-table, it will unevitably call into __pti_set_user_pgtbl(), which writes beyond the allocated 4k page. Use PGD_ALLOCATION_ORDER to allocate PGDs in the kexec code to fix the issue. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: David H. Gutteridge Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1532533683-5988-4-git-send-email-joro@8bytes.org --- arch/x86/kernel/machine_kexec_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d1ab07ec8c9a..5409c2800ab5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -56,7 +56,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_page((unsigned long)image->arch.pgd); + free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -72,7 +72,8 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { - image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + PGD_ALLOCATION_ORDER); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); From fdf82a7856b32d905c39afc85e34364491e46346 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 26 Jul 2018 13:14:55 +0200 Subject: [PATCH 49/60] x86/speculation: Protect against userspace-userspace spectreRSB The article "Spectre Returns! Speculation Attacks using the Return Stack Buffer" [1] describes two new (sub-)variants of spectrev2-like attacks, making use solely of the RSB contents even on CPUs that don't fallback to BTB on RSB underflow (Skylake+). Mitigate userspace-userspace attacks by always unconditionally filling RSB on context switch when the generic spectrev2 mitigation has been enabled. 
[1] https://arxiv.org/pdf/1807.07940.pdf Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Borislav Petkov Cc: David Woodhouse Cc: Peter Zijlstra Cc: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/nycvar.YFH.7.76.1807261308190.997@cbobk.fhfr.pm --- arch/x86/kernel/cpu/bugs.c | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5c0ea39311fe..bc8c43b22460 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -313,23 +313,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } -/* Check for Skylake-like CPUs (for RSB handling) */ -static bool __init is_skylake_era(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - return true; - } - } - return false; -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -390,22 +373,15 @@ retpoline_auto: pr_info("%s\n", spectre_v2_strings[mode]); /* - * If neither SMEP nor PTI are available, there is a risk of - * hitting userspace addresses in the RSB after a context switch - * from a shallow call stack to a deeper one. To prevent this fill - * the entire RSB, even when using IBRS. + * If spectre v2 protection has been enabled, unconditionally fill + * RSB during a context switch; this protects against two independent + * issues: * - * Skylake era CPUs have a separate issue with *underflow* of the - * RSB, when they will predict 'ret' targets from the generic BTB. - * The proper mitigation for this is IBRS. If IBRS is not supported - * or deactivated in favour of retpolines the RSB fill on context - * switch is required. + * - RSB underflow (and switch to BTB) on Skylake+ + * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs */ - if ((!boot_cpu_has(X86_FEATURE_PTI) && - !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); - } + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); /* Initialize Indirect Branch Prediction Barrier if supported */ if (boot_cpu_has(X86_FEATURE_IBPB)) { From 706d51681d636a0c4a5ef53395ec3b803e45ed4d Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Wed, 1 Aug 2018 11:42:25 -0700 Subject: [PATCH 50/60] x86/speculation: Support Enhanced IBRS on future CPUs Future Intel processors will support "Enhanced IBRS" which is an "always on" mode i.e. IBRS bit in SPEC_CTRL MSR is enabled once and never disabled. From the specification [1]: "With enhanced IBRS, the predicted targets of indirect branches executed cannot be controlled by software that was executed in a less privileged predictor mode or on another logical processor. As a result, software operating on a processor with enhanced IBRS need not use WRMSR to set IA32_SPEC_CTRL.IBRS after every transition to a more privileged predictor mode. Software can isolate predictor modes effectively simply by setting the bit once. 
Software need not disable enhanced IBRS prior to entering a sleep state such as MWAIT or HLT." If Enhanced IBRS is supported by the processor then use it as the preferred spectre v2 mitigation mechanism instead of Retpoline. Intel's Retpoline white paper [2] states: "Retpoline is known to be an effective branch target injection (Spectre variant 2) mitigation on Intel processors belonging to family 6 (enumerated by the CPUID instruction) that do not have support for enhanced IBRS. On processors that support enhanced IBRS, it should be used for mitigation instead of retpoline." The reason why Enhanced IBRS is the recommended mitigation on processors which support it is that these processors also support CET which provides a defense against ROP attacks. Retpoline is very similar to ROP techniques and might trigger false positives in the CET defense. If Enhanced IBRS is selected as the mitigation technique for spectre v2, the IBRS bit in SPEC_CTRL MSR is set once at boot time and never cleared. Kernel also has to make sure that IBRS bit remains set after VMEXIT because the guest might have cleared the bit. This is already covered by the existing x86_spec_ctrl_set_guest() and x86_spec_ctrl_restore_host() speculation control functions. Enhanced IBRS still requires IBPB for full mitigation. [1] Speculative-Execution-Side-Channel-Mitigations.pdf [2] Retpoline-A-Branch-Target-Injection-Mitigation.pdf Both documents are available at: https://bugzilla.kernel.org/show_bug.cgi?id=199511 Originally-by: David Woodhouse Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Thomas Gleixner Cc: Tim C Chen Cc: Dave Hansen Cc: Ravi Shankar Link: https://lkml.kernel.org/r/1533148945-24095-1-git-send-email-sai.praneeth.prakhya@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/nospec-branch.h | 1 + arch/x86/kernel/cpu/bugs.c | 20 ++++++++++++++++++-- arch/x86/kernel/cpu/common.c | 3 +++ 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5701f5cecd31..2687cd8e8d58 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,6 +219,7 @@ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+29) /* Enhanced IBRS */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c99082e2ef13..fd2a8c1b88bc 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -214,6 +214,7 @@ enum spectre_v2_mitigation { SPECTRE_V2_RETPOLINE_MINIMAL_AMD, SPECTRE_V2_RETPOLINE_GENERIC, SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS_ENHANCED, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bc8c43b22460..405a9a61bb89 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -130,6 +130,7 @@ static const char *spectre_v2_strings[] = { [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced 
IBRS", }; #undef pr_fmt @@ -332,6 +333,13 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + mode = SPECTRE_V2_IBRS_ENHANCED; + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + goto specv2_set_mode; + } if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; @@ -369,6 +377,7 @@ retpoline_auto: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } +specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -391,9 +400,16 @@ retpoline_auto: /* * Retpoline means the kernel is safe because it has no indirect - * branches. But firmware isn't, so use IBRS to protect that. + * branches. Enhanced IBRS protects firmware too, so, enable restricted + * speculation around firmware calls only when Enhanced IBRS isn't + * supported. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS)) { + if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 43a927eb9c09..df28e931d732 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1005,6 +1005,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + if (ia32_cap & ARCH_CAP_IBRS_ALL) + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (x86_match_cpu(cpu_no_meltdown)) return; From eac7073aa69aa1cac819aa712146284f53f642b1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Aug 2018 15:58:25 -0700 Subject: [PATCH 51/60] x86/mm/pti: Clear Global bit more aggressively The kernel image starts out with the Global bit set across the entire kernel image. The bit is cleared with set_memory_nonglobal() in the configurations with PCIDs where the performance benefits of the Global bit are not needed. However, this is fragile. It means that we are stuck opting *out* of the less-secure (Global bit set) configuration, which seems backwards. Let's start more secure (Global bit clear) and then let things opt back in if they want performance, or are truly mapping common data between kernel and userspace. This fixes a bug. Before this patch, there are areas that are unmapped from the user page tables (like like everything above 0xffffffff82600000 in the example below). These have the hallmark of being a wrong Global area: they are not identical in the 'current_kernel' and 'current_user' page table dumps. They are also read-write, which means they're much more likely to contain secrets. 
Before this patch: current_kernel:---[ High Kernel Mapping ]--- current_kernel-0xffffffff80000000-0xffffffff81000000 16M pmd current_kernel-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_kernel-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_kernel-0xffffffff81e11000-0xffffffff82000000 1980K RW GLB NX pte current_kernel-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd current_kernel-0xffffffff82600000-0xffffffff82c00000 6M RW PSE GLB NX pmd current_kernel-0xffffffff82c00000-0xffffffff82e00000 2M RW GLB NX pte current_kernel-0xffffffff82e00000-0xffffffff83200000 4M RW PSE GLB NX pmd current_kernel-0xffffffff83200000-0xffffffffa0000000 462M pmd current_user:---[ High Kernel Mapping ]--- current_user-0xffffffff80000000-0xffffffff81000000 16M pmd current_user-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_user-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_user-0xffffffff81e11000-0xffffffff82000000 1980K RW GLB NX pte current_user-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd current_user-0xffffffff82600000-0xffffffffa0000000 474M pmd After this patch: current_kernel:---[ High Kernel Mapping ]--- current_kernel-0xffffffff80000000-0xffffffff81000000 16M pmd current_kernel-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_kernel-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_kernel-0xffffffff81e11000-0xffffffff82000000 1980K RW NX pte current_kernel-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd current_kernel-0xffffffff82600000-0xffffffff82c00000 6M RW PSE NX pmd current_kernel-0xffffffff82c00000-0xffffffff82e00000 2M RW NX pte current_kernel-0xffffffff82e00000-0xffffffff83200000 4M RW PSE NX pmd current_kernel-0xffffffff83200000-0xffffffffa0000000 462M pmd current_user:---[ High Kernel Mapping ]--- current_user-0xffffffff80000000-0xffffffff81000000 16M pmd current_user-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_user-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_user-0xffffffff81e11000-0xffffffff82000000 1980K RW NX pte current_user-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd current_user-0xffffffff82600000-0xffffffffa0000000 474M pmd Fixes: 0f561fce4d69 ("x86/pti: Enable global pages for shared areas") Reported-by: Hugh Dickins Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: keescook@google.com Cc: aarcange@redhat.com Cc: jgross@suse.com Cc: jpoimboe@redhat.com Cc: gregkh@linuxfoundation.org Cc: peterz@infradead.org Cc: torvalds@linux-foundation.org Cc: bp@alien8.de Cc: luto@kernel.org Cc: ak@linux.intel.com Cc: Kees Cook Cc: Andrea Arcangeli Cc: Juergen Gross Cc: Josh Poimboeuf Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andi Kleen Link: https://lkml.kernel.org/r/20180802225825.A100C071@viggo.jf.intel.com --- arch/x86/mm/pageattr.c | 6 ++++++ arch/x86/mm/pti.c | 34 ++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3bded76e8d5c..c04153796f61 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1784,6 +1784,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages) __pgprot(_PAGE_GLOBAL), 0); } +int set_memory_global(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool 
enc) { struct cpa_data cpa; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 4d418e705878..8d88d067b3d7 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -434,6 +434,13 @@ static inline bool pti_kernel_image_global_ok(void) return true; } +/* + * This is the only user for these and it is not arch-generic + * like the other set_memory.h functions. Just extern them. + */ +extern int set_memory_nonglobal(unsigned long addr, int numpages); +extern int set_memory_global(unsigned long addr, int numpages); + /* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. @@ -446,7 +453,8 @@ void pti_clone_kernel_text(void) * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); - unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long end_clone = (unsigned long)__end_rodata_hpage_align; + unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table); if (!pti_kernel_image_global_ok()) return; @@ -458,14 +466,18 @@ void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pmds(start, end, _PAGE_RW); + pti_clone_pmds(start, end_clone, _PAGE_RW); + + /* + * pti_clone_pmds() will set the global bit in any PMDs + * that it clones, but we also need to get any PTEs in + * the last level for areas that are not huge-page-aligned. + */ + + /* Set the global bit for normal non-__init kernel text: */ + set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } -/* - * This is the only user for it and it is not arch-generic like - * the other set_memory.h functions. Just extern it. - */ -extern int set_memory_nonglobal(unsigned long addr, int numpages); void pti_set_kernel_image_nonglobal(void) { /* @@ -477,9 +489,11 @@ void pti_set_kernel_image_nonglobal(void) unsigned long start = PFN_ALIGN(_text); unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); - if (pti_kernel_image_global_ok()) - return; - + /* + * This clears _PAGE_GLOBAL from the entire kernel image. + * pti_clone_kernel_text() map put _PAGE_GLOBAL back for + * areas that are mapped to userspace. + */ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } From 0d83432811f26871295a9bc24d3c387924da6071 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Aug 2018 15:58:26 -0700 Subject: [PATCH 52/60] mm: Allow non-direct-map arguments to free_reserved_area() free_reserved_area() takes pointers as arguments to show which addresses should be freed. However, it does this in a somewhat ambiguous way. If it gets a kernel direct map address, it always works. However, if it gets an address that is part of the kernel image alias mapping, it can fail. It fails if all of the following happen: * The specified address is part of the kernel image alias * Poisoning is requested (forcing a memset()) * The address is in a read-only portion of the kernel image The memset() fails on the read-only mapping, of course. free_reserved_area() *is* called both on the direct map and on kernel image alias addresses. We've just lucked out thus far that the kernel image alias areas it gets used on are read-write. I'm fairly sure this has been just a happy accident. It is quite easy to make free_reserved_area() work for all cases: just convert the address to a direct map address before doing the memset(), and do this unconditionally. 
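[ Note: illustrative sketch, not part of the patch; 'pos' and 'poison' are assumed as in free_reserved_area(). On x86, virt_to_page() handles kernel-image alias addresses as well as direct-map ones, and page_address() always yields the direct-map alias, which stays writable even when the image alias is read-only: ]

	struct page *page = virt_to_page(pos);		/* pos may be an image-alias address */
	void *direct_map_addr = page_address(page);	/* writable direct-map alias of the same page */

	memset(direct_map_addr, poison, PAGE_SIZE);	/* safe even for read-only image areas */
	free_reserved_page(page);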
There is little chance of a regression here because we previously did a virt_to_page() on the address for the memset, so we know these are not highmem pages for which virt_to_page() would fail. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: keescook@google.com Cc: aarcange@redhat.com Cc: jgross@suse.com Cc: jpoimboe@redhat.com Cc: gregkh@linuxfoundation.org Cc: peterz@infradead.org Cc: hughd@google.com Cc: torvalds@linux-foundation.org Cc: bp@alien8.de Cc: luto@kernel.org Cc: ak@linux.intel.com Cc: Kees Cook Cc: Andrea Arcangeli Cc: Juergen Gross Cc: Josh Poimboeuf Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Linus Torvalds Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andi Kleen Link: https://lkml.kernel.org/r/20180802225826.1287AE3E@viggo.jf.intel.com --- mm/page_alloc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a790ef4be74e..3222193c46c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6939,9 +6939,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + struct page *page = virt_to_page(pos); + void *direct_map_addr; + + /* + * 'direct_map_addr' might be different from 'pos' + * because some architectures' virt_to_page() + * work with aliases. Getting the direct map + * address ensures that we get a _writeable_ + * alias for the memset(). + */ + direct_map_addr = page_address(page); if ((unsigned int)poison <= 0xFF) - memset(pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page(pos)); + memset(direct_map_addr, poison, PAGE_SIZE); + + free_reserved_page(page); } if (pages && s) From 9f515cdb411ef34f1aaf4c40bb0c932cf6db5de1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Aug 2018 15:58:28 -0700 Subject: [PATCH 53/60] x86/mm/init: Pass unconverted symbol addresses to free_init_pages() The x86 code has several places where it frees parts of kernel image: 1. Unused SMP alternative 2. __init code 3. The hole between text and rodata 4. The hole between rodata and data We call free_init_pages() to do this. Strangely, we convert the symbol addresses to kernel direct map addresses in some cases (#3, #4) but not others (#1, #2). The virt_to_page() and the other code in free_reserved_area() now works fine for for symbol addresses on x86, so don't bother converting the addresses to direct map addresses before freeing them. 
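[ Note: illustrative sketch, not part of the patch, using the mark_rodata_ro() call sites touched below. __pa_symbol() translates a kernel-image symbol address to its physical address and __va() maps that back through the direct map; with free_reserved_area() handling image aliases itself, the round trip can be dropped: ]

	/* Before: hand free_init_pages() a direct-map alias of the symbol addresses. */
	free_init_pages("unused kernel",
			(unsigned long)__va(__pa_symbol(text_end)),
			(unsigned long)__va(__pa_symbol(rodata_start)));

	/* After: pass the kernel-image addresses directly. */
	free_init_pages("unused kernel", text_end, rodata_start);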
Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: keescook@google.com Cc: aarcange@redhat.com Cc: jgross@suse.com Cc: jpoimboe@redhat.com Cc: gregkh@linuxfoundation.org Cc: peterz@infradead.org Cc: hughd@google.com Cc: torvalds@linux-foundation.org Cc: bp@alien8.de Cc: luto@kernel.org Cc: ak@linux.intel.com Cc: Kees Cook Cc: Andrea Arcangeli Cc: Juergen Gross Cc: Josh Poimboeuf Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Linus Torvalds Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andi Kleen Link: https://lkml.kernel.org/r/20180802225828.89B2D0E2@viggo.jf.intel.com --- arch/x86/mm/init_64.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a688617c727e..73158655c4a7 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1283,12 +1283,8 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_init_pages("unused kernel", - (unsigned long) __va(__pa_symbol(text_end)), - (unsigned long) __va(__pa_symbol(rodata_start))); - free_init_pages("unused kernel", - (unsigned long) __va(__pa_symbol(rodata_end)), - (unsigned long) __va(__pa_symbol(_sdata))); + free_init_pages("unused kernel", text_end, rodata_start); + free_init_pages("unused kernel", rodata_end, _sdata); debug_checkwx(); From 6ea2738e0ca0e626c75202fb051c1e88d7a950fa Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Aug 2018 15:58:29 -0700 Subject: [PATCH 54/60] x86/mm/init: Add helper for freeing kernel image pages When chunks of the kernel image are freed, free_init_pages() is used directly. Consolidate the three sites that do this. Also update the string to give an incrementally better description of that memory versus what was there before. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: keescook@google.com Cc: aarcange@redhat.com Cc: jgross@suse.com Cc: jpoimboe@redhat.com Cc: gregkh@linuxfoundation.org Cc: peterz@infradead.org Cc: hughd@google.com Cc: torvalds@linux-foundation.org Cc: bp@alien8.de Cc: luto@kernel.org Cc: ak@linux.intel.com Cc: Kees Cook Cc: Andrea Arcangeli Cc: Juergen Gross Cc: Josh Poimboeuf Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Linus Torvalds Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andi Kleen Link: https://lkml.kernel.org/r/20180802225829.FE0E32EA@viggo.jf.intel.com --- arch/x86/include/asm/processor.h | 1 + arch/x86/mm/init.c | 15 ++++++++++++--- arch/x86/mm/init_64.c | 4 ++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cfd29ee8c3da..59663c08c949 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -966,6 +966,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) extern unsigned long arch_align_stack(unsigned long sp); extern void free_init_pages(char *what, unsigned long begin, unsigned long end); +extern void free_kernel_image_pages(void *begin, void *end); void default_idle(void); #ifdef CONFIG_XEN diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cee58a972cb2..bc11dedffc45 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -773,13 +773,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } +/* + * begin/end can be in the direct map or the "high kernel mapping" + * used for the kernel image only. free_init_pages() will do the + * right thing for either kind of address. 
+ */ +void free_kernel_image_pages(void *begin, void *end) +{ + free_init_pages("unused kernel image", + (unsigned long)begin, (unsigned long)end); +} + void __ref free_initmem(void) { e820__reallocate_tables(); - free_init_pages("unused kernel", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + free_kernel_image_pages(&__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 73158655c4a7..68c292cb1ebf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1283,8 +1283,8 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_init_pages("unused kernel", text_end, rodata_start); - free_init_pages("unused kernel", rodata_end, _sdata); + free_kernel_image_pages((void *)text_end, (void *)rodata_start); + free_kernel_image_pages((void *)rodata_end, (void *)_sdata); debug_checkwx(); From c40a56a7818cfe735fc93a69e1875f8bba834483 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Aug 2018 15:58:31 -0700 Subject: [PATCH 55/60] x86/mm/init: Remove freed kernel image areas from alias mapping The kernel image is mapped into two places in the virtual address space (addresses without KASLR, of course): 1. The kernel direct map (0xffff880000000000) 2. The "high kernel map" (0xffffffff81000000) We actually execute out of #2. If we get the address of a kernel symbol, it points to #2, but almost all physical-to-virtual translations point to Parts of the "high kernel map" alias are mapped in the userspace page tables with the Global bit for performance reasons. The parts that we map to userspace do not (er, should not) have secrets. When PTI is enabled then the global bit is usually not set in the high mapping and just used to compensate for poor performance on systems which lack PCID. This is fine, except that some areas in the kernel image that are adjacent to the non-secret-containing areas are unused holes. We free these holes back into the normal page allocator and reuse them as normal kernel memory. The memory will, of course, get *used* via the normal map, but the alias mapping is kept. This otherwise unused alias mapping of the holes will, by default keep the Global bit, be mapped out to userspace, and be vulnerable to Meltdown. Remove the alias mapping of these pages entirely. This is likely to fracture the 2M page mapping the kernel image near these areas, but this should affect a minority of the area. The pageattr code changes *all* aliases mapping the physical pages that it operates on (by default). We only want to modify a single alias, so we need to tweak its behavior. This unmapping behavior is currently dependent on PTI being in place. Going forward, we should at least consider doing this for all configurations. Having an extra read-write alias for memory is not exactly ideal for debugging things like random memory corruption and this does undercut features like DEBUG_PAGEALLOC or future work like eXclusive Page Frame Ownership (XPFO). 
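[ Note: illustrative sketch, not part of the patch, ignoring KASLR. A physical page of the kernel image is reachable through both virtual aliases; only the high-kernel-map alias is unmapped here, while the direct-map alias stays so the freed pages remain usable. The dumps below show the resulting holes in the high kernel mapping: ]

	unsigned long phys = __pa_symbol(_text);	/* physical load address of the image */
	void *image_alias  = _text;			/* high kernel map, ~0xffffffff81000000 */
	void *direct_alias = __va(phys);		/* direct map, 0xffff880000000000 + phys */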
Before this patch: current_kernel:---[ High Kernel Mapping ]--- current_kernel-0xffffffff80000000-0xffffffff81000000 16M pmd current_kernel-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_kernel-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_kernel-0xffffffff81e11000-0xffffffff82000000 1980K RW NX pte current_kernel-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd current_kernel-0xffffffff82600000-0xffffffff82c00000 6M RW PSE NX pmd current_kernel-0xffffffff82c00000-0xffffffff82e00000 2M RW NX pte current_kernel-0xffffffff82e00000-0xffffffff83200000 4M RW PSE NX pmd current_kernel-0xffffffff83200000-0xffffffffa0000000 462M pmd current_user:---[ High Kernel Mapping ]--- current_user-0xffffffff80000000-0xffffffff81000000 16M pmd current_user-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_user-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_user-0xffffffff81e11000-0xffffffff82000000 1980K RW NX pte current_user-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd current_user-0xffffffff82600000-0xffffffffa0000000 474M pmd After this patch: current_kernel:---[ High Kernel Mapping ]--- current_kernel-0xffffffff80000000-0xffffffff81000000 16M pmd current_kernel-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_kernel-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_kernel-0xffffffff81e11000-0xffffffff82000000 1980K pte current_kernel-0xffffffff82000000-0xffffffff82400000 4M ro PSE GLB NX pmd current_kernel-0xffffffff82400000-0xffffffff82488000 544K ro NX pte current_kernel-0xffffffff82488000-0xffffffff82600000 1504K pte current_kernel-0xffffffff82600000-0xffffffff82c00000 6M RW PSE NX pmd current_kernel-0xffffffff82c00000-0xffffffff82c0d000 52K RW NX pte current_kernel-0xffffffff82c0d000-0xffffffff82dc0000 1740K pte current_user:---[ High Kernel Mapping ]--- current_user-0xffffffff80000000-0xffffffff81000000 16M pmd current_user-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd current_user-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte current_user-0xffffffff81e11000-0xffffffff82000000 1980K pte current_user-0xffffffff82000000-0xffffffff82400000 4M ro PSE GLB NX pmd current_user-0xffffffff82400000-0xffffffff82488000 544K ro NX pte current_user-0xffffffff82488000-0xffffffff82600000 1504K pte current_user-0xffffffff82600000-0xffffffffa0000000 474M pmd [ tglx: Do not unmap on 32bit as there is only one mapping ] Fixes: 0f561fce4d69 ("x86/pti: Enable global pages for shared areas") Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Kees Cook Cc: Andrea Arcangeli Cc: Juergen Gross Cc: Josh Poimboeuf Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Linus Torvalds Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andi Kleen Cc: Joerg Roedel Link: https://lkml.kernel.org/r/20180802225831.5F6A2BFC@viggo.jf.intel.com --- arch/x86/include/asm/set_memory.h | 1 + arch/x86/mm/init.c | 26 ++++++++++++++++++++++++-- arch/x86/mm/pageattr.c | 13 +++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index bd090367236c..34cffcef7375 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); +int 
set_memory_np_noalias(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bc11dedffc45..74b157ac078d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -780,8 +780,30 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ void free_kernel_image_pages(void *begin, void *end) { - free_init_pages("unused kernel image", - (unsigned long)begin, (unsigned long)end); + unsigned long begin_ul = (unsigned long)begin; + unsigned long end_ul = (unsigned long)end; + unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; + + + free_init_pages("unused kernel image", begin_ul, end_ul); + + /* + * PTI maps some of the kernel into userspace. For performance, + * this includes some kernel areas that do not contain secrets. + * Those areas might be adjacent to the parts of the kernel image + * being freed, which may contain secrets. Remove the "high kernel + * image mapping" for these freed areas, ensuring they are not even + * potentially vulnerable to Meltdown regardless of the specific + * optimizations PTI is currently using. + * + * The "noalias" prevents unmapping the direct map alias which is + * needed to access the freed pages. + * + * This is only valid for 64bit kernels. 32bit has only one mapping + * which can't be treated in this way for obvious reasons. + */ + if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) + set_memory_np_noalias(begin_ul, len_pages); } void __ref free_initmem(void) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c04153796f61..0a74996a1149 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 +#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + /* Has caller explicitly disabled alias checking? */ + if (in_flag & CPA_NO_CHECK_ALIAS) + checkalias = 0; ret = __change_page_attr_set_clr(&cpa, checkalias); @@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages) return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } +int set_memory_np_noalias(unsigned long addr, int numpages) +{ + int cpa_flags = CPA_NO_CHECK_ALIAS; + + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(_PAGE_PRESENT), 0, + cpa_flags, NULL); +} + int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), From 88c6f8a3977cc35997b47e2f99f080a15559c1eb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 7 Aug 2018 12:24:29 +0200 Subject: [PATCH 56/60] x86/mm/pti: Fix 32 bit PCID check The check uses the wrong operator and causes false positive warnings in the kernel log on some systems. Fixes: 5e8105950a8b3 ('x86/mm/pti: Add Warning when booting on a PCID capable CPU') Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1533637471-30953-2-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index ef8db6ffc836..113ba14a03d8 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -549,7 +549,7 @@ void __init pti_init(void) * supported on 32 bit anyway. To print the warning we need to * check with cpuid directly again. */ - if (cpuid_ecx(0x1) && BIT(17)) { + if (cpuid_ecx(0x1) & BIT(17)) { /* Use printk to work around pr_fmt() */ printk(KERN_WARNING "\n"); printk(KERN_WARNING "************************************************************\n"); From 30514effc9206d4e084ec32239ae221db157d43a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 7 Aug 2018 12:24:30 +0200 Subject: [PATCH 57/60] x86/mm/pti: Don't clear permissions in pti_clone_pmd() The function sets the global-bit on cloned PMD entries, which only makes sense when the permissions are identical between the user and the kernel page-table. Further, only write-permissions are cleared for entry-text and kernel-text sections, which are not writeable at the end of the boot process. The reason why this RW clearing exists is that in the early PTI implementations the cloned kernel areas were set up during early boot before the kernel text is set to read only and not touched afterwards. This is not longer true. The cloned areas are still set up early to get the entry code working for interrupts and other things, but after the kernel text has been set RO the clone is repeated which copies the RO PMD/PTEs over to the user visible clone. That means the initial clearing of the writable bit can be avoided. [ tglx: Amended changelog ] Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1533637471-30953-3-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 113ba14a03d8..5164c987b1f1 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -291,7 +291,7 @@ static void __init pti_setup_vsyscall(void) { } #endif static void -pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +pti_clone_pmds(unsigned long start, unsigned long end) { unsigned long addr; @@ -352,7 +352,7 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) * tables will share the last-level page tables of this * address range */ - *target_pmd = pmd_clear_flags(*pmd, clear); + *target_pmd = *pmd; } } @@ -398,7 +398,7 @@ static void __init pti_clone_user_shared(void) start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); - pti_clone_pmds(start, end, 0); + pti_clone_pmds(start, end); } #endif /* CONFIG_X86_64 */ @@ -418,8 +418,7 @@ static void __init pti_setup_espfix64(void) static void pti_clone_entry_text(void) { pti_clone_pmds((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end, - _PAGE_RW); + (unsigned long) __irqentry_text_end); } /* @@ -501,7 +500,7 @@ static void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pmds(start, end_clone, _PAGE_RW); + pti_clone_pmds(start, end_clone); /* * pti_clone_pmds() will set the global bit in any PMDs From 16a3fe634f6a568c6234b8747e5d50487fed3526 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 7 Aug 2018 12:24:31 +0200 Subject: [PATCH 58/60] x86/mm/pti: Clone kernel-image on PTE level for 32 bit On 32 bit the kernel sections are not huge-page aligned. When we clone them on PMD-level we unevitably map some areas that are normal kernel memory and may contain secrets to user-space. To prevent that we need to clone the kernel-image on PTE-level for 32 bit. Also make the page-table cloning code more general so that it can handle PMD and PTE level cloning. This can be generalized further in the future to also handle clones on the P4D-level. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1533637471-30953-4-git-send-email-joro@8bytes.org --- arch/x86/mm/pti.c | 134 +++++++++++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 38 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 5164c987b1f1..1dc5c683e7a5 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -54,6 +54,16 @@ #define __GFP_NOTRACK 0 #endif +/* + * Define the page-table levels we clone for user-space on 32 + * and 64 bit. 
+ */ +#ifdef CONFIG_X86_64 +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD +#else +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE +#endif + static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) @@ -228,7 +238,6 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) return pmd_offset(pud, address); } -#ifdef CONFIG_X86_VSYSCALL_EMULATION /* * Walk the shadow copy of the page tables (optionally) trying to allocate * page table pages on the way down. Does not support large pages. @@ -270,6 +279,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) return pte; } +#ifdef CONFIG_X86_VSYSCALL_EMULATION static void __init pti_setup_vsyscall(void) { pte_t *pte, *target_pte; @@ -290,8 +300,14 @@ static void __init pti_setup_vsyscall(void) static void __init pti_setup_vsyscall(void) { } #endif +enum pti_clone_level { + PTI_CLONE_PMD, + PTI_CLONE_PTE, +}; + static void -pti_clone_pmds(unsigned long start, unsigned long end) +pti_clone_pgtable(unsigned long start, unsigned long end, + enum pti_clone_level level) { unsigned long addr; @@ -299,7 +315,8 @@ pti_clone_pmds(unsigned long start, unsigned long end) * Clone the populated PMDs which cover start to end. These PMD areas * can have holes. */ - for (addr = start; addr < end; addr += PMD_SIZE) { + for (addr = start; addr < end;) { + pte_t *pte, *target_pte; pmd_t *pmd, *target_pmd; pgd_t *pgd; p4d_t *p4d; @@ -315,44 +332,84 @@ pti_clone_pmds(unsigned long start, unsigned long end) p4d = p4d_offset(pgd, addr); if (WARN_ON(p4d_none(*p4d))) return; + pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + if (pud_none(*pud)) { + addr += PUD_SIZE; continue; + } + pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (pmd_none(*pmd)) { + addr += PMD_SIZE; continue; + } - target_pmd = pti_user_pagetable_walk_pmd(addr); - if (WARN_ON(!target_pmd)) - return; + if (pmd_large(*pmd) || level == PTI_CLONE_PMD) { + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; - /* - * Only clone present PMDs. This ensures only setting - * _PAGE_GLOBAL on present PMDs. This should only be - * called on well-known addresses anyway, so a non- - * present PMD would be a surprise. - */ - if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) - return; + /* + * Only clone present PMDs. This ensures only setting + * _PAGE_GLOBAL on present PMDs. This should only be + * called on well-known addresses anyway, so a non- + * present PMD would be a surprise. + */ + if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) + return; - /* - * Setting 'target_pmd' below creates a mapping in both - * the user and kernel page tables. It is effectively - * global, so set it as global in both copies. Note: - * the X86_FEATURE_PGE check is not _required_ because - * the CPU ignores _PAGE_GLOBAL when PGE is not - * supported. The check keeps consistentency with - * code that only set this bit when supported. - */ - if (boot_cpu_has(X86_FEATURE_PGE)) - *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); + /* + * Setting 'target_pmd' below creates a mapping in both + * the user and kernel page tables. It is effectively + * global, so set it as global in both copies. Note: + * the X86_FEATURE_PGE check is not _required_ because + * the CPU ignores _PAGE_GLOBAL when PGE is not + * supported. The check keeps consistentency with + * code that only set this bit when supported. + */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); - /* - * Copy the PMD. 
That is, the kernelmode and usermode - * tables will share the last-level page tables of this - * address range - */ - *target_pmd = *pmd; + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = *pmd; + + addr += PMD_SIZE; + + } else if (level == PTI_CLONE_PTE) { + + /* Walk the page-table down to the pte level */ + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + addr += PAGE_SIZE; + continue; + } + + /* Only clone present PTEs */ + if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) + return; + + /* Allocate PTE in the user page-table */ + target_pte = pti_user_pagetable_walk_pte(addr); + if (WARN_ON(!target_pte)) + return; + + /* Set GLOBAL bit in both PTEs */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pte = pte_set_flags(*pte, _PAGE_GLOBAL); + + /* Clone the PTE */ + *target_pte = *pte; + + addr += PAGE_SIZE; + + } else { + BUG(); + } } } @@ -398,7 +455,7 @@ static void __init pti_clone_user_shared(void) start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); - pti_clone_pmds(start, end); + pti_clone_pgtable(start, end, PTI_CLONE_PMD); } #endif /* CONFIG_X86_64 */ @@ -417,8 +474,9 @@ static void __init pti_setup_espfix64(void) */ static void pti_clone_entry_text(void) { - pti_clone_pmds((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end); + pti_clone_pgtable((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, + PTI_CLONE_PMD); } /* @@ -500,10 +558,10 @@ static void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pmds(start, end_clone); + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); /* - * pti_clone_pmds() will set the global bit in any PMDs + * pti_clone_pgtable() will set the global bit in any PMDs * that it clones, but we also need to get any PTEs in * the last level for areas that are not huge-page-aligned. */ From a29dba161ad1a01bbfbc80aa184b089ddd169a4e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Aug 2018 11:44:49 +0200 Subject: [PATCH 59/60] x86/relocs: Add __end_rodata_aligned to S_REL This new symbol needs to be in the workaround-list for buggy binutils, otherwise the build with gcc-4.6 fails. Fixes: 39d668e04eda ('x86/mm/pti: Make pti_clone_kernel_text() compile on 32 bit') Reported-by: Stephen Rothwell Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Sedat Dilek Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Linux-Next Mailing List Link: https://lkml.kernel.org/r/20180809094449.ddmnrkz7qkvo3j2x@suse.de --- arch/x86/tools/relocs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 220e97841e49..3a6c8ebc8032 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -67,6 +67,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__tracedata_(start|end)|" "__(start|stop)_notes|" "__end_rodata|" + "__end_rodata_aligned|" "__initramfs_start|" "(jiffies|jiffies_64)|" #if ELF_BITS == 64 From d878efce73fe86db34ddb2013260adf571a701a7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 8 Aug 2018 13:16:40 +0200 Subject: [PATCH 60/60] x86/mm/pti: Move user W+X check into pti_finalize() The user page-table gets the updated kernel mappings in pti_finalize(), which runs after the RO+X permissions got applied to the kernel page-table in mark_readonly(). 
But with CONFIG_DEBUG_WX enabled, the user page-table is already checked in mark_readonly() for insecure mappings. This causes false-positive warnings, because the user page-table did not get the updated mappings yet. Move the W+X check for the user page-table into pti_finalize() after it updated all required mappings. [ tglx: Folded !NX supported fix ] Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1533727000-9172-1-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable.h | 7 +++++-- arch/x86/mm/dump_pagetables.c | 6 +++--- arch/x86/mm/pti.c | 2 ++ 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e39088cb59ab..a1cb3339da8d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -30,11 +30,14 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); void ptdump_walk_pgd_level_checkwx(void); +void ptdump_walk_user_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) +#define debug_checkwx() do { } while (0) +#define debug_checkwx_user() do { } while (0) #endif /* diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ccd92c4da57c..a12afff146d1 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -569,12 +569,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) } EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); -static void ptdump_walk_user_pgd_level_checkwx(void) +void ptdump_walk_user_pgd_level_checkwx(void) { #ifdef CONFIG_PAGE_TABLE_ISOLATION pgd_t *pgd = INIT_PGD; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!(__supported_pte_mask & _PAGE_NX) || + !static_cpu_has(X86_FEATURE_PTI)) return; pr_info("x86/mm: Checking user space page tables\n"); @@ -586,7 +587,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void) void ptdump_walk_pgd_level_checkwx(void) { ptdump_walk_pgd_level_core(NULL, NULL, true, false); - ptdump_walk_user_pgd_level_checkwx(); } static int __init pt_dump_init(void) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 1dc5c683e7a5..d58b4aba9510 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -646,4 +646,6 @@ void pti_finalize(void) */ pti_clone_entry_text(); pti_clone_kernel_text(); + + debug_checkwx_user(); }
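[ Illustrative sketch, not part of any patch above: the PCID-check fix in patch 56 replaces '&&' with '&'. A logical AND treats BIT(17) as a boolean that is always true, so the old test fired whenever CPUID.01H:ECX was non-zero; a bitwise AND tests only bit 17, the PCID feature bit. The helper name cpu_reports_pcid() and the sample ECX value below are made up for illustration. ]

#include <stdbool.h>
#include <stdio.h>

#define BIT(n)	(1UL << (n))

static bool cpu_reports_pcid(unsigned long ecx)
{
	/* buggy form: (ecx && BIT(17)) is true for any non-zero ecx */
	return ecx & BIT(17);	/* correct: isolate bit 17 (PCID) */
}

int main(void)
{
	unsigned long ecx = BIT(0);	/* some feature bit set, but not PCID */

	printf("PCID reported: %d\n", cpu_reports_pcid(ecx));	/* prints 0 */
	return 0;
}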