diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c371bfee137a..2767c625a52c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -65,7 +65,7 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all +# define resume_kernel restore_all_kernel #endif .macro TRACE_IRQS_IRET @@ -77,6 +77,8 @@ #endif .endm +#define PTI_SWITCH_MASK (1 << PAGE_SHIFT) + /* * User gs save/restore * @@ -154,7 +156,52 @@ #endif /* CONFIG_X86_32_LAZY_GS */ -.macro SAVE_ALL pt_regs_ax=%eax +/* Unconditionally switch to user cr3 */ +.macro SWITCH_TO_USER_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + movl %cr3, \scratch_reg + orl $PTI_SWITCH_MASK, \scratch_reg + movl \scratch_reg, %cr3 +.Lend_\@: +.endm + +.macro BUG_IF_WRONG_CR3 no_user_check=0 +#ifdef CONFIG_DEBUG_ENTRY + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + .if \no_user_check == 0 + /* coming from usermode? */ + testl $SEGMENT_RPL_MASK, PT_CS(%esp) + jz .Lend_\@ + .endif + /* On user-cr3? */ + movl %cr3, %eax + testl $PTI_SWITCH_MASK, %eax + jnz .Lend_\@ + /* From userspace with kernel cr3 - BUG */ + ud2 +.Lend_\@: +#endif +.endm + +/* + * Switch to kernel cr3 if not already loaded and return current cr3 in + * \scratch_reg + */ +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + movl %cr3, \scratch_reg + /* Test if we are already on kernel CR3 */ + testl $PTI_SWITCH_MASK, \scratch_reg + jz .Lend_\@ + andl $(~PTI_SWITCH_MASK), \scratch_reg + movl \scratch_reg, %cr3 + /* Return original CR3 in \scratch_reg */ + orl $PTI_SWITCH_MASK, \scratch_reg +.Lend_\@: +.endm + +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 cld PUSH_GS pushl %fs @@ -173,6 +220,29 @@ movl $(__KERNEL_PERCPU), %edx movl %edx, %fs SET_KERNEL_GS %edx + + /* Switch to kernel stack if necessary */ +.if \switch_stacks > 0 + SWITCH_TO_KERNEL_STACK +.endif + +.endm + +.macro SAVE_ALL_NMI cr3_reg:req + SAVE_ALL + + BUG_IF_WRONG_CR3 + + /* + * Now switch the CR3 when PTI is enabled. + * + * We can enter with either user or kernel cr3, the code will + * store the old cr3 in \cr3_reg and switches to the kernel cr3 + * if necessary. + */ + SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg + +.Lend_\@: .endm /* @@ -221,6 +291,349 @@ POP_GS_EX .endm +.macro RESTORE_ALL_NMI cr3_reg:req pop=0 + /* + * Now switch the CR3 when PTI is enabled. + * + * We enter with kernel cr3 and switch the cr3 to the value + * stored on \cr3_reg, which is either a user or a kernel cr3. + */ + ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI + + testl $PTI_SWITCH_MASK, \cr3_reg + jz .Lswitched_\@ + + /* User cr3 in \cr3_reg - write it to hardware cr3 */ + movl \cr3_reg, %cr3 + +.Lswitched_\@: + + BUG_IF_WRONG_CR3 + + RESTORE_REGS pop=\pop +.endm + +.macro CHECK_AND_APPLY_ESPFIX +#ifdef CONFIG_X86_ESPFIX32 +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + + ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX + + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. 
+ */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + jne .Lend_\@ # returning to user-space with LDT SS + + /* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the IRET: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + lss (%esp), %esp /* switch to espfix segment */ +.Lend_\@: +#endif /* CONFIG_X86_ESPFIX32 */ +.endm + +/* + * Called with pt_regs fully populated and kernel segments loaded, + * so we can access PER_CPU and use the integer registers. + * + * We need to be very careful here with the %esp switch, because an NMI + * can happen everywhere. If the NMI handler finds itself on the + * entry-stack, it will overwrite the task-stack and everything we + * copied there. So allocate the stack-frame on the task-stack and + * switch to it before we do any copying. + */ + +#define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) + +.macro SWITCH_TO_KERNEL_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + BUG_IF_WRONG_CR3 + + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + + /* + * %eax now contains the entry cr3 and we carry it forward in + * that register for the time this macro runs + */ + + /* Are we on the entry stack? Bail out if not! */ + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %esp, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx + jae .Lend_\@ + + /* Load stack pointer into %esi and %edi */ + movl %esp, %esi + movl %esi, %edi + + /* Move %edi to the top of the entry stack */ + andl $(MASK_entry_stack), %edi + addl $(SIZEOF_entry_stack), %edi + + /* Load top of task-stack into %edi */ + movl TSS_entry2task_stack(%edi), %edi + + /* + * Clear unused upper bits of the dword containing the word-sized CS + * slot in pt_regs in case hardware didn't clear it for us. 
+ */ + andl $(0x0000ffff), PT_CS(%esp) + + /* Special case - entry from kernel mode via entry stack */ +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS + movb PT_CS(%esp), %cl + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx +#else + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx +#endif + cmpl $USER_RPL, %ecx + jb .Lentry_from_kernel_\@ + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esi) + jz .Lcopy_pt_regs_\@ + + /* + * Stack-frame contains 4 additional segment registers when + * coming from VM86 mode + */ + addl $(4 * 4), %ecx + +#endif +.Lcopy_pt_regs_\@: + + /* Allocate frame on task-stack */ + subl %ecx, %edi + + /* Switch to task-stack */ + movl %edi, %esp + + /* + * We are now on the task-stack and can safely copy over the + * stack-frame + */ + shrl $2, %ecx + cld + rep movsl + + jmp .Lend_\@ + +.Lentry_from_kernel_\@: + + /* + * This handles the case when we enter the kernel from + * kernel-mode and %esp points to the entry-stack. When this + * happens we need to switch to the task-stack to run C code, + * but switch back to the entry-stack again when we approach + * iret and return to the interrupted code-path. This usually + * happens when we hit an exception while restoring user-space + * segment registers on the way back to user-space or when the + * sysenter handler runs with eflags.tf set. + * + * When we switch to the task-stack here, we can't trust the + * contents of the entry-stack anymore, as the exception handler + * might be scheduled out or moved to another CPU. Therefore we + * copy the complete entry-stack to the task-stack and set a + * marker in the iret-frame (bit 31 of the CS dword) to detect + * what we've done on the iret path. + * + * On the iret path we copy everything back and switch to the + * entry-stack, so that the interrupted kernel code-path + * continues on the same stack it was interrupted with. + * + * Be aware that an NMI can happen anytime in this code. + * + * %esi: Entry-Stack pointer (same as %esp) + * %edi: Top of the task stack + * %eax: CR3 on kernel entry + */ + + /* Calculate number of bytes on the entry stack in %ecx */ + movl %esi, %ecx + + /* %ecx to the top of entry-stack */ + andl $(MASK_entry_stack), %ecx + addl $(SIZEOF_entry_stack), %ecx + + /* Number of bytes on the entry stack to %ecx */ + sub %esi, %ecx + + /* Mark stackframe as coming from entry stack */ + orl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + + /* + * Test the cr3 used to enter the kernel and add a marker + * so that we can switch back to it before iret. + */ + testl $PTI_SWITCH_MASK, %eax + jz .Lcopy_pt_regs_\@ + orl $CS_FROM_USER_CR3, PT_CS(%esp) + + /* + * %esi and %edi are unchanged, %ecx contains the number of + * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate + * the stack-frame on task-stack and copy everything over + */ + jmp .Lcopy_pt_regs_\@ + +.Lend_\@: +.endm + +/* + * Switch back from the kernel stack to the entry stack. + * + * The %esp register must point to pt_regs on the task stack. It will + * first calculate the size of the stack-frame to copy, depending on + * whether we return to VM86 mode or not. With that it uses 'rep movsl' + * to copy the contents of the stack over to the entry stack. + * + * We must be very careful here, as we can't trust the contents of the + * task-stack once we switched to the entry-stack. 
When an NMI happens + * while on the entry-stack, the NMI handler will switch back to the top + * of the task stack, overwriting our stack-frame we are about to copy. + * Therefore we switch the stack only after everything is copied over. + */ +.macro SWITCH_TO_ENTRY_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp) + jz .Lcopy_pt_regs_\@ + + /* Additional 4 registers to copy when returning to VM86 mode */ + addl $(4 * 4), %ecx + +.Lcopy_pt_regs_\@: +#endif + + /* Initialize source and destination for movsl */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + subl %ecx, %edi + movl %esp, %esi + + /* Save future stack pointer in %ebx */ + movl %edi, %ebx + + /* Copy over the stack-frame */ + shrl $2, %ecx + cld + rep movsl + + /* + * Switch to entry-stack - needs to happen after everything is + * copied because the NMI handler will overwrite the task-stack + * when on entry-stack + */ + movl %ebx, %esp + +.Lend_\@: +.endm + +/* + * This macro handles the case when we return to kernel-mode on the iret + * path and have to switch back to the entry stack and/or user-cr3 + * + * See the comments below the .Lentry_from_kernel_\@ label in the + * SWITCH_TO_KERNEL_STACK macro for more details. + */ +.macro PARANOID_EXIT_TO_KERNEL_MODE + + /* + * Test if we entered the kernel with the entry-stack. Most + * likely we did not, because this code only runs on the + * return-to-kernel path. + */ + testl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + jz .Lend_\@ + + /* Unlikely slow-path */ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp) + + /* Copy the remaining task-stack contents to entry-stack */ + movl %esp, %esi + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + + /* Bytes on the task-stack to ecx */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx + subl %esi, %ecx + + /* Allocate stack-frame on entry-stack */ + subl %ecx, %edi + + /* + * Save future stack-pointer, we must not switch until the + * copy is done, otherwise the NMI handler could destroy the + * contents of the task-stack we are about to copy. + */ + movl %edi, %ebx + + /* Do the copy */ + shrl $2, %ecx + cld + rep movsl + + /* Safe to switch to entry-stack now */ + movl %ebx, %esp + + /* + * We came from entry-stack and need to check if we also need to + * switch back to user cr3. + */ + testl $CS_FROM_USER_CR3, PT_CS(%esp) + jz .Lend_\@ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_USER_CR3), PT_CS(%esp) + + SWITCH_TO_USER_CR3 scratch_reg=%eax + +.Lend_\@: +.endm /* * %eax: prev task * %edx: next task @@ -351,9 +764,9 @@ ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) .Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all + jnz restore_all_kernel testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_all_kernel call preempt_schedule_irq jmp .Lneed_resched END(resume_kernel) @@ -412,7 +825,21 @@ ENTRY(xen_sysenter_target) * 0(%ebp) arg6 */ ENTRY(entry_SYSENTER_32) - movl TSS_sysenter_sp0(%esp), %esp + /* + * On entry-stack with all userspace-regs live - save and + * restore eflags and %eax to use it as scratch-reg for the cr3 + * switch. 
+ */ + pushfl + pushl %eax + BUG_IF_WRONG_CR3 no_user_check=1 + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + popl %eax + popfl + + /* Stack empty again, switch to task stack */ + movl TSS_entry2task_stack(%esp), %esp + .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ @@ -421,7 +848,7 @@ ENTRY(entry_SYSENTER_32) pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */ /* * SYSENTER doesn't filter flags, so we need to clear NT, AC @@ -460,25 +887,49 @@ ENTRY(entry_SYSENTER_32) /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ + + /* + * Setup entry stack - we keep the pointer in %eax and do the + * switch after almost all user-state is restored. + */ + + /* Load entry stack pointer and allocate frame for eflags/eax */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax + subl $(2*4), %eax + + /* Copy eflags and eax to entry stack */ + movl PT_EFLAGS(%esp), %edi + movl PT_EAX(%esp), %esi + movl %edi, (%eax) + movl %esi, 4(%eax) + + /* Restore user registers and segments */ movl PT_EIP(%esp), %edx /* pt_regs->ip */ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ 1: mov PT_FS(%esp), %fs PTGS_TO_GS + popl %ebx /* pt_regs->bx */ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ popl %esi /* pt_regs->si */ popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ - popl %eax /* pt_regs->ax */ + + /* Switch to entry stack */ + movl %eax, %esp + + /* Now ready to switch the cr3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax /* * Restore all flags except IF. (We restore IF separately because * STI gives a one-instruction window in which we won't be interrupted, * whereas POPF does not.) */ - addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ btrl $X86_EFLAGS_IF_BIT, (%esp) + BUG_IF_WRONG_CR3 no_user_check=1 popfl + popl %eax /* * Return back to the vDSO, which will pop ecx and edx. @@ -532,7 +983,8 @@ ENDPROC(entry_SYSENTER_32) ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + + SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */ /* * User mode is traced as though IRQs are on, and the interrupt gate @@ -546,24 +998,17 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET + SWITCH_TO_ENTRY_STACK .Lrestore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX - - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - /* - * Warning: PT_OLDSS(%esp) contains the wrong/random values if we - * are returning to the kernel. - * See comments in process.c:copy_thread() for details. 
- */ - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je .Lldt_ss # returning to user-space with LDT SS -#endif + CHECK_AND_APPLY_ESPFIX .Lrestore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code + /* Switch back to user CR3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax + + BUG_IF_WRONG_CR3 + + /* Restore user state */ + RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -572,46 +1017,33 @@ restore_all: */ INTERRUPT_RETURN +restore_all_kernel: + TRACE_IRQS_IRET + PARANOID_EXIT_TO_KERNEL_MODE + BUG_IF_WRONG_CR3 + RESTORE_REGS 4 + jmp .Lirq_return + .section .fixup, "ax" ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error + +#ifdef CONFIG_DEBUG_ENTRY + /* + * The stack-frame here is the one that iret faulted on, so its a + * return-to-user frame. We are on kernel-cr3 because we come here from + * the fixup code. This confuses the CR3 checker, so switch to user-cr3 + * as the checker expects it. + */ + pushl %eax + SWITCH_TO_USER_CR3 scratch_reg=%eax + popl %eax +#endif + jmp common_exception .previous _ASM_EXTABLE(.Lirq_return, iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 -.Lldt_ss: -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. 
- */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ - /* - * Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the IRET: - */ - DISABLE_INTERRUPTS(CLBR_ANY) - lss (%esp), %esp /* switch to espfix segment */ - jmp .Lrestore_nocheck -#endif ENDPROC(entry_INT80_32) .macro FIXUP_ESPFIX_STACK @@ -671,7 +1103,8 @@ END(irq_entries_start) common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ - SAVE_ALL + + SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax @@ -679,16 +1112,16 @@ common_interrupt: jmp ret_from_intr ENDPROC(common_interrupt) -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL; \ - ENCODE_FRAME_POINTER; \ - TRACE_IRQS_OFF \ - movl %esp, %eax; \ - call fn; \ - jmp ret_from_intr; \ +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL switch_stacks=1; \ + ENCODE_FRAME_POINTER; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ ENDPROC(name) #define BUILD_INTERRUPT(name, nr) \ @@ -920,16 +1353,20 @@ common_exception: pushl %es pushl %ds pushl %eax + movl $(__USER_DS), %eax + movl %eax, %ds + movl %eax, %es + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs pushl %ebp pushl %edi pushl %esi pushl %edx pushl %ecx pushl %ebx + SWITCH_TO_KERNEL_STACK ENCODE_FRAME_POINTER cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs UNWIND_ESPFIX_STACK GS_TO_REG %ecx movl PT_GS(%esp), %edi # get the function address @@ -937,9 +1374,6 @@ common_exception: movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi @@ -948,40 +1382,12 @@ END(common_exception) ENTRY(debug) /* - * #DB can happen at the first instruction of - * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this - * happens, then we will be running on a very small stack. We - * need to detect this condition and switch to the thread - * stack before calling any C code at all. - * - * If you edit this code, keep in mind that NMIs can happen in here. + * Entry from sysenter is now handled in common_exception */ ASM_CLAC pushl $-1 # mark this as an int - SAVE_ALL - ENCODE_FRAME_POINTER - xorl %edx, %edx # error code 0 - movl %esp, %eax # pt_regs pointer - - /* Are we currently on the SYSENTER stack? */ - movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx - subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ - cmpl $SIZEOF_entry_stack, %ecx - jb .Ldebug_from_sysenter_stack - - TRACE_IRQS_OFF - call do_debug - jmp ret_from_exception - -.Ldebug_from_sysenter_stack: - /* We're on the SYSENTER stack. Switch off. 
*/ - movl %esp, %ebx - movl PER_CPU_VAR(cpu_current_top_of_stack), %esp - TRACE_IRQS_OFF - call do_debug - movl %ebx, %esp - jmp ret_from_exception + pushl $do_debug + jmp common_exception END(debug) /* @@ -993,6 +1399,7 @@ END(debug) */ ENTRY(nmi) ASM_CLAC + #ifdef CONFIG_X86_ESPFIX32 pushl %eax movl %ss, %eax @@ -1002,7 +1409,7 @@ ENTRY(nmi) #endif pushl %eax # pt_regs->orig_ax - SAVE_ALL + SAVE_ALL_NMI cr3_reg=%edi ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1016,7 +1423,7 @@ ENTRY(nmi) /* Not on SYSENTER stack. */ call do_nmi - jmp .Lrestore_all_notrace + jmp .Lnmi_return .Lnmi_from_sysenter_stack: /* @@ -1027,7 +1434,11 @@ ENTRY(nmi) movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi movl %ebx, %esp - jmp .Lrestore_all_notrace + +.Lnmi_return: + CHECK_AND_APPLY_ESPFIX + RESTORE_ALL_NMI cr3_reg=%edi pop=4 + jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 .Lnmi_espfix_stack: @@ -1042,12 +1453,12 @@ ENTRY(nmi) pushl 16(%esp) .endr pushl %eax - SAVE_ALL + SAVE_ALL_NMI cr3_reg=%edi ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi - RESTORE_REGS + RESTORE_ALL_NMI cr3_reg=%edi lss 12+4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif @@ -1056,7 +1467,8 @@ END(nmi) ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int - SAVE_ALL + + SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7fff98fa5855..b5c60faf8429 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,6 +219,7 @@ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+29) /* Enhanced IBRS */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index bbc796eb0a3b..eeeb9289c764 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -71,12 +71,7 @@ struct ldt_struct { static inline void *ldt_slot_va(int slot) { -#ifdef CONFIG_X86_64 return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); -#else - BUG(); - return (void *)fix_to_virt(FIX_HOLE); -#endif } /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f6f6c63da62f..fd2a8c1b88bc 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -214,7 +214,7 @@ enum spectre_v2_mitigation { SPECTRE_V2_RETPOLINE_MINIMAL_AMD, SPECTRE_V2_RETPOLINE_GENERIC, SPECTRE_V2_RETPOLINE_AMD, - SPECTRE_V2_IBRS, + SPECTRE_V2_IBRS_ENHANCED, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 685ffe8a0eaf..c399ea5eea41 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -19,6 +19,9 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd); +#endif *pmdp = pmd; } @@ -58,6 +61,9 @@ static inline pte_t 
native_ptep_get_and_clear(pte_t *xp) #ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0)); +#endif return __pmd(xchg((pmdval_t *)xp, 0)); } #else @@ -67,6 +73,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #ifdef CONFIG_SMP static inline pud_t native_pudp_get_and_clear(pud_t *xp) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0)); +#endif return __pud(xchg((pudval_t *)xp, 0)); } #else diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index f982ef808e7e..6deb6cd236e3 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -35,4 +35,7 @@ typedef union { #define PTRS_PER_PTE 1024 +/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */ +#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT) + #endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f24df59c40b2..f2ca3139ca22 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -98,6 +98,9 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void native_set_pud(pud_t *pudp, pud_t pud) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); +#endif set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); } @@ -229,6 +232,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) { union split_pud res, *orig = (union split_pud *)pudp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0)); +#endif + /* xchg acts as a barrier before setting of the high bits */ res.pud_low = xchg(&orig->pud_low, 0); res.pud_high = orig->pud_high; diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 6a59a6d0cc50..858358a82b14 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -21,9 +21,10 @@ typedef union { #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_PARAVIRT -#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) +#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \ + (pv_info.shared_kernel_pmd))) #else -#define SHARED_KERNEL_PMD 1 +#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif /* @@ -45,5 +46,6 @@ typedef union { #define PTRS_PER_PTE 512 #define MAX_POSSIBLE_PHYSMEM_BITS 36 +#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT) #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5715647fc4fe..a1cb3339da8d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -30,11 +30,14 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); void ptdump_walk_pgd_level_checkwx(void); +void ptdump_walk_user_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) +#define debug_checkwx() do { } while (0) +#define debug_checkwx_user() do { } while (0) #endif 
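The PGD_KERNEL_START definitions added above, together with the pgdp_maps_userspace() helper introduced in the pgtable.h hunk just below, encode where the user/kernel split falls inside a page-aligned top-level table. The following is a stand-alone, user-space sketch of that index arithmetic - it is not kernel code, and it assumes the default 32-bit !PAE layout (CONFIG_PAGE_OFFSET == 0xC0000000, PGDIR_SHIFT == 22, 1024 four-byte entries per PGD page):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT              12
#define PAGE_MASK               (~((1UL << PAGE_SHIFT) - 1))
#define PGDIR_SHIFT             22                      /* 2-level, !PAE */
#define CONFIG_PAGE_OFFSET      0xC0000000UL            /* default VMSPLIT_3G */
#define PGD_KERNEL_START        (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)     /* == 768 */

typedef struct { uint32_t pgd; } pgd_t;

/* Same test as the kernel helper: index of the entry within its PGD page */
static int pgdp_maps_userspace(void *__ptr)
{
        unsigned long ptr = (unsigned long)__ptr;

        return ((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START;
}

int main(void)
{
        static pgd_t pgd_page[1024] __attribute__((aligned(4096)));

        printf("%d %d %d\n",
               pgdp_maps_userspace(&pgd_page[0]),       /* user half         */
               pgdp_maps_userspace(&pgd_page[767]),     /* last user slot    */
               pgdp_maps_userspace(&pgd_page[768]));    /* first kernel slot */
        return 0;
}

With a 3G/1G split this prints "1 1 0": entries 0-767 map user space and entries from 768 up map the kernel, which is exactly the boundary PGD_KERNEL_START names and the test pti_set_user_pgtbl() relies on when deciding which entries belong to the user half of the tables.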
/* @@ -640,8 +643,31 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgtbl(pgdp, pgd); +} +#else /* CONFIG_PAGE_TABLE_ISOLATION */ +static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + #endif /* __ASSEMBLY__ */ + #ifdef CONFIG_X86_32 # include #else @@ -1154,6 +1180,70 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } } #endif +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. + */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); +} + +static inline int pgd_large(pgd_t pgd) { return 0; } + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). 
+ */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr |= BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr &= ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 88a056b01db4..b3ec519e3982 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -34,8 +34,6 @@ static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); -static inline int pgd_large(pgd_t pgd) { return 0; } - /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index d9a001a4a872..b0bc0fff5f1f 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -50,13 +50,18 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ & PMD_MASK) -#define PKMAP_BASE \ +#define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) +#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE) + +#define PKMAP_BASE \ + ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK) + #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) #else -# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) +# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE) #endif #define MODULES_VADDR VMALLOC_START diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 3c5385f9a88f..acb6970e7bcf 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -132,90 +132,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } -#ifdef CONFIG_PAGE_TABLE_ISOLATION -/* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages - * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and - * the user one is in the last 4k. To switch between them, you - * just need to flip the 12th bit in their addresses. - */ -#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT - -/* - * This generates better code than the inline assembly in - * __set_bit(). 
- */ -static inline void *ptr_set_bit(void *ptr, int bit) -{ - unsigned long __ptr = (unsigned long)ptr; - - __ptr |= BIT(bit); - return (void *)__ptr; -} -static inline void *ptr_clear_bit(void *ptr, int bit) -{ - unsigned long __ptr = (unsigned long)ptr; - - __ptr &= ~BIT(bit); - return (void *)__ptr; -} - -static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) -{ - return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) -{ - return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) -{ - return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -} - -static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) -{ - return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -} -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ - -/* - * Page table pages are page-aligned. The lower half of the top - * level is used for userspace and the top half for the kernel. - * - * Returns true for parts of the PGD that map userspace and - * false for the parts that map the kernel. - */ -static inline bool pgdp_maps_userspace(void *__ptr) -{ - unsigned long ptr = (unsigned long)__ptr; - - return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); -} - -#ifdef CONFIG_PAGE_TABLE_ISOLATION -pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); - -/* - * Take a PGD location (pgdp) and a pgd value that needs to be set there. - * Populates the user and returns the resulting PGD that must be set in - * the kernel copy of the page tables. - */ -static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) -{ - if (!static_cpu_has(X86_FEATURE_PTI)) - return pgd; - return __pti_set_user_pgd(pgdp, pgd); -} -#else -static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) -{ - return pgd; -} -#endif - static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; @@ -226,7 +142,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) } pgd = native_make_pgd(native_p4d_val(p4d)); - pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); + pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); *p4dp = native_make_p4d(native_pgd_val(pgd)); } @@ -237,7 +153,7 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pti_set_user_pgd(pgdp, pgd); + *pgdp = pti_set_user_pgtbl(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) @@ -255,7 +171,6 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); /* * Level 4 access. */ -static inline int pgd_large(pgd_t pgd) { return 0; } #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) /* PUD - Level3 access */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 054765ab2da2..04edd2d58211 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -115,6 +115,7 @@ extern unsigned int ptrs_per_p4d; #define LDT_PGD_ENTRY_L5 -112UL #define LDT_PGD_ENTRY (pgtable_l5_enabled() ? 
LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) +#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE) #define __VMALLOC_BASE_L4 0xffffc90000000000UL #define __VMALLOC_BASE_L5 0xffa0000000000000UL @@ -153,4 +154,6 @@ extern unsigned int ptrs_per_p4d; #define EARLY_DYNAMIC_PAGE_TABLES 64 +#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 99fff853c944..b64acb08a62b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -50,6 +50,7 @@ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) +#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) @@ -266,14 +267,37 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +#ifdef CONFIG_X86_PAE + +/* + * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't + * use it here. + */ + +#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK) +#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK) + +/* + * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries. + * All other bits are Reserved MBZ + */ +#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \ + _PAGE_PWT | _PAGE_PCD | \ + _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3) + +#else +/* No need to mask any bits for !PAE */ +#define PGD_ALLOWED_BITS (~0ULL) +#endif + static inline pgd_t native_make_pgd(pgdval_t val) { - return (pgd_t) { val }; + return (pgd_t) { val & PGD_ALLOWED_BITS }; } static inline pgdval_t native_pgd_val(pgd_t pgd) { - return pgd.pgd; + return pgd.pgd & PGD_ALLOWED_BITS; } static inline pgdval_t pgd_flags(pgd_t pgd) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 625a52a5594f..02c2cbda4a74 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -39,10 +39,6 @@ #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) -#ifdef CONFIG_PAGE_TABLE_ISOLATION -# define X86_CR3_PTI_PCID_USER_BIT 11 -#endif - #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save @@ -53,4 +49,8 @@ #define CR3_NOFLUSH 0 #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_PCID_USER_BIT 11 +#endif + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cfd29ee8c3da..59663c08c949 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -966,6 +966,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) extern unsigned long arch_align_stack(unsigned long sp); extern void free_init_pages(char *what, unsigned long begin, unsigned long end); +extern void free_kernel_image_pages(void *begin, void *end); void default_idle(void); #ifdef CONFIG_XEN diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 38a17f1d5c9d..5df09a0b80b8 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -6,10 +6,9 @@ #ifdef CONFIG_PAGE_TABLE_ISOLATION extern void pti_init(void); 
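The PGD_ALLOWED_BITS change to pgtable_types.h above matters because, with PTI on PAE, flags such as NX that are Reserved-MBZ in a PAE top-level entry can end up in the value handed to native_make_pgd(); the mask strips them before they can reach hardware. Below is a small host-side sketch of that filtering - not the kernel code itself, and it uses an arbitrary 36-bit physical mask where the kernel derives the width from __PHYSICAL_MASK_SHIFT:

#include <stdio.h>
#include <inttypes.h>

typedef uint64_t pgdval_t;
typedef struct { pgdval_t pgd; } pgd_t;

/* Architectural flag bits, as in pgtable_types.h */
#define _PAGE_PRESENT   (1ULL << 0)
#define _PAGE_RW        (1ULL << 1)
#define _PAGE_PWT       (1ULL << 3)
#define _PAGE_PCD       (1ULL << 4)
#define _PAGE_SOFTW1    (1ULL << 9)
#define _PAGE_SOFTW2    (1ULL << 10)
#define _PAGE_SOFTW3    (1ULL << 11)
#define _PAGE_NX        (1ULL << 63)

/* 36 address bits chosen only for this example */
#define PGD_PAE_PAGE_MASK       (~0xfffULL)
#define PGD_PAE_PHYS_MASK       (((1ULL << 36) - 1) & PGD_PAE_PAGE_MASK)

#define PGD_ALLOWED_BITS        (PGD_PAE_PHYS_MASK | _PAGE_PRESENT |    \
                                 _PAGE_PWT | _PAGE_PCD |                \
                                 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

static pgd_t native_make_pgd(pgdval_t val)
{
        return (pgd_t) { val & PGD_ALLOWED_BITS };
}

int main(void)
{
        /* hypothetical PMD page at 0x1000000, caller asked for P|RW|NX */
        pgdval_t raw = 0x1000000ULL | _PAGE_PRESENT | _PAGE_RW | _PAGE_NX;
        pgd_t pgd = native_make_pgd(raw);

        /* RW and NX are not in PGD_ALLOWED_BITS and are dropped */
        printf("raw 0x%016" PRIx64 "  ->  pgd 0x%016" PRIx64 "\n",
               raw, pgd.pgd);
        return 0;
}

In the patch the same mask is also applied on the read side in native_pgd_val(), so software-only bits stored in a PAE top-level slot never leak back out of the accessor.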
extern void pti_check_boottime_disable(void); -extern void pti_clone_kernel_text(void); +extern void pti_finalize(void); #else static inline void pti_check_boottime_disable(void) { } -static inline void pti_clone_kernel_text(void) { } #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 5c019d23d06b..4a911a382ade 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,6 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; +extern char __end_rodata_aligned[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index bd090367236c..34cffcef7375 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); +int set_memory_np_noalias(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index eb5f7999a893..36bd243843d6 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -87,15 +87,25 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) #endif /* This is used when switching tasks or entering/exiting vm86 mode. */ -static inline void update_sp0(struct task_struct *task) +static inline void update_task_stack(struct task_struct *task) { - /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ + /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 - load_sp0(task->thread.sp0); + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task->thread.sp0); + else + this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else + /* + * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That + * doesn't work on x86-32 because sp1 and + * cpu_current_top_of_stack have different values (because of + * the non-zero stack-padding on 32bit). 
+ */ if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif + } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index dcb008c320fe..01de31db300d 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -103,4 +103,9 @@ void common(void) { OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); + + /* Offset for sp0 and sp1 into the tss_struct */ + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a4a3be399f4b..82826f2275cc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -46,8 +46,14 @@ void foo(void) OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); - /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + /* + * Offset from the entry stack to task stack stored in TSS. Kernel entry + * happens on the per-cpu entry-stack, and the asm code switches to the + * task-stack pointer stored in x86_tss.sp1, which is a copy of + * task->thread.sp0 where entry code can find it. + */ + DEFINE(TSS_entry2task_stack, + offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index b2dcd161f514..3b9405e7ba2b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -65,8 +65,6 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); - OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); - OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5c0ea39311fe..405a9a61bb89 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -130,6 +130,7 @@ static const char *spectre_v2_strings[] = { [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", }; #undef pr_fmt @@ -313,23 +314,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } -/* Check for Skylake-like CPUs (for RSB handling) */ -static bool __init is_skylake_era(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - return true; - } - } - return false; -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -349,6 +333,13 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + mode = SPECTRE_V2_IBRS_ENHANCED; + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + 
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + goto specv2_set_mode; + } if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; @@ -386,26 +377,20 @@ retpoline_auto: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } +specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If neither SMEP nor PTI are available, there is a risk of - * hitting userspace addresses in the RSB after a context switch - * from a shallow call stack to a deeper one. To prevent this fill - * the entire RSB, even when using IBRS. + * If spectre v2 protection has been enabled, unconditionally fill + * RSB during a context switch; this protects against two independent + * issues: * - * Skylake era CPUs have a separate issue with *underflow* of the - * RSB, when they will predict 'ret' targets from the generic BTB. - * The proper mitigation for this is IBRS. If IBRS is not supported - * or deactivated in favour of retpolines the RSB fill on context - * switch is required. + * - RSB underflow (and switch to BTB) on Skylake+ + * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs */ - if ((!boot_cpu_has(X86_FEATURE_PTI) && - !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); - } + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); /* Initialize Indirect Branch Prediction Barrier if supported */ if (boot_cpu_has(X86_FEATURE_IBPB)) { @@ -415,9 +400,16 @@ retpoline_auto: /* * Retpoline means the kernel is safe because it has no indirect - * branches. But firmware isn't, so use IBRS to protect that. + * branches. Enhanced IBRS protects firmware too, so, enable restricted + * speculation around firmware calls only when Enhanced IBRS isn't + * supported. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS)) { + if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index eb4cb3efd20e..df28e931d732 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1005,6 +1005,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + if (ia32_cap & ARCH_CAP_IBRS_ALL) + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (x86_match_cpu(cpu_no_meltdown)) return; @@ -1804,11 +1807,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, curr); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. 
*/ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index abe6df15a8fb..30f9cb2c0b55 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -512,11 +512,18 @@ ENTRY(initial_code) ENTRY(setup_once_ref) .long setup_once +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#define PGD_ALIGN (2 * PAGE_SIZE) +#define PTI_USER_PGD_FILL 1024 +#else +#define PGD_ALIGN (PAGE_SIZE) +#define PTI_USER_PGD_FILL 0 +#endif /* * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE + .align PGD_ALIGN #ifdef CONFIG_X86_PAE .globl initial_pg_pmd initial_pg_pmd: @@ -526,14 +533,17 @@ initial_pg_pmd: initial_page_table: .fill 1024,4,0 #endif + .align PGD_ALIGN initial_pg_fixmap: .fill 1024,4,0 +.globl swapper_pg_dir + .align PGD_ALIGN +swapper_pg_dir: + .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 .globl empty_zero_page empty_zero_page: .fill 4096,1,0 -.globl swapper_pg_dir -swapper_pg_dir: - .fill 1024,4,0 EXPORT_SYMBOL(empty_zero_page) /* @@ -542,7 +552,7 @@ EXPORT_SYMBOL(empty_zero_page) #ifdef CONFIG_X86_PAE __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ - .align PAGE_SIZE + .align PGD_ALIGN ENTRY(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c9b14020f4dd..733e6ace0fa4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -100,6 +100,102 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return new_ldt; } +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +static void do_sanity_check(struct mm_struct *mm, + bool had_kernel_mapping, + bool had_user_mapping) +{ + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!had_user_mapping); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. 
+ */ + WARN_ON(had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(had_user_mapping); + } +} + +#ifdef CONFIG_X86_PAE + +static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) +{ + p4d_t *p4d; + pud_t *pud; + + if (pgd->pgd == 0) + return NULL; + + p4d = p4d_offset(pgd, va); + if (p4d_none(*p4d)) + return NULL; + + pud = pud_offset(p4d, va); + if (pud_none(*pud)) + return NULL; + + return pmd_offset(pud, va); +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pmd(u_pmd, *k_pmd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + bool had_kernel, had_user; + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + had_kernel = (k_pmd->pmd != 0); + had_user = (u_pmd->pmd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#else /* !CONFIG_X86_PAE */ + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pgd(kernel_to_user_pgdp(pgd), *pgd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + bool had_kernel = (pgd->pgd != 0); + bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#endif /* CONFIG_X86_PAE */ + /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. @@ -115,9 +211,8 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - bool is_vmalloc, had_top_level_entry; unsigned long va; + bool is_vmalloc; spinlock_t *ptl; pgd_t *pgd; int i; @@ -131,13 +226,15 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) */ WARN_ON(ldt->slot != -1); + /* Check if the current mappings are sane */ + sanity_check_ldt_mapping(mm); + /* * Did we already have the top level entry allocated? We can't * use pgd_none() for this because it doens't do anything on * 4-level page table kernels. */ pgd = pgd_offset(mm, LDT_BASE_ADDR); - had_top_level_entry = (pgd->pgd != 0); is_vmalloc = is_vmalloc_addr(ldt->entries); @@ -172,41 +269,31 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) pte_unmap_unlock(ptep, ptl); } - if (mm->context.ldt) { - /* - * We already had an LDT. The top-level entry should already - * have been allocated and synchronized with the usermode - * tables. - */ - WARN_ON(!had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) - WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); - } else { - /* - * This is the first time we're mapping an LDT for this process. - * Sync the pgd to the usermode tables. 
- */ - WARN_ON(had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) { - WARN_ON(kernel_to_user_pgdp(pgd)->pgd); - set_pgd(kernel_to_user_pgdp(pgd), *pgd); - } - } + /* Propagate LDT mapping to the user page-table */ + map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); ldt->slot = slot; -#endif return 0; } +#else /* !CONFIG_PAGE_TABLE_ISOLATION */ + +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + return 0; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + static void free_ldt_pgtables(struct mm_struct *mm) { #ifdef CONFIG_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; - unsigned long end = start + (1UL << PGDIR_SHIFT); + unsigned long end = LDT_END_ADDR; if (!static_cpu_has(X86_FEATURE_PTI)) return; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d1ab07ec8c9a..5409c2800ab5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -56,7 +56,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_page((unsigned long)image->arch.pgd); + free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -72,7 +72,8 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { - image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + PGD_ALLOCATION_ORDER); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 30ca2d1a9231..c93fcfdf1673 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,14 +57,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, -#ifdef CONFIG_X86_64 /* * .sp1 is cpu_current_top_of_stack. The init task never * runs user code, but cpu_current_top_of_stack should still * be well defined before the first context switch. */ .sp1 = TOP_OF_INIT_STACK, -#endif #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0ae659de21eb..2924fd447e61 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -285,7 +285,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - update_sp0(next_p); + update_task_stack(next_p); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 12bb445fb98d..476e3ddf8890 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -478,7 +478,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. 
*/ - update_sp0(next_p); + update_task_stack(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9d0b5af7db91..1c03e4aa6474 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - update_sp0(tsk); + update_task_stack(tsk); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; preempt_enable(); @@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - update_sp0(tsk); + update_task_stack(tsk); preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5e1458f609a1..8bde0a419f86 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -55,19 +55,22 @@ jiffies_64 = jiffies; * so we can enable protection checks as well as retain 2MB large page * mappings for kernel text. */ -#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_RODATA_END \ +#define X86_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ - __end_rodata_hpage_align = .; + __end_rodata_hpage_align = .; \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); #else -#define X64_ALIGN_RODATA_BEGIN -#define X64_ALIGN_RODATA_END +#define X86_ALIGN_RODATA_BEGIN +#define X86_ALIGN_RODATA_END \ + . = ALIGN(PAGE_SIZE); \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END @@ -141,9 +144,9 @@ SECTIONS /* .text should occupy whole number of pages */ . 
= ALIGN(PAGE_SIZE); - X64_ALIGN_RODATA_BEGIN + X86_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_RODATA_END + X86_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2f3c9196b834..a12afff146d1 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -111,6 +111,8 @@ static struct addr_marker address_markers[] = { [END_OF_SPACE_NR] = { -1, NULL } }; +#define INIT_PGD ((pgd_t *) &init_top_pgt) + #else /* CONFIG_X86_64 */ enum address_markers_idx { @@ -120,6 +122,9 @@ enum address_markers_idx { VMALLOC_END_NR, #ifdef CONFIG_HIGHMEM PKMAP_BASE_NR, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + LDT_NR, #endif CPU_ENTRY_AREA_NR, FIXADDR_START_NR, @@ -133,12 +138,17 @@ static struct addr_marker address_markers[] = { [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, #ifdef CONFIG_HIGHMEM [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { 0UL, "LDT remap" }, #endif [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, [END_OF_SPACE_NR] = { -1, NULL } }; +#define INIT_PGD (swapper_pg_dir) + #endif /* !CONFIG_X86_64 */ /* Multipliers for offsets within the PTEs */ @@ -496,11 +506,7 @@ static inline bool is_hypervisor_range(int idx) static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx, bool dmesg) { -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_top_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif + pgd_t *start = INIT_PGD; pgprotval_t prot, eff; int i; struct pg_state st = {}; @@ -563,12 +569,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) } EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); -static void ptdump_walk_user_pgd_level_checkwx(void) +void ptdump_walk_user_pgd_level_checkwx(void) { #ifdef CONFIG_PAGE_TABLE_ISOLATION - pgd_t *pgd = (pgd_t *) &init_top_pgt; + pgd_t *pgd = INIT_PGD; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!(__supported_pte_mask & _PAGE_NX) || + !static_cpu_has(X86_FEATURE_PTI)) return; pr_info("x86/mm: Checking user space page tables\n"); @@ -580,7 +587,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void) void ptdump_walk_pgd_level_checkwx(void) { ptdump_walk_pgd_level_core(NULL, NULL, true, false); - ptdump_walk_user_pgd_level_checkwx(); } static int __init pt_dump_init(void) @@ -609,6 +615,9 @@ static int __init pt_dump_init(void) # endif address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; +# ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +# endif #endif return 0; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2aafa6ab6103..db1c042e9853 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -317,8 +317,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Synchronize this task's top level page-table * with the 'reference' page table. diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cee58a972cb2..74b157ac078d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -773,13 +773,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } +/* + * begin/end can be in the direct map or the "high kernel mapping" + * used for the kernel image only. 
free_init_pages() will do the + * right thing for either kind of address. + */ +void free_kernel_image_pages(void *begin, void *end) +{ + unsigned long begin_ul = (unsigned long)begin; + unsigned long end_ul = (unsigned long)end; + unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; + + + free_init_pages("unused kernel image", begin_ul, end_ul); + + /* + * PTI maps some of the kernel into userspace. For performance, + * this includes some kernel areas that do not contain secrets. + * Those areas might be adjacent to the parts of the kernel image + * being freed, which may contain secrets. Remove the "high kernel + * image mapping" for these freed areas, ensuring they are not even + * potentially vulnerable to Meltdown regardless of the specific + * optimizations PTI is currently using. + * + * The "noalias" prevents unmapping the direct map alias which is + * needed to access the freed pages. + * + * This is only valid for 64bit kernels. 32bit has only one mapping + * which can't be treated in this way for obvious reasons. + */ + if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) + set_memory_np_noalias(begin_ul, len_pages); +} + void __ref free_initmem(void) { e820__reallocate_tables(); - free_init_pages("unused kernel", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + free_kernel_image_pages(&__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a688617c727e..dd519f372169 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1283,20 +1283,10 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_init_pages("unused kernel", - (unsigned long) __va(__pa_symbol(text_end)), - (unsigned long) __va(__pa_symbol(rodata_start))); - free_init_pages("unused kernel", - (unsigned long) __va(__pa_symbol(rodata_end)), - (unsigned long) __va(__pa_symbol(_sdata))); + free_kernel_image_pages((void *)text_end, (void *)rodata_start); + free_kernel_image_pages((void *)rodata_end, (void *)_sdata); debug_checkwx(); - - /* - * Do this after all of the manipulation of the - * kernel text page tables are complete. - */ - pti_clone_kernel_text(); } int kern_addr_valid(unsigned long addr) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3bded76e8d5c..0a74996a1149 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 +#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + /* Has caller explicitly disabled alias checking? 
*/ + if (in_flag & CPA_NO_CHECK_ALIAS) + checkalias = 0; ret = __change_page_attr_set_clr(&cpa, checkalias); @@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages) return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } +int set_memory_np_noalias(unsigned long addr, int numpages) +{ + int cpa_flags = CPA_NO_CHECK_ALIAS; + + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(_PAGE_PRESENT), 0, + cpa_flags, NULL); +} + int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), @@ -1784,6 +1797,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages) __pgprot(_PAGE_GLOBAL), 0); } +int set_memory_global(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0f1683fcb196..3ef095c70ae3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -182,6 +182,14 @@ static void pgd_dtor(pgd_t *pgd) */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +/* + * We allocate separate PMDs for the kernel part of the user page-table + * when PTI is enabled. We need them to map the per-process LDT into the + * user-space page-table. + */ +#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ + KERNEL_PGD_PTRS : 0) + void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); @@ -202,14 +210,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 - +#define PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ -static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; - for(i = 0; i < PREALLOCATED_PMDS; i++) + for (i = 0; i < count; i++) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); @@ -217,7 +225,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) } } -static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; @@ -226,7 +234,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - for(i = 0; i < PREALLOCATED_PMDS; i++) { + for (i = 0; i < count; i++) { pmd_t *pmd = (pmd_t *)__get_free_page(gfp); if (!pmd) failed = true; @@ -241,7 +249,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) } if (failed) { - free_pmds(mm, pmds); + free_pmds(mm, pmds, count); return -ENOMEM; } @@ -254,23 +262,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) * preallocate which never got a corresponding vma will need to be * freed manually. 
*/ +static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = *pgdp; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + *pgdp = native_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); + } +} + static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; - for(i = 0; i < PREALLOCATED_PMDS; i++) { - pgd_t pgd = pgdp[i]; + for (i = 0; i < PREALLOCATED_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i]); - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); +#ifdef CONFIG_PAGE_TABLE_ISOLATION - pgdp[i] = native_make_pgd(0); + if (!static_cpu_has(X86_FEATURE_PTI)) + return; - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - mm_dec_nr_pmds(mm); - } - } + pgdp = kernel_to_user_pgdp(pgdp); + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); +#endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) @@ -296,6 +319,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ + pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + p4d_t *u_p4d; + pud_t *u_pud; + int i; + + u_p4d = p4d_offset(u_pgd, 0); + u_pud = pud_offset(u_p4d, 0); + + s_pgd += KERNEL_PGD_BOUNDARY; + u_pud += KERNEL_PGD_BOUNDARY; + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { + pmd_t *pmd = pmds[i]; + + memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, u_pud, pmd); + } + +} +#else +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ +} +#endif /* * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also * assumes that pgd should be in one page. @@ -340,7 +395,8 @@ static inline pgd_t *_pgd_alloc(void) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, + PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. 
We can allocate @@ -352,7 +408,7 @@ static inline pgd_t *_pgd_alloc(void) static inline void _pgd_free(pgd_t *pgd) { if (!SHARED_KERNEL_PMD) - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); else kmem_cache_free(pgd_cache, pgd); } @@ -372,6 +428,7 @@ static inline void _pgd_free(pgd_t *pgd) pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; + pmd_t *u_pmds[PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(); @@ -381,12 +438,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(mm, pmds) != 0) + if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; - if (paravirt_pgd_alloc(mm) != 0) + if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_user_pmds; + /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they @@ -396,13 +456,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; +out_free_user_pmds: + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: - free_pmds(mm, pmds); + free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 4d418e705878..d58b4aba9510 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -54,6 +54,16 @@ #define __GFP_NOTRACK 0 #endif +/* + * Define the page-table levels we clone for user-space on 32 + * and 64 bit. + */ +#ifdef CONFIG_X86_64 +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD +#else +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE +#endif + static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) @@ -117,7 +127,7 @@ enable: setup_force_cpu_cap(X86_FEATURE_PTI); } -pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* * Changes to the high (kernel) portion of the kernelmode page @@ -176,7 +186,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) if (pgd_none(*pgd)) { unsigned long new_p4d_page = __get_free_page(gfp); - if (!new_p4d_page) + if (WARN_ON_ONCE(!new_p4d_page)) return NULL; set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); @@ -195,13 +205,17 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - p4d_t *p4d = pti_user_pagetable_walk_p4d(address); + p4d_t *p4d; pud_t *pud; + p4d = pti_user_pagetable_walk_p4d(address); + if (!p4d) + return NULL; + BUILD_BUG_ON(p4d_large(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); - if (!new_pud_page) + if (WARN_ON_ONCE(!new_pud_page)) return NULL; set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); @@ -215,7 +229,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) } if (pud_none(*pud)) { unsigned long new_pmd_page = __get_free_page(gfp); - if (!new_pmd_page) + if (WARN_ON_ONCE(!new_pmd_page)) return NULL; set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); @@ -224,7 +238,6 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) return pmd_offset(pud, address); } -#ifdef CONFIG_X86_VSYSCALL_EMULATION /* * Walk the shadow copy of the page tables (optionally) trying to allocate * page table pages on the way down. 
Does not support large pages. @@ -237,9 +250,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - pmd_t *pmd = pti_user_pagetable_walk_pmd(address); + pmd_t *pmd; pte_t *pte; + pmd = pti_user_pagetable_walk_pmd(address); + if (!pmd) + return NULL; + /* We can't do anything sensible if we hit a large mapping. */ if (pmd_large(*pmd)) { WARN_ON(1); @@ -262,6 +279,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) return pte; } +#ifdef CONFIG_X86_VSYSCALL_EMULATION static void __init pti_setup_vsyscall(void) { pte_t *pte, *target_pte; @@ -282,8 +300,14 @@ static void __init pti_setup_vsyscall(void) static void __init pti_setup_vsyscall(void) { } #endif +enum pti_clone_level { + PTI_CLONE_PMD, + PTI_CLONE_PTE, +}; + static void -pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +pti_clone_pgtable(unsigned long start, unsigned long end, + enum pti_clone_level level) { unsigned long addr; @@ -291,59 +315,105 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) * Clone the populated PMDs which cover start to end. These PMD areas * can have holes. */ - for (addr = start; addr < end; addr += PMD_SIZE) { + for (addr = start; addr < end;) { + pte_t *pte, *target_pte; pmd_t *pmd, *target_pmd; pgd_t *pgd; p4d_t *p4d; pud_t *pud; + /* Overflow check */ + if (addr < start) + break; + pgd = pgd_offset_k(addr); if (WARN_ON(pgd_none(*pgd))) return; p4d = p4d_offset(pgd, addr); if (WARN_ON(p4d_none(*p4d))) return; + pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + if (pud_none(*pud)) { + addr += PUD_SIZE; continue; + } + pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (pmd_none(*pmd)) { + addr += PMD_SIZE; continue; + } - target_pmd = pti_user_pagetable_walk_pmd(addr); - if (WARN_ON(!target_pmd)) - return; + if (pmd_large(*pmd) || level == PTI_CLONE_PMD) { + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; - /* - * Only clone present PMDs. This ensures only setting - * _PAGE_GLOBAL on present PMDs. This should only be - * called on well-known addresses anyway, so a non- - * present PMD would be a surprise. - */ - if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) - return; + /* + * Only clone present PMDs. This ensures only setting + * _PAGE_GLOBAL on present PMDs. This should only be + * called on well-known addresses anyway, so a non- + * present PMD would be a surprise. + */ + if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) + return; - /* - * Setting 'target_pmd' below creates a mapping in both - * the user and kernel page tables. It is effectively - * global, so set it as global in both copies. Note: - * the X86_FEATURE_PGE check is not _required_ because - * the CPU ignores _PAGE_GLOBAL when PGE is not - * supported. The check keeps consistentency with - * code that only set this bit when supported. - */ - if (boot_cpu_has(X86_FEATURE_PGE)) - *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); + /* + * Setting 'target_pmd' below creates a mapping in both + * the user and kernel page tables. It is effectively + * global, so set it as global in both copies. Note: + * the X86_FEATURE_PGE check is not _required_ because + * the CPU ignores _PAGE_GLOBAL when PGE is not + * supported. The check keeps consistency with + * code that only set this bit when supported. 
+ */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); - /* - * Copy the PMD. That is, the kernelmode and usermode - * tables will share the last-level page tables of this - * address range - */ - *target_pmd = pmd_clear_flags(*pmd, clear); + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = *pmd; + + addr += PMD_SIZE; + + } else if (level == PTI_CLONE_PTE) { + + /* Walk the page-table down to the pte level */ + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + addr += PAGE_SIZE; + continue; + } + + /* Only clone present PTEs */ + if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) + return; + + /* Allocate PTE in the user page-table */ + target_pte = pti_user_pagetable_walk_pte(addr); + if (WARN_ON(!target_pte)) + return; + + /* Set GLOBAL bit in both PTEs */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pte = pte_set_flags(*pte, _PAGE_GLOBAL); + + /* Clone the PTE */ + *target_pte = *pte; + + addr += PAGE_SIZE; + + } else { + BUG(); + } } } +#ifdef CONFIG_X86_64 /* * Clone a single p4d (i.e. a top-level entry on 4-level systems and a * next-level entry on 5-level systems. @@ -354,6 +424,9 @@ static void __init pti_clone_p4d(unsigned long addr) pgd_t *kernel_pgd; user_p4d = pti_user_pagetable_walk_p4d(addr); + if (!user_p4d) + return; + kernel_pgd = pgd_offset_k(addr); kernel_p4d = p4d_offset(kernel_pgd, addr); *user_p4d = *kernel_p4d; @@ -367,6 +440,25 @@ static void __init pti_clone_user_shared(void) pti_clone_p4d(CPU_ENTRY_AREA_BASE); } +#else /* CONFIG_X86_64 */ + +/* + * On 32 bit PAE systems with 1GB of Kernel address space there is only + * one pgd/p4d for the whole kernel. Cloning that would map the whole + * address space into the user page-tables, making PTI useless. So clone + * the page-table on the PMD level to prevent that. + */ +static void __init pti_clone_user_shared(void) +{ + unsigned long start, end; + + start = CPU_ENTRY_AREA_BASE; + end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); + + pti_clone_pgtable(start, end, PTI_CLONE_PMD); +} +#endif /* CONFIG_X86_64 */ + /* * Clone the ESPFIX P4D into the user space visible page table */ @@ -380,11 +472,11 @@ static void __init pti_setup_espfix64(void) /* * Clone the populated PMDs of the entry and irqentry text and force it RO. */ -static void __init pti_clone_entry_text(void) +static void pti_clone_entry_text(void) { - pti_clone_pmds((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end, - _PAGE_RW); + pti_clone_pgtable((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, + PTI_CLONE_PMD); } /* @@ -434,11 +526,18 @@ static inline bool pti_kernel_image_global_ok(void) return true; } +/* + * This is the only user for these and it is not arch-generic + * like the other set_memory.h functions. Just extern them. + */ +extern int set_memory_nonglobal(unsigned long addr, int numpages); +extern int set_memory_global(unsigned long addr, int numpages); + /* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ -void pti_clone_kernel_text(void) +static void pti_clone_kernel_text(void) { /* * rodata is part of the kernel image and is normally @@ -446,7 +545,8 @@ void pti_clone_kernel_text(void) * clone the areas past rodata, they might contain secrets. 
*/ unsigned long start = PFN_ALIGN(_text); - unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long end_clone = (unsigned long)__end_rodata_aligned; + unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table); if (!pti_kernel_image_global_ok()) return; @@ -458,14 +558,18 @@ void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pmds(start, end, _PAGE_RW); + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); + + /* + * pti_clone_pgtable() will set the global bit in any PMDs + * that it clones, but we also need to get any PTEs in + * the last level for areas that are not huge-page-aligned. + */ + + /* Set the global bit for normal non-__init kernel text: */ + set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } -/* - * This is the only user for it and it is not arch-generic like - * the other set_memory.h functions. Just extern it. - */ -extern int set_memory_nonglobal(unsigned long addr, int numpages); void pti_set_kernel_image_nonglobal(void) { /* @@ -477,9 +581,11 @@ void pti_set_kernel_image_nonglobal(void) unsigned long start = PFN_ALIGN(_text); unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); - if (pti_kernel_image_global_ok()) - return; - + /* + * This clears _PAGE_GLOBAL from the entire kernel image. + * pti_clone_kernel_text() map put _PAGE_GLOBAL back for + * areas that are mapped to userspace. + */ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } @@ -493,6 +599,28 @@ void __init pti_init(void) pr_info("enabled\n"); +#ifdef CONFIG_X86_32 + /* + * We check for X86_FEATURE_PCID here. But the init-code will + * clear the feature flag on 32 bit because the feature is not + * supported on 32 bit anyway. To print the warning we need to + * check with cpuid directly again. + */ + if (cpuid_ecx(0x1) & BIT(17)) { + /* Use printk to work around pr_fmt() */ + printk(KERN_WARNING "\n"); + printk(KERN_WARNING "************************************************************\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); + printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); + printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "************************************************************\n"); + } +#endif + pti_clone_user_shared(); /* Undo all global bits from the init pagetables in head_64.S: */ @@ -502,3 +630,22 @@ void __init pti_init(void) pti_setup_espfix64(); pti_setup_vsyscall(); } + +/* + * Finalize the kernel mappings in the userspace page-table. Some of the + * mappings for the kernel image might have changed since pti_init() + * cloned them. This is because parts of the kernel image have been + * mapped RO and/or NX. These changes need to be cloned again to the + * userspace page-table. + */ +void pti_finalize(void) +{ + /* + * We need to clone everything (again) that maps parts of the + * kernel image. 
+ */ + pti_clone_entry_text(); + pti_clone_kernel_text(); + + debug_checkwx_user(); +} diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 220e97841e49..3a6c8ebc8032 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -67,6 +67,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__tracedata_(start|end)|" "__(start|stop)_notes|" "__end_rodata|" + "__end_rodata_aligned|" "__initramfs_start|" "(jiffies|jiffies_64)|" #if ELF_BITS == 64 diff --git a/include/linux/pti.h b/include/linux/pti.h index 0174883a935a..1a941efcaa62 100644 --- a/include/linux/pti.h +++ b/include/linux/pti.h @@ -6,6 +6,7 @@ #include #else static inline void pti_init(void) { } +static inline void pti_finalize(void) { } #endif #endif diff --git a/init/main.c b/init/main.c index 5e13c544bbf4..f86b64c35d38 100644 --- a/init/main.c +++ b/init/main.c @@ -1065,6 +1065,13 @@ static int __ref kernel_init(void *unused) jump_label_invalidate_initmem(); free_initmem(); mark_readonly(); + + /* + * Kernel mappings are now finalized - update the userspace page-table + * to finalize PTI. + */ + pti_finalize(); + system_state = SYSTEM_RUNNING; numa_default_policy(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a790ef4be74e..3222193c46c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6939,9 +6939,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + struct page *page = virt_to_page(pos); + void *direct_map_addr; + + /* + * 'direct_map_addr' might be different from 'pos' + * because some architectures' virt_to_page() + * work with aliases. Getting the direct map + * address ensures that we get a _writeable_ + * alias for the memset(). + */ + direct_map_addr = page_address(page); if ((unsigned int)poison <= 0xFF) - memset(pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page(pos)); + memset(direct_map_addr, poison, PAGE_SIZE); + + free_reserved_page(page); } if (pages && s) diff --git a/security/Kconfig b/security/Kconfig index c4302067a3ad..afa91c6f06bb 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -57,7 +57,7 @@ config SECURITY_NETWORK config PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" default y - depends on X86_64 && !UML + depends on X86 && !UML help This feature reduces the number of hardware side channels by ensuring that the majority of kernel addresses are not mapped
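
The reworked pti_clone_pgtable() above no longer strides by a fixed PMD_SIZE; it advances by PUD_SIZE, PMD_SIZE or PAGE_SIZE depending on what it finds, and it grows an explicit "Overflow check". Below is a minimal userspace sketch (plain C, purely illustrative constants, not kernel code) of why that "addr < start" test is needed: when a cloned range ends close to the top of a 32-bit address space, the final increment wraps addr around to a small value that still satisfies "addr < end", and only the comparison against start breaks the loop.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE 0x400000u	/* illustrative 4 MiB step, not the kernel's definition */

static void walk(uint32_t start, uint32_t end)
{
	uint32_t addr;

	for (addr = start; addr < end;) {
		/*
		 * Overflow check: after wrapping past 4 GiB, addr is smaller
		 * than start but still smaller than end, so without this
		 * test the loop would never terminate.
		 */
		if (addr < start)
			break;

		printf("visit 0x%08" PRIx32 "\n", addr);
		addr += PMD_SIZE;
	}
}

int main(void)
{
	/* Visits 0xff800000 and 0xffc00000; the next increment wraps to 0. */
	walk(0xff800000u, 0xffffffffu);
	return 0;
}

The same reasoning applies unchanged when the loop steps by PAGE_SIZE or PUD_SIZE instead of PMD_SIZE.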
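
Several hunks (machine_kexec_free_page_tables(), machine_kexec_alloc_page_tables(), _pgd_alloc()/_pgd_free()) switch the pgd from a single page to a PGD_ALLOCATION_ORDER allocation. The userspace sketch below shows the layout this relies on: with PTI the user copy of the top-level table sits in the page directly after the kernel copy, so converting a kernel pgd pointer into the user one is a cheap pointer operation via kernel_to_user_pgdp() rather than a separate allocation. The order-1 size and the bit-PAGE_SHIFT flip in the helper are assumptions made for the illustration; the real PGD_ALLOCATION_ORDER and kernel_to_user_pgdp() live in the x86 pgalloc/pgtable headers.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT		12
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define PGD_ALLOCATION_ORDER	1	/* assumed: two consecutive, naturally aligned pages */

/* Assumed behaviour for the sketch: the user pgd is the adjacent page. */
static uint64_t *kernel_to_user_pgdp(uint64_t *k_pgd)
{
	return (uint64_t *)((uintptr_t)k_pgd | PAGE_SIZE);
}

int main(void)
{
	size_t sz = PAGE_SIZE << PGD_ALLOCATION_ORDER;
	/* Stand-in for __get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ALLOCATION_ORDER). */
	uint64_t *k_pgd = aligned_alloc(sz, sz);
	uint64_t *u_pgd;

	if (!k_pgd)
		return 1;

	u_pgd = kernel_to_user_pgdp(k_pgd);
	printf("kernel pgd: %p\nuser   pgd: %p\n", (void *)k_pgd, (void *)u_pgd);

	free(k_pgd);	/* mirrors free_pages(..., PGD_ALLOCATION_ORDER) */
	return 0;
}

Freeing with free_pages(..., PGD_ALLOCATION_ORDER) in machine_kexec_free_page_tables() and _pgd_free() has to match the allocation order for the same reason.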