From e046eb0c9bf26d94be9e4592c00c7a78b0fa9bfd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 12:56:18 +0100 Subject: [PATCH 001/104] arm64: mm: Use non-global mappings for kernel space In preparation for unmapping the kernel whilst running in userspace, make the kernel mappings non-global so we can avoid expensive TLB invalidation on kernel exit to userspace. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 12 ++++++++++-- arch/arm64/include/asm/pgtable-prot.h | 21 +++++++++++++++------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7803343e5881..77a27af01371 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -78,8 +78,16 @@ /* * Initial memory map attributes. */ -#define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#define _SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) +#define _SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define SWAPPER_PTE_FLAGS (_SWAPPER_PTE_FLAGS | PTE_NG) +#define SWAPPER_PMD_FLAGS (_SWAPPER_PMD_FLAGS | PMD_SECT_NG) +#else +#define SWAPPER_PTE_FLAGS _SWAPPER_PTE_FLAGS +#define SWAPPER_PMD_FLAGS _SWAPPER_PMD_FLAGS +#endif #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 0a5635fb0ef9..22a926825e3f 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -34,8 +34,16 @@ #include -#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) +#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define PROT_DEFAULT (_PROT_DEFAULT | PTE_NG) +#define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_SECT_NG) +#else +#define PROT_DEFAULT _PROT_DEFAULT +#define PROT_SECT_DEFAULT _PROT_SECT_DEFAULT +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) @@ -48,6 +56,7 @@ #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) +#define _HYP_PAGE_DEFAULT (_PAGE_DEFAULT & ~PTE_NG) #define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_RO __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY) @@ -55,15 +64,15 @@ #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) -#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) -#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) -#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) +#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) +#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) +#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) #define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) From 376133b7edc20f237a42e4c72415cc9e8c0a9704 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:04:48 +0100 Subject: [PATCH 002/104] arm64: mm: Temporarily disable ARM64_SW_TTBR0_PAN We're about to rework the way ASIDs are allocated, switch_mm is implemented and low-level kernel entry/exit is handled, so keep the ARM64_SW_TTBR0_PAN code out of the way whilst we do the heavy lifting. It will be re-enabled in a subsequent patch. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a93339f5178f..7e7d7fd152c4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -910,6 +910,7 @@ endif config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + depends on BROKEN # Temporary while switch_mm is reworked help Enabling this option prevents the kernel from accessing user-space memory directly by pointing TTBR0_EL1 to a reserved From 7655abb953860485940d4de74fb45a8192149bb6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:19:09 +0100 Subject: [PATCH 003/104] arm64: mm: Move ASID from TTBR0 to TTBR1 In preparation for mapping kernelspace and userspace with different ASIDs, move the ASID to TTBR1 and update switch_mm to context-switch TTBR0 via an invalid mapping (the zero page). Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu_context.h | 7 +++++++ arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/include/asm/proc-fns.h | 6 ------ arch/arm64/mm/proc.S | 9 ++++++--- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 9d155fa9a507..aa39c126c0d0 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -57,6 +57,13 @@ static inline void cpu_set_reserved_ttbr0(void) isb(); } +static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUG_ON(pgd == swapper_pg_dir); + cpu_set_reserved_ttbr0(); + cpu_do_switch_mm(virt_to_phys(pgd),mm); +} + /* * TCR.T0SZ value to use when the ID map is active. Usually equals * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index eb0c2bd90de9..8df4cb6ac6f7 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -272,6 +272,7 @@ #define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) #define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) +#define TCR_A1 (UL(1) << 22) #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h index 14ad6e4e87d1..16cef2e8449e 100644 --- a/arch/arm64/include/asm/proc-fns.h +++ b/arch/arm64/include/asm/proc-fns.h @@ -35,12 +35,6 @@ extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr); #include -#define cpu_switch_mm(pgd,mm) \ -do { \ - BUG_ON(pgd == swapper_pg_dir); \ - cpu_do_switch_mm(virt_to_phys(pgd),mm); \ -} while (0) - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* __ASM_PROCFNS_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 95233dfc4c39..a8a64898a2aa 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -139,9 +139,12 @@ ENDPROC(cpu_do_resume) */ ENTRY(cpu_do_switch_mm) pre_ttbr0_update_workaround x0, x2, x3 + mrs x2, ttbr1_el1 mmid x1, x1 // get mm->context.id - bfi x0, x1, #48, #16 // set the ASID - msr ttbr0_el1, x0 // set TTBR0 + bfi x2, x1, #48, #16 // set the ASID + msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set) + isb + msr ttbr0_el1, x0 // now update TTBR0 isb post_ttbr0_update_workaround ret @@ -224,7 +227,7 @@ ENTRY(__cpu_setup) * both user and kernel. */ ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ - TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 + TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 | TCR_A1 tcr_set_idmap_t0sz x10, x9 /* From 85d13c001497b481b4a8cc5d7db243cc44ac2bd8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:29:06 +0100 Subject: [PATCH 004/104] arm64: mm: Remove pre_ttbr0_update_workaround for Falkor erratum #E1003 The pre_ttbr0_update_workaround hook is called prior to context-switching TTBR0 because Falkor erratum E1003 can cause TLB allocation with the wrong ASID if both the ASID and the base address of the TTBR are updated at the same time. With the ASID sitting safely in TTBR1, we no longer update things atomically, so we can remove the pre_ttbr0_update_workaround macro as it's no longer required. The erratum infrastructure and documentation is left around for #E1003, as it will be required by the entry trampoline code in a future patch. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 22 ---------------------- arch/arm64/include/asm/mmu_context.h | 2 -- arch/arm64/mm/context.c | 11 ----------- arch/arm64/mm/proc.S | 1 - 4 files changed, 36 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aef72d886677..e1fa5db858b7 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -478,27 +477,6 @@ alternative_endif .endm /* - * Errata workaround prior to TTBR0_EL1 update - * - * val: TTBR value with new BADDR, preserved - * tmp0: temporary register, clobbered - * tmp1: other temporary register, clobbered - */ - .macro pre_ttbr0_update_workaround, val, tmp0, tmp1 -#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 -alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003 - mrs \tmp0, ttbr0_el1 - mov \tmp1, #FALKOR_RESERVED_ASID - bfi \tmp0, \tmp1, #48, #16 // reserved ASID + old BADDR - msr ttbr0_el1, \tmp0 - isb - bfi \tmp0, \val, #0, #48 // reserved ASID + new BADDR - msr ttbr0_el1, \tmp0 - isb -alternative_else_nop_endif -#endif - .endm - /* * Errata workaround post TTBR0_EL1 update. */ diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index aa39c126c0d0..da29766a181c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -19,8 +19,6 @@ #ifndef __ASM_MMU_CONTEXT_H #define __ASM_MMU_CONTEXT_H -#define FALKOR_RESERVED_ASID 1 - #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 6f4017046323..78a2dc596fee 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -79,13 +79,6 @@ void verify_cpu_asid_bits(void) } } -static void set_reserved_asid_bits(void) -{ - if (IS_ENABLED(CONFIG_QCOM_FALKOR_ERRATUM_1003) && - cpus_have_const_cap(ARM64_WORKAROUND_QCOM_FALKOR_E1003)) - __set_bit(FALKOR_RESERVED_ASID, asid_map); -} - static void flush_context(unsigned int cpu) { int i; @@ -94,8 +87,6 @@ static void flush_context(unsigned int cpu) /* Update the list of reserved ASIDs and the ASID bitmap. */ bitmap_clear(asid_map, 0, NUM_USER_ASIDS); - set_reserved_asid_bits(); - for_each_possible_cpu(i) { asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0); /* @@ -254,8 +245,6 @@ static int asids_init(void) panic("Failed to allocate bitmap for %lu ASIDs\n", NUM_USER_ASIDS); - set_reserved_asid_bits(); - pr_info("ASID allocator initialised with %lu entries\n", NUM_USER_ASIDS); return 0; } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index a8a64898a2aa..f2ff0837577c 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -138,7 +138,6 @@ ENDPROC(cpu_do_resume) * - pgd_phys - physical address of new TTB */ ENTRY(cpu_do_switch_mm) - pre_ttbr0_update_workaround x0, x2, x3 mrs x2, ttbr1_el1 mmid x1, x1 // get mm->context.id bfi x2, x1, #48, #16 // set the ASID From 158d495899ce55db453f682a8ac8390d5a426578 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:34:30 +0100 Subject: [PATCH 005/104] arm64: mm: Rename post_ttbr0_update_workaround The post_ttbr0_update_workaround hook applies to any change to TTBRx_EL1. Since we're using TTBR1 for the ASID, rename the hook to make it clearer as to what it's doing. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 5 ++--- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/proc.S | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index e1fa5db858b7..c45bc94f15d0 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -477,10 +477,9 @@ alternative_endif .endm /* -/* - * Errata workaround post TTBR0_EL1 update. + * Errata workaround post TTBRx_EL1 update. */ - .macro post_ttbr0_update_workaround + .macro post_ttbr_update_workaround #ifdef CONFIG_CAVIUM_ERRATUM_27456 alternative_if ARM64_WORKAROUND_CAVIUM_27456 ic iallu diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6d14b8f29b5f..804e43c9cb0b 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -257,7 +257,7 @@ alternative_else_nop_endif * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache * corruption). */ - post_ttbr0_update_workaround + post_ttbr_update_workaround .endif 1: .if \el != 0 diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index f2ff0837577c..3146dc96f05b 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -145,7 +145,7 @@ ENTRY(cpu_do_switch_mm) isb msr ttbr0_el1, x0 // now update TTBR0 isb - post_ttbr0_update_workaround + post_ttbr_update_workaround ret ENDPROC(cpu_do_switch_mm) From 27a921e75711d924617269e0ba4adb8bae9fd0d1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:58:16 +0100 Subject: [PATCH 006/104] arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN With the ASID now installed in TTBR1, we can re-enable ARM64_SW_TTBR0_PAN by ensuring that we switch to a reserved ASID of zero when disabling user access and restore the active user ASID on the uaccess enable path. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 - arch/arm64/include/asm/asm-uaccess.h | 25 +++++++++++++++++-------- arch/arm64/include/asm/uaccess.h | 21 +++++++++++++++++---- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/lib/clear_user.S | 2 +- arch/arm64/lib/copy_from_user.S | 2 +- arch/arm64/lib/copy_in_user.S | 2 +- arch/arm64/lib/copy_to_user.S | 2 +- arch/arm64/mm/cache.S | 2 +- arch/arm64/xen/hypercall.S | 2 +- 10 files changed, 42 insertions(+), 21 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7e7d7fd152c4..a93339f5178f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -910,7 +910,6 @@ endif config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" - depends on BROKEN # Temporary while switch_mm is reworked help Enabling this option prevents the kernel from accessing user-space memory directly by pointing TTBR0_EL1 to a reserved diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index b3da6c886835..21b8cf304028 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -16,11 +16,20 @@ add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb + sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE + bic \tmp1, \tmp1, #(0xffff << 48) + msr ttbr1_el1, \tmp1 // set reserved ASID + isb .endm - .macro __uaccess_ttbr0_enable, tmp1 + .macro __uaccess_ttbr0_enable, tmp1, tmp2 get_thread_info \tmp1 ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + mrs \tmp2, ttbr1_el1 + extr \tmp2, \tmp2, \tmp1, #48 + ror \tmp2, \tmp2, #16 + msr ttbr1_el1, \tmp2 // set the active ASID + isb msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 isb .endm @@ -31,18 +40,18 @@ alternative_if_not ARM64_HAS_PAN alternative_else_nop_endif .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 alternative_if_not ARM64_HAS_PAN - save_and_disable_irq \tmp2 // avoid preemption - __uaccess_ttbr0_enable \tmp1 - restore_irq \tmp2 + save_and_disable_irq \tmp3 // avoid preemption + __uaccess_ttbr0_enable \tmp1, \tmp2 + restore_irq \tmp3 alternative_else_nop_endif .endm #else .macro uaccess_ttbr0_disable, tmp1 .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 .endm #endif @@ -56,8 +65,8 @@ alternative_if ARM64_ALT_PAN_NOT_UAO alternative_else_nop_endif .endm - .macro uaccess_enable_not_uao, tmp1, tmp2 - uaccess_ttbr0_enable \tmp1, \tmp2 + .macro uaccess_enable_not_uao, tmp1, tmp2, tmp3 + uaccess_ttbr0_enable \tmp1, \tmp2, \tmp3 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(0) alternative_else_nop_endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index fc0f9eb66039..750a3b76a01c 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -107,15 +107,19 @@ static inline void __uaccess_ttbr0_disable(void) { unsigned long ttbr; + ttbr = read_sysreg(ttbr1_el1); /* reserved_ttbr0 placed at the end of swapper_pg_dir */ - ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; - write_sysreg(ttbr, ttbr0_el1); + write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1); + isb(); + /* Set reserved ASID */ + ttbr &= ~(0xffffUL << 48); + write_sysreg(ttbr, ttbr1_el1); isb(); } static inline void __uaccess_ttbr0_enable(void) { - unsigned long flags; + unsigned long flags, ttbr0, ttbr1; /* * Disable interrupts to avoid preemption between reading the 'ttbr0' @@ -123,7 +127,16 @@ static inline void __uaccess_ttbr0_enable(void) * roll-over and an update of 'ttbr0'. */ local_irq_save(flags); - write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + ttbr0 = current_thread_info()->ttbr0; + + /* Restore active ASID */ + ttbr1 = read_sysreg(ttbr1_el1); + ttbr1 |= ttbr0 & (0xffffUL << 48); + write_sysreg(ttbr1, ttbr1_el1); + isb(); + + /* Restore user page table */ + write_sysreg(ttbr0, ttbr0_el1); isb(); local_irq_restore(flags); } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 804e43c9cb0b..d454d8ed45e4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -184,7 +184,7 @@ alternative_if ARM64_HAS_PAN alternative_else_nop_endif .if \el != 0 - mrs x21, ttbr0_el1 + mrs x21, ttbr1_el1 tst x21, #0xffff << 48 // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled @@ -248,7 +248,7 @@ alternative_else_nop_endif tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set .endif - __uaccess_ttbr0_enable x0 + __uaccess_ttbr0_enable x0, x1 .if \el == 0 /* diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index e88fb99c1561..8f9c4641e706 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -30,7 +30,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) - uaccess_enable_not_uao x2, x3 + uaccess_enable_not_uao x2, x3, x4 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4b5d826895ff..69d86a80f3e2 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -64,7 +64,7 @@ end .req x5 ENTRY(__arch_copy_from_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" uaccess_disable_not_uao x3 diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index b24a830419ad..e442b531252a 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -65,7 +65,7 @@ end .req x5 ENTRY(raw_copy_in_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" uaccess_disable_not_uao x3 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 351f0766f7a6..318f15d5c336 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -63,7 +63,7 @@ end .req x5 ENTRY(__arch_copy_to_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" uaccess_disable_not_uao x3 diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 7f1dbe962cf5..6cd20a8c0952 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -49,7 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) - uaccess_ttbr0_enable x2, x3 + uaccess_ttbr0_enable x2, x3, x4 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 401ceb71540c..acdbd2c9e899 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -101,7 +101,7 @@ ENTRY(privcmd_call) * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation * is enabled (it implies that hardware UAO and PAN disabled). */ - uaccess_ttbr0_enable x6, x7 + uaccess_ttbr0_enable x6, x7, x8 hvc XEN_IMM /* From 0c8ea531b7740754cf374ca8b7510655f569c5e3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 14:10:28 +0100 Subject: [PATCH 007/104] arm64: mm: Allocate ASIDs in pairs In preparation for separate kernel/user ASIDs, allocate them in pairs for each mm_struct. The bottom bit distinguishes the two: if it is set, then the ASID will map only userspace. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu.h | 1 + arch/arm64/mm/context.c | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 0d34bf0a89c7..01bfb184f2a8 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -17,6 +17,7 @@ #define __ASM_MMU_H #define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ +#define USER_ASID_FLAG (UL(1) << 48) typedef struct { atomic64_t id; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 78a2dc596fee..1cb3bc92ae5c 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -39,7 +39,16 @@ static cpumask_t tlb_flush_pending; #define ASID_MASK (~GENMASK(asid_bits - 1, 0)) #define ASID_FIRST_VERSION (1UL << asid_bits) -#define NUM_USER_ASIDS ASID_FIRST_VERSION + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define NUM_USER_ASIDS (ASID_FIRST_VERSION >> 1) +#define asid2idx(asid) (((asid) & ~ASID_MASK) >> 1) +#define idx2asid(idx) (((idx) << 1) & ~ASID_MASK) +#else +#define NUM_USER_ASIDS (ASID_FIRST_VERSION) +#define asid2idx(asid) ((asid) & ~ASID_MASK) +#define idx2asid(idx) asid2idx(idx) +#endif /* Get the ASIDBits supported by the current CPU */ static u32 get_cpu_asid_bits(void) @@ -98,7 +107,7 @@ static void flush_context(unsigned int cpu) */ if (asid == 0) asid = per_cpu(reserved_asids, i); - __set_bit(asid & ~ASID_MASK, asid_map); + __set_bit(asid2idx(asid), asid_map); per_cpu(reserved_asids, i) = asid; } @@ -153,16 +162,16 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) * We had a valid ASID in a previous life, so try to re-use * it if possible. */ - asid &= ~ASID_MASK; - if (!__test_and_set_bit(asid, asid_map)) + if (!__test_and_set_bit(asid2idx(asid), asid_map)) return newasid; } /* * Allocate a free ASID. If we can't find one, take a note of the - * currently active ASIDs and mark the TLBs as requiring flushes. - * We always count from ASID #1, as we use ASID #0 when setting a - * reserved TTBR0 for the init_mm. + * currently active ASIDs and mark the TLBs as requiring flushes. We + * always count from ASID #2 (index 1), as we use ASID #0 when setting + * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd + * pairs. */ asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx); if (asid != NUM_USER_ASIDS) @@ -179,7 +188,7 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) set_asid: __set_bit(asid, asid_map); cur_idx = asid; - return asid | generation; + return idx2asid(asid) | generation; } void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) From fc0e1299da548b32440051f58f08e0c1eb7edd0b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 13:58:08 +0000 Subject: [PATCH 008/104] arm64: mm: Add arm64_kernel_unmapped_at_el0 helper In order for code such as TLB invalidation to operate efficiently when the decision to map the kernel at EL0 is determined at runtime, this patch introduces a helper function, arm64_kernel_unmapped_at_el0, to determine whether or not the kernel is mapped whilst running in userspace. Currently, this just reports the value of CONFIG_UNMAP_KERNEL_AT_EL0, but will later be hooked up to a fake CPU capability using a static key. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 01bfb184f2a8..c07954638658 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -19,6 +19,8 @@ #define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ #define USER_ASID_FLAG (UL(1) << 48) +#ifndef __ASSEMBLY__ + typedef struct { atomic64_t id; void *vdso; @@ -32,6 +34,11 @@ typedef struct { */ #define ASID(mm) ((mm)->context.id.counter & 0xffff) +static inline bool arm64_kernel_unmapped_at_el0(void) +{ + return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0); +} + extern void paging_init(void); extern void bootmem_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); @@ -42,4 +49,5 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, extern void *fixmap_remap_fdt(phys_addr_t dt_phys); extern void mark_linear_text_alias_ro(void); +#endif /* !__ASSEMBLY__ */ #endif From 9b0de864b5bc298ea53005ad812f3386f81aee9c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 14:13:33 +0100 Subject: [PATCH 009/104] arm64: mm: Invalidate both kernel and user ASIDs when performing TLBI Since an mm has both a kernel and a user ASID, we need to ensure that broadcast TLB maintenance targets both address spaces so that things like CoW continue to work with the uaccess primitives in the kernel. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index af1c76981911..9e82dd79c7db 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -23,6 +23,7 @@ #include #include +#include /* * Raw TLBI operations. @@ -54,6 +55,11 @@ #define __tlbi(op, ...) __TLBI_N(op, ##__VA_ARGS__, 1, 0) +#define __tlbi_user(op, arg) do { \ + if (arm64_kernel_unmapped_at_el0()) \ + __tlbi(op, (arg) | USER_ASID_FLAG); \ +} while (0) + /* * TLB Management * ============== @@ -115,6 +121,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) dsb(ishst); __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); dsb(ish); } @@ -125,6 +132,7 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, dsb(ishst); __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); dsb(ish); } @@ -151,10 +159,13 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ishst); for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) { - if (last_level) + if (last_level) { __tlbi(vale1is, addr); - else + __tlbi_user(vale1is, addr); + } else { __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } } dsb(ish); } @@ -194,6 +205,7 @@ static inline void __flush_tlb_pgtable(struct mm_struct *mm, unsigned long addr = uaddr >> 12 | (ASID(mm) << 48); __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); dsb(ish); } From c7b9adaf85f818d747eeff5145eb4095ccd587fb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:07:40 +0000 Subject: [PATCH 010/104] arm64: entry: Add exception trampoline page for exceptions from EL0 To allow unmapping of the kernel whilst running at EL0, we need to point the exception vectors at an entry trampoline that can map/unmap the kernel on entry/exit respectively. This patch adds the trampoline page, although it is not yet plugged into the vector table and is therefore unused. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 86 +++++++++++++++++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 17 +++++++ 2 files changed, 103 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d454d8ed45e4..716b5ef42e29 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -943,6 +945,90 @@ __ni_sys_trace: .popsection // .entry.text +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" + + .macro tramp_map_kernel, tmp + mrs \tmp, ttbr1_el1 + sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + bic \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + .endm + + .macro tramp_unmap_kernel, tmp + mrs \tmp, ttbr1_el1 + add \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + orr \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + /* + * We avoid running the post_ttbr_update_workaround here because the + * user and kernel ASIDs don't have conflicting mappings, so any + * "blessing" as described in: + * + * http://lkml.kernel.org/r/56BB848A.6060603@caviumnetworks.com + * + * will not hurt correctness. Whilst this may partially defeat the + * point of using split ASIDs in the first place, it avoids + * the hit of invalidating the entire I-cache on every return to + * userspace. + */ + .endm + + .macro tramp_ventry, regsize = 64 + .align 7 +1: + .if \regsize == 64 + msr tpidrro_el0, x30 // Restored in kernel_ventry + .endif + tramp_map_kernel x30 + ldr x30, =vectors + prfm plil1strm, [x30, #(1b - tramp_vectors)] + msr vbar_el1, x30 + add x30, x30, #(1b - tramp_vectors) + isb + br x30 + .endm + + .macro tramp_exit, regsize = 64 + adr x30, tramp_vectors + msr vbar_el1, x30 + tramp_unmap_kernel x30 + .if \regsize == 64 + mrs x30, far_el1 + .endif + eret + .endm + + .align 11 +ENTRY(tramp_vectors) + .space 0x400 + + tramp_ventry + tramp_ventry + tramp_ventry + tramp_ventry + + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 +END(tramp_vectors) + +ENTRY(tramp_exit_native) + tramp_exit +END(tramp_exit_native) + +ENTRY(tramp_exit_compat) + tramp_exit 32 +END(tramp_exit_compat) + + .ltorg + .popsection // .entry.tramp.text +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + /* * Special system call wrappers. */ diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7da3e5c366a0..6b4260f22aab 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -57,6 +57,17 @@ jiffies = jiffies_64; #define HIBERNATE_TEXT #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_TEXT \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_start) = .; \ + *(.entry.tramp.text) \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_end) = .; +#else +#define TRAMP_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which * runs from stext to _edata, must be a round multiple of the PE/COFF @@ -113,6 +124,7 @@ SECTIONS HYPERVISOR_TEXT IDMAP_TEXT HIBERNATE_TEXT + TRAMP_TEXT *(.fixup) *(.gnu.warning) . = ALIGN(16); @@ -214,6 +226,11 @@ SECTIONS . += RESERVED_TTBR0_SIZE; #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_pg_dir = .; + . += PAGE_SIZE; +#endif + __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; From 51a0048beb449682d632d0af52a515adb9f9882e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:14:17 +0000 Subject: [PATCH 011/104] arm64: mm: Map entry trampoline into trampoline and kernel page tables The exception entry trampoline needs to be mapped at the same virtual address in both the trampoline page table (which maps nothing else) and also the kernel page table, so that we can swizzle TTBR1_EL1 on exceptions from and return to EL0. This patch maps the trampoline at a fixed virtual address in the fixmap area of the kernel virtual address space, which allows the kernel proper to be randomized with respect to the trampoline when KASLR is enabled. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/fixmap.h | 4 ++++ arch/arm64/include/asm/pgtable.h | 1 + arch/arm64/kernel/asm-offsets.c | 6 +++++- arch/arm64/mm/mmu.c | 23 +++++++++++++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 4052ec39e8db..8119b49be98d 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -58,6 +58,10 @@ enum fixed_addresses { FIX_APEI_GHES_NMI, #endif /* CONFIG_ACPI_APEI_GHES */ +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + FIX_ENTRY_TRAMP_TEXT, +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, /* diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 149d05fb9421..774003b247ad 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -680,6 +680,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; +extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; /* * Encode and decode a swap entry: diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 71bf088f1e4b..af247d10252f 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -148,11 +149,14 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); - BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); DEFINE(ARM64_FTR_SYSVAL, offsetof(struct arm64_ftr_reg, sys_val)); + BLANK(); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + DEFINE(TRAMP_VALIAS, TRAMP_VALIAS); +#endif return 0; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 267d2b79d52d..fe68a48c64cb 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -525,6 +525,29 @@ static int __init parse_rodata(char *arg) } early_param("rodata", parse_rodata); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __init map_entry_trampoline(void) +{ + extern char __entry_tramp_text_start[]; + + pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; + phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); + + /* The trampoline is always mapped and can therefore be global */ + pgprot_val(prot) &= ~PTE_NG; + + /* Map only the text into the trampoline page table */ + memset(tramp_pg_dir, 0, PGD_SIZE); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, + prot, pgd_pgtable_alloc, 0); + + /* ...as well as the kernel page table */ + __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + return 0; +} +core_initcall(map_entry_trampoline); +#endif + /* * Create fine-grained mappings for the kernel. */ From 5b1f7fe41909cde40decad9f0e8ee585777a0538 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:20:21 +0000 Subject: [PATCH 012/104] arm64: entry: Explicitly pass exception level to kernel_ventry macro We will need to treat exceptions from EL0 differently in kernel_ventry, so rework the macro to take the exception level as an argument and construct the branch target using that. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 46 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 716b5ef42e29..b99fc928119c 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -71,7 +71,7 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_ventry label + .macro kernel_ventry, el, label, regsize = 64 .align 7 sub sp, sp, #S_FRAME_SIZE #ifdef CONFIG_VMAP_STACK @@ -84,7 +84,7 @@ tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - b \label + b el\()\el\()_\label 0: /* @@ -116,7 +116,7 @@ sub sp, sp, x0 mrs x0, tpidrro_el0 #endif - b \label + b el\()\el\()_\label .endm .macro kernel_entry, el, regsize = 64 @@ -369,31 +369,31 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - kernel_ventry el1_sync_invalid // Synchronous EL1t - kernel_ventry el1_irq_invalid // IRQ EL1t - kernel_ventry el1_fiq_invalid // FIQ EL1t - kernel_ventry el1_error_invalid // Error EL1t + kernel_ventry 1, sync_invalid // Synchronous EL1t + kernel_ventry 1, irq_invalid // IRQ EL1t + kernel_ventry 1, fiq_invalid // FIQ EL1t + kernel_ventry 1, error_invalid // Error EL1t - kernel_ventry el1_sync // Synchronous EL1h - kernel_ventry el1_irq // IRQ EL1h - kernel_ventry el1_fiq_invalid // FIQ EL1h - kernel_ventry el1_error // Error EL1h + kernel_ventry 1, sync // Synchronous EL1h + kernel_ventry 1, irq // IRQ EL1h + kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, error // Error EL1h - kernel_ventry el0_sync // Synchronous 64-bit EL0 - kernel_ventry el0_irq // IRQ 64-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 - kernel_ventry el0_error // Error 64-bit EL0 + kernel_ventry 0, sync // Synchronous 64-bit EL0 + kernel_ventry 0, irq // IRQ 64-bit EL0 + kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 + kernel_ventry 0, error // Error 64-bit EL0 #ifdef CONFIG_COMPAT - kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 - kernel_ventry el0_irq_compat // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - kernel_ventry el0_error_compat // Error 32-bit EL0 + kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_compat, 32 // Error 32-bit EL0 #else - kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 - kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 - kernel_ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 #endif END(vectors) From 4bf3286d29f3a88425d8d8cd53428cbb8f865f04 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:24:29 +0000 Subject: [PATCH 013/104] arm64: entry: Hook up entry trampoline to exception vectors Hook up the entry trampoline to our exception vectors so that all exceptions from and returns to EL0 go via the trampoline, which swizzles the vector base register accordingly. Transitioning to and from the kernel clobbers x30, so we use tpidrro_el0 and far_el1 as scratch registers for native tasks. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b99fc928119c..39e3873b8d5a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -73,6 +73,17 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + .if \el == 0 + .if \regsize == 64 + mrs x30, tpidrro_el0 + msr tpidrro_el0, xzr + .else + mov x30, xzr + .endif + .endif +#endif + sub sp, sp, #S_FRAME_SIZE #ifdef CONFIG_VMAP_STACK /* @@ -119,6 +130,11 @@ b el\()\el\()_\label .endm + .macro tramp_alias, dst, sym + mov_q \dst, TRAMP_VALIAS + add \dst, \dst, #(\sym - .entry.tramp.text) + .endm + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 @@ -271,18 +287,20 @@ alternative_else_nop_endif .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 + tst x22, #PSR_MODE32_BIT // native task? + b.eq 3f + #ifdef CONFIG_ARM64_ERRATUM_845719 alternative_if ARM64_WORKAROUND_845719 - tbz x22, #4, 1f #ifdef CONFIG_PID_IN_CONTEXTIDR mrs x29, contextidr_el1 msr contextidr_el1, x29 #else msr contextidr_el1, xzr #endif -1: alternative_else_nop_endif #endif +3: .endif msr elr_el1, x21 // set up the return data @@ -304,7 +322,22 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] ldr lr, [sp, #S_LR] add sp, sp, #S_FRAME_SIZE // restore sp - eret // return to kernel + +#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 + eret +#else + .if \el == 0 + bne 4f + msr far_el1, x30 + tramp_alias x30, tramp_exit_native + br x30 +4: + tramp_alias x30, tramp_exit_compat + br x30 + .else + eret + .endif +#endif .endm .macro irq_stack_entry From d1777e686ad10ba7c594304429c6045fb79255a1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:29:19 +0000 Subject: [PATCH 014/104] arm64: erratum: Work around Falkor erratum #E1003 in trampoline code We rely on an atomic swizzling of TTBR1 when transitioning from the entry trampoline to the kernel proper on an exception. We can't rely on this atomicity in the face of Falkor erratum #E1003, so on affected cores we can issue a TLB invalidation to invalidate the walk cache prior to jumping into the kernel. There is still the possibility of a TLB conflict here due to conflicting walk cache entries prior to the invalidation, but this doesn't appear to be the case on these CPUs in practice. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 17 +++++------------ arch/arm64/kernel/entry.S | 12 ++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a93339f5178f..fdcc7b9bb15d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -522,20 +522,13 @@ config CAVIUM_ERRATUM_30115 config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y - select ARM64_PAN if ARM64_SW_TTBR0_PAN help On Falkor v1, an incorrect ASID may be cached in the TLB when ASID - and BADDR are changed together in TTBRx_EL1. The workaround for this - issue is to use a reserved ASID in cpu_do_switch_mm() before - switching to the new ASID. Saying Y here selects ARM64_PAN if - ARM64_SW_TTBR0_PAN is selected. This is done because implementing and - maintaining the E1003 workaround in the software PAN emulation code - would be an unnecessary complication. The affected Falkor v1 CPU - implements ARMv8.1 hardware PAN support and using hardware PAN - support versus software PAN emulation is mutually exclusive at - runtime. - - If unsure, say Y. + and BADDR are changed together in TTBRx_EL1. Since we keep the ASID + in TTBR1_EL1, this situation only occurs in the entry trampoline and + then only for entries in the walk cache, since the leaf translation + is unchanged. Work around the erratum by invalidating the walk cache + entries for the trampoline before entering the kernel proper. config QCOM_FALKOR_ERRATUM_1009 bool "Falkor E1009: Prematurely complete a DSB after a TLBI" diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 39e3873b8d5a..ce56592b5f70 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -989,6 +989,18 @@ __ni_sys_trace: sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) bic \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 +alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003 + /* ASID already in \tmp[63:48] */ + movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12) + movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12) + /* 2MB boundary containing the vectors, so we nobble the walk cache */ + movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12) + isb + tlbi vae1, \tmp + dsb nsh +alternative_else_nop_endif +#endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */ .endm .macro tramp_unmap_kernel, tmp From 18011eac28c7cb31c87b86b7d0e5b01894405c7f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:33:28 +0000 Subject: [PATCH 015/104] arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks When unmapping the kernel at EL0, we use tpidrro_el0 as a scratch register during exception entry from native tasks and subsequently zero it in the kernel_ventry macro. We can therefore avoid zeroing tpidrro_el0 in the context-switch path for native tasks using the entry trampoline. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6b7dcf4310ac..583fd8154695 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -370,16 +370,14 @@ void tls_preserve_current_state(void) static void tls_thread_switch(struct task_struct *next) { - unsigned long tpidr, tpidrro; - tls_preserve_current_state(); - tpidr = *task_user_tls(next); - tpidrro = is_compat_thread(task_thread_info(next)) ? - next->thread.tp_value : 0; + if (is_compat_thread(task_thread_info(next))) + write_sysreg(next->thread.tp_value, tpidrro_el0); + else if (!arm64_kernel_unmapped_at_el0()) + write_sysreg(0, tpidrro_el0); - write_sysreg(tpidr, tpidr_el0); - write_sysreg(tpidrro, tpidrro_el0); + write_sysreg(*task_user_tls(next), tpidr_el0); } /* Restore the UAO state depending on next's addr_limit */ From ea1e3de85e94d711f63437c04624aa0e8de5c8b3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:38:19 +0000 Subject: [PATCH 016/104] arm64: entry: Add fake CPU feature for unmapping the kernel at EL0 Allow explicit disabling of the entry trampoline on the kernel command line (kpti=off) by adding a fake CPU feature (ARM64_UNMAP_KERNEL_AT_EL0) that can be used to toggle the alternative sequences in our entry code and avoid use of the trampoline altogether if desired. This also allows us to make use of a static key in arm64_kernel_unmapped_at_el0(). Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/mmu.h | 3 ++- arch/arm64/kernel/cpufeature.c | 41 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/entry.S | 9 +++---- 4 files changed, 50 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 2ff7c5e8efab..b4537ffd1018 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -41,7 +41,8 @@ #define ARM64_WORKAROUND_CAVIUM_30115 20 #define ARM64_HAS_DCPOP 21 #define ARM64_SVE 22 +#define ARM64_UNMAP_KERNEL_AT_EL0 23 -#define ARM64_NCAPS 23 +#define ARM64_NCAPS 24 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index c07954638658..da6f12e40714 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -36,7 +36,8 @@ typedef struct { static inline bool arm64_kernel_unmapped_at_el0(void) { - return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0); + return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && + cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0); } extern void paging_init(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c5ba0097887f..9f0545dfe497 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -845,6 +845,40 @@ static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unus ID_AA64PFR0_FP_SHIFT) < 0; } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ + +static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + /* Forced on command line? */ + if (__kpti_forced) { + pr_info_once("kernel page table isolation forced %s by command line option\n", + __kpti_forced > 0 ? "ON" : "OFF"); + return __kpti_forced > 0; + } + + /* Useful for KASLR robustness */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return true; + + return false; +} + +static int __init parse_kpti(char *str) +{ + bool enabled; + int ret = strtobool(str, &enabled); + + if (ret) + return ret; + + __kpti_forced = enabled ? 1 : -1; + return 0; +} +__setup("kpti=", parse_kpti); +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -931,6 +965,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + { + .capability = ARM64_UNMAP_KERNEL_AT_EL0, + .def_scope = SCOPE_SYSTEM, + .matches = unmap_kernel_at_el0, + }, +#endif { /* FP/SIMD is not implemented */ .capability = ARM64_HAS_NO_FPSIMD, diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ce56592b5f70..5d51bdbb2131 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -74,6 +74,7 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +alternative_if ARM64_UNMAP_KERNEL_AT_EL0 .if \el == 0 .if \regsize == 64 mrs x30, tpidrro_el0 @@ -82,6 +83,7 @@ mov x30, xzr .endif .endif +alternative_else_nop_endif #endif sub sp, sp, #S_FRAME_SIZE @@ -323,10 +325,9 @@ alternative_else_nop_endif ldr lr, [sp, #S_LR] add sp, sp, #S_FRAME_SIZE // restore sp -#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 - eret -#else .if \el == 0 +alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f msr far_el1, x30 tramp_alias x30, tramp_exit_native @@ -334,10 +335,10 @@ alternative_else_nop_endif 4: tramp_alias x30, tramp_exit_compat br x30 +#endif .else eret .endif -#endif .endm .macro irq_stack_entry From 084eb77cd3a81134d02500977dc0ecc9277dc97d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:41:01 +0000 Subject: [PATCH 017/104] arm64: Kconfig: Add CONFIG_UNMAP_KERNEL_AT_EL0 Add a Kconfig entry to control use of the entry trampoline, which allows us to unmap the kernel whilst running in userspace and improve the robustness of KASLR. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fdcc7b9bb15d..3af1657fcac3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -833,6 +833,19 @@ config FORCE_MAX_ZONEORDER However for 4K, we choose a higher default value, 11 as opposed to 10, giving us 4M allocations matching the default size used by generic code. +config UNMAP_KERNEL_AT_EL0 + bool "Unmap kernel when running in userspace (aka \"KAISER\")" + default y + help + Some attacks against KASLR make use of the timing difference between + a permission fault which could arise from a page table entry that is + present in the TLB, and a translation fault which always requires a + page table walk. This option defends against these attacks by unmapping + the kernel whilst running in userspace, therefore forcing translation + faults for all of kernel space. + + If unsure, say Y. + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" depends on COMPAT From 7a4a0c1555b824e0d3dd72942481b1190abea604 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 27 Nov 2017 15:49:53 +0000 Subject: [PATCH 018/104] perf: arm_spe: Fail device probe when arm64_kernel_unmapped_at_el0() When running with the kernel unmapped whilst at EL0, the virtually-addressed SPE buffer is also unmapped, which can lead to buffer faults if userspace profiling is enabled and potentially also when writing back kernel samples unless an expensive drain operation is performed on exception return. For now, fail the SPE driver probe when arm64_kernel_unmapped_at_el0(). Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 8ce262fc2561..51b40aecb776 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -1164,6 +1164,15 @@ static int arm_spe_pmu_device_dt_probe(struct platform_device *pdev) struct arm_spe_pmu *spe_pmu; struct device *dev = &pdev->dev; + /* + * If kernelspace is unmapped when running at EL0, then the SPE + * buffer will fault and prematurely terminate the AUX session. + */ + if (arm64_kernel_unmapped_at_el0()) { + dev_warn_once(dev, "profiling buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n"); + return -EPERM; + } + spe_pmu = devm_kzalloc(dev, sizeof(*spe_pmu), GFP_KERNEL); if (!spe_pmu) { dev_err(dev, "failed to allocate spe_pmu\n"); From b519538dfefc2f8478a1bcb458459c861d431784 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 1 Dec 2017 17:33:48 +0000 Subject: [PATCH 019/104] arm64: mm: Introduce TTBR_ASID_MASK for getting at the ASID in the TTBR There are now a handful of open-coded masks to extract the ASID from a TTBR value, so introduce a TTBR_ASID_MASK and use that instead. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/asm-uaccess.h | 3 ++- arch/arm64/include/asm/mmu.h | 1 + arch/arm64/include/asm/uaccess.h | 4 ++-- arch/arm64/kernel/entry.S | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 21b8cf304028..f4f234b6155e 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -17,7 +18,7 @@ msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE - bic \tmp1, \tmp1, #(0xffff << 48) + bic \tmp1, \tmp1, #TTBR_ASID_MASK msr ttbr1_el1, \tmp1 // set reserved ASID isb .endm diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index da6f12e40714..6f7bdb89817f 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -18,6 +18,7 @@ #define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ #define USER_ASID_FLAG (UL(1) << 48) +#define TTBR_ASID_MASK (UL(0xffff) << 48) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 750a3b76a01c..6eadf55ebaf0 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -112,7 +112,7 @@ static inline void __uaccess_ttbr0_disable(void) write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1); isb(); /* Set reserved ASID */ - ttbr &= ~(0xffffUL << 48); + ttbr &= ~TTBR_ASID_MASK; write_sysreg(ttbr, ttbr1_el1); isb(); } @@ -131,7 +131,7 @@ static inline void __uaccess_ttbr0_enable(void) /* Restore active ASID */ ttbr1 = read_sysreg(ttbr1_el1); - ttbr1 |= ttbr0 & (0xffffUL << 48); + ttbr1 |= ttbr0 & TTBR_ASID_MASK; write_sysreg(ttbr1, ttbr1_el1); isb(); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5d51bdbb2131..3eabcb194c87 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -205,7 +205,7 @@ alternative_else_nop_endif .if \el != 0 mrs x21, ttbr1_el1 - tst x21, #0xffff << 48 // Check for the reserved ASID + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR From 6c27c4082f4f70b9f41df4d0adf51128b40351df Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 6 Dec 2017 11:24:02 +0000 Subject: [PATCH 020/104] arm64: kaslr: Put kernel vectors address in separate data page The literal pool entry for identifying the vectors base is the only piece of information in the trampoline page that identifies the true location of the kernel. This patch moves it into a page-aligned region of the .rodata section and maps this adjacent to the trampoline text via an additional fixmap entry, which protects against any accidental leakage of the trampoline contents. Suggested-by: Ard Biesheuvel Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/fixmap.h | 1 + arch/arm64/kernel/entry.S | 14 ++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 5 ++++- arch/arm64/mm/mmu.c | 10 +++++++++- 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 8119b49be98d..ec1e6d6fa14c 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -59,6 +59,7 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + FIX_ENTRY_TRAMP_DATA, FIX_ENTRY_TRAMP_TEXT, #define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3eabcb194c87..031392ee5f47 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1030,7 +1030,13 @@ alternative_else_nop_endif msr tpidrro_el0, x30 // Restored in kernel_ventry .endif tramp_map_kernel x30 +#ifdef CONFIG_RANDOMIZE_BASE + adr x30, tramp_vectors + PAGE_SIZE +alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 + ldr x30, [x30] +#else ldr x30, =vectors +#endif prfm plil1strm, [x30, #(1b - tramp_vectors)] msr vbar_el1, x30 add x30, x30, #(1b - tramp_vectors) @@ -1073,6 +1079,14 @@ END(tramp_exit_compat) .ltorg .popsection // .entry.tramp.text +#ifdef CONFIG_RANDOMIZE_BASE + .pushsection ".rodata", "a" + .align PAGE_SHIFT + .globl __entry_tramp_data_start +__entry_tramp_data_start: + .quad vectors + .popsection // .rodata +#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 6b4260f22aab..ddfd3c0942f7 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -251,7 +251,10 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif - +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, + "Entry trampoline text too big") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fe68a48c64cb..916d9ced1c3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -541,8 +541,16 @@ static int __init map_entry_trampoline(void) __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, prot, pgd_pgtable_alloc, 0); - /* ...as well as the kernel page table */ + /* Map both the text and data into the kernel page table */ __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern char __entry_tramp_data_start[]; + + __set_fixmap(FIX_ENTRY_TRAMP_DATA, + __pa_symbol(__entry_tramp_data_start), + PAGE_KERNEL_RO); + } + return 0; } core_initcall(map_entry_trampoline); From 982aa7c5f0861bf56b2412ca341a13f44c238ba4 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:16 +0000 Subject: [PATCH 021/104] arm64: add kconfig symbol to configure physical address size ARMv8.2 introduces support for 52-bit physical addresses. To prepare for supporting this, add a new kconfig symbol to configure the physical address space size. The symbols will be used in subsequent patches. Currently the only choice is 48, a later patch will add the option of 52 once the required code is in place. Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Tested-by: Bob Picco Reviewed-by: Bob Picco Acked-by: Marc Zyngier Signed-off-by: Kristina Martsenko [catalin.marinas@arm.com: folded minor patches into this one] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 16 ++++++++++++++++ arch/arm64/include/asm/pgtable-hwdef.h | 2 +- arch/arm64/include/asm/sparsemem.h | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a93339f5178f..8dc937823eeb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -646,6 +646,22 @@ config ARM64_VA_BITS default 47 if ARM64_VA_BITS_47 default 48 if ARM64_VA_BITS_48 +choice + prompt "Physical address space size" + default ARM64_PA_BITS_48 + help + Choose the maximum physical address range that the kernel will + support. + +config ARM64_PA_BITS_48 + bool "48-bit" + +endchoice + +config ARM64_PA_BITS + int + default 48 if ARM64_PA_BITS_48 + config CPU_BIG_ENDIAN bool "Build big-endian kernel" help diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index eb0c2bd90de9..c1de9f67980b 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -196,7 +196,7 @@ /* * Highest possible physical address supported. */ -#define PHYS_MASK_SHIFT (48) +#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) #define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1) /* diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h index 74a9d301819f..b299929fe56c 100644 --- a/arch/arm64/include/asm/sparsemem.h +++ b/arch/arm64/include/asm/sparsemem.h @@ -17,7 +17,7 @@ #define __ASM_SPARSEMEM_H #ifdef CONFIG_SPARSEMEM -#define MAX_PHYSMEM_BITS 48 +#define MAX_PHYSMEM_BITS CONFIG_ARM64_PA_BITS #define SECTION_SIZE_BITS 30 #endif From 787fd1d019b269af7912249231dfe34a5fe3e7c8 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:17 +0000 Subject: [PATCH 022/104] arm64: limit PA size to supported range We currently copy the physical address size from ID_AA64MMFR0_EL1.PARange directly into TCR.(I)PS. This will not work for 4k and 16k granule kernels on systems that support 52-bit physical addresses, since 52-bit addresses are only permitted with the 64k granule. To fix this, fall back to 48 bits when configuring the PA size when the kernel does not support 52-bit PAs. When it does, fall back to 52, to avoid similar problems in the future if the PA size is ever increased above 52. Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Reviewed-by: Marc Zyngier Tested-by: Bob Picco Reviewed-by: Bob Picco Signed-off-by: Kristina Martsenko [catalin.marinas@arm.com: tcr_set_pa_size macro renamed to tcr_compute_pa_size] [catalin.marinas@arm.com: comments added to tcr_compute_pa_size] [catalin.marinas@arm.com: definitions added for TCR_*PS_SHIFT] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 18 ++++++++++++++++++ arch/arm64/include/asm/pgtable-hwdef.h | 2 ++ arch/arm64/include/asm/sysreg.h | 8 ++++++++ arch/arm64/kvm/hyp-init.S | 6 ++---- arch/arm64/kvm/hyp/s2-setup.c | 2 ++ arch/arm64/mm/proc.S | 6 ++---- 6 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aef72d886677..04a92307e6c1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -350,6 +350,24 @@ alternative_endif #endif .endm +/* + * tcr_compute_pa_size - set TCR.(I)PS to the highest supported + * ID_AA64MMFR0_EL1.PARange value + * + * tcr: register with the TCR_ELx value to be updated + * pos: PARange bitfield position + * tmp{0,1}: temporary registers + */ + .macro tcr_compute_pa_size, tcr, pos, tmp0, tmp1 + mrs \tmp0, ID_AA64MMFR0_EL1 + // Narrow PARange to fit the PS field in TCR_ELx + ubfx \tmp0, \tmp0, #ID_AA64MMFR0_PARANGE_SHIFT, #3 + mov \tmp1, #ID_AA64MMFR0_PARANGE_MAX + cmp \tmp0, \tmp1 + csel \tmp0, \tmp1, \tmp0, hi + bfi \tcr, \tmp0, \pos, #3 + .endm + /* * Macro to perform a data cache maintenance for the interval * [kaddr, kaddr + size) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index c1de9f67980b..9be2e9371c52 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -272,6 +272,8 @@ #define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) #define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) +#define TCR_IPS_SHIFT 32 +#define TCR_IPS_MASK (UL(7) << TCR_IPS_SHIFT) #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 08cc88574659..ec144f480b39 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -471,6 +471,14 @@ #define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0 #define ID_AA64MMFR0_TGRAN16_NI 0x0 #define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1 +#define ID_AA64MMFR0_PARANGE_48 0x5 +#define ID_AA64MMFR0_PARANGE_52 0x6 + +#ifdef CONFIG_ARM64_PA_BITS_52 +#define ID_AA64MMFR0_PARANGE_MAX ID_AA64MMFR0_PARANGE_52 +#else +#define ID_AA64MMFR0_PARANGE_MAX ID_AA64MMFR0_PARANGE_48 +#endif /* id_aa64mmfr1 */ #define ID_AA64MMFR1_PAN_SHIFT 20 diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 3f9615582377..e2d1fe03662a 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -90,11 +90,9 @@ __do_hyp_init: bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH #endif /* - * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in - * TCR_EL2. + * Set the PS bits in TCR_EL2. */ - mrs x5, ID_AA64MMFR0_EL1 - bfi x4, x5, #16, #3 + tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6 msr tcr_el2, x4 diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index a81f5e10fc8c..603e1ee83e89 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -32,6 +32,8 @@ u32 __hyp_text __init_stage2_translation(void) * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2... */ parange = read_sysreg(id_aa64mmfr0_el1) & 7; + if (parange > ID_AA64MMFR0_PARANGE_MAX) + parange = ID_AA64MMFR0_PARANGE_MAX; val |= parange << 16; /* Compute the actual PARange... */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 95233dfc4c39..4f133cb340dc 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -228,11 +228,9 @@ ENTRY(__cpu_setup) tcr_set_idmap_t0sz x10, x9 /* - * Read the PARange bits from ID_AA64MMFR0_EL1 and set the IPS bits in - * TCR_EL1. + * Set the IPS bits in TCR_EL1. */ - mrs x9, ID_AA64MMFR0_EL1 - bfi x10, x9, #32, #3 + tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6 #ifdef CONFIG_ARM64_HW_AFDBM /* * Hardware update of the Access and Dirty bits. From 529c4b05a3cb2f324aac347042ee6d641478e946 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:18 +0000 Subject: [PATCH 023/104] arm64: handle 52-bit addresses in TTBR The top 4 bits of a 52-bit physical address are positioned at bits 2..5 in the TTBR registers. Introduce a couple of macros to move the bits there, and change all TTBR writers to use them. Leave TTBR0 PAN code unchanged, to avoid complicating it. A system with 52-bit PA will have PAN anyway (because it's ARMv8.1 or later), and a system without 52-bit PA can only use up to 48-bit PAs. A later patch in this series will add a kconfig dependency to ensure PAN is configured. In addition, when using 52-bit PA there is a special alignment requirement on the top-level table. We don't currently have any VA_BITS configuration that would violate the requirement, but one could be added in the future, so add a compile-time BUG_ON to check for it. Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Reviewed-by: Marc Zyngier Tested-by: Bob Picco Reviewed-by: Bob Picco Signed-off-by: Kristina Martsenko [catalin.marinas@arm.com: added TTBR_BADD_MASK_52 comment] Signed-off-by: Catalin Marinas --- arch/arm/include/asm/kvm_mmu.h | 2 ++ arch/arm64/include/asm/assembler.h | 16 ++++++++++++++++ arch/arm64/include/asm/kvm_mmu.h | 2 ++ arch/arm64/include/asm/mmu_context.h | 2 +- arch/arm64/include/asm/pgtable-hwdef.h | 13 +++++++++++++ arch/arm64/include/asm/pgtable.h | 6 ++++++ arch/arm64/kernel/head.S | 6 ++++-- arch/arm64/kernel/hibernate-asm.S | 12 +++++++----- arch/arm64/kernel/hibernate.c | 2 +- arch/arm64/kvm/hyp-init.S | 3 ++- arch/arm64/mm/pgd.c | 8 ++++++++ arch/arm64/mm/proc.S | 13 ++++++++----- virt/kvm/arm/arm.c | 2 +- 13 files changed, 71 insertions(+), 16 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index fa6f2174276b..8dbec683638b 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -221,6 +221,8 @@ static inline unsigned int kvm_get_vmid_bits(void) return 8; } +#define kvm_phys_to_vttbr(addr) (addr) + #endif /* !__ASSEMBLY__ */ #endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 04a92307e6c1..49ea3def4bd1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -530,4 +530,20 @@ alternative_else_nop_endif #endif .endm +/* + * Arrange a physical address in a TTBR register, taking care of 52-bit + * addresses. + * + * phys: physical address, preserved + * ttbr: returns the TTBR value + */ + .macro phys_to_ttbr, phys, ttbr +#ifdef CONFIG_ARM64_PA_BITS_52 + orr \ttbr, \phys, \phys, lsr #46 + and \ttbr, \ttbr, #TTBR_BADDR_MASK_52 +#else + mov \ttbr, \phys +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 672c8684d5c2..747bfff92948 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -309,5 +309,7 @@ static inline unsigned int kvm_get_vmid_bits(void) return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; } +#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 9d155fa9a507..accc2ff32a0e 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -51,7 +51,7 @@ static inline void contextidr_thread_switch(struct task_struct *next) */ static inline void cpu_set_reserved_ttbr0(void) { - unsigned long ttbr = __pa_symbol(empty_zero_page); + unsigned long ttbr = phys_to_ttbr(__pa_symbol(empty_zero_page)); write_sysreg(ttbr, ttbr0_el1); isb(); diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9be2e9371c52..f92be11a209a 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -16,6 +16,8 @@ #ifndef __ASM_PGTABLE_HWDEF_H #define __ASM_PGTABLE_HWDEF_H +#include + /* * Number of page-table levels required to address 'va_bits' wide * address, without section mapping. We resolve the top (va_bits - PAGE_SHIFT) @@ -279,4 +281,15 @@ #define TCR_HA (UL(1) << 39) #define TCR_HD (UL(1) << 40) +/* + * TTBR. + */ +#ifdef CONFIG_ARM64_PA_BITS_52 +/* + * This should be GENMASK_ULL(47, 2). + * TTBR_ELx[1] is RES0 in this configuration. + */ +#define TTBR_BADDR_MASK_52 (((UL(1) << 46) - 1) << 2) +#endif + #endif diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 149d05fb9421..93677b9db947 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -733,6 +733,12 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #define kc_vaddr_to_offset(v) ((v) & ~VA_START) #define kc_offset_to_vaddr(o) ((o) | VA_START) +#ifdef CONFIG_ARM64_PA_BITS_52 +#define phys_to_ttbr(addr) (((addr) | ((addr) >> 46)) & TTBR_BADDR_MASK_52) +#else +#define phys_to_ttbr(addr) (addr) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 67e86a0f57ac..0addea3760a6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -679,8 +679,10 @@ ENTRY(__enable_mmu) update_early_cpu_boot_status 0, x1, x2 adrp x1, idmap_pg_dir adrp x2, swapper_pg_dir - msr ttbr0_el1, x1 // load TTBR0 - msr ttbr1_el1, x2 // load TTBR1 + phys_to_ttbr x1, x3 + phys_to_ttbr x2, x4 + msr ttbr0_el1, x3 // load TTBR0 + msr ttbr1_el1, x4 // load TTBR1 isb msr sctlr_el1, x0 isb diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index e56d848b6466..84f5d52fddda 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -33,12 +33,14 @@ * Even switching to our copied tables will cause a changed output address at * each stage of the walk. */ -.macro break_before_make_ttbr_switch zero_page, page_table - msr ttbr1_el1, \zero_page +.macro break_before_make_ttbr_switch zero_page, page_table, tmp + phys_to_ttbr \zero_page, \tmp + msr ttbr1_el1, \tmp isb tlbi vmalle1 dsb nsh - msr ttbr1_el1, \page_table + phys_to_ttbr \page_table, \tmp + msr ttbr1_el1, \tmp isb .endm @@ -78,7 +80,7 @@ ENTRY(swsusp_arch_suspend_exit) * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page */ - break_before_make_ttbr_switch x5, x0 + break_before_make_ttbr_switch x5, x0, x6 mov x21, x1 mov x30, x2 @@ -109,7 +111,7 @@ ENTRY(swsusp_arch_suspend_exit) dsb ish /* wait for PoU cleaning to finish */ /* switch to the restored kernels page tables */ - break_before_make_ttbr_switch x25, x21 + break_before_make_ttbr_switch x25, x21, x6 ic ialluis dsb ish diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 3009b8b80f08..efbf6dbd93c8 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -264,7 +264,7 @@ static int create_safe_exec_page(void *src_start, size_t length, */ cpu_set_reserved_ttbr0(); local_flush_tlb_all(); - write_sysreg(virt_to_phys(pgd), ttbr0_el1); + write_sysreg(phys_to_ttbr(virt_to_phys(pgd)), ttbr0_el1); isb(); *phys_dst_addr = virt_to_phys((void *)dst); diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index e2d1fe03662a..f9681cc00973 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -63,7 +63,8 @@ __do_hyp_init: cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc - msr ttbr0_el2, x0 + phys_to_ttbr x0, x4 + msr ttbr0_el2, x4 mrs x4, tcr_el1 ldr x5, =TCR_EL2_MASK diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 051e71ec3335..289f9113a27a 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -49,6 +49,14 @@ void __init pgd_cache_init(void) if (PGD_SIZE == PAGE_SIZE) return; +#ifdef CONFIG_ARM64_PA_BITS_52 + /* + * With 52-bit physical addresses, the architecture requires the + * top-level table to be aligned to at least 64 bytes. + */ + BUILD_BUG_ON(PGD_SIZE < 64); +#endif + /* * Naturally aligned pgds required by the architecture. */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 4f133cb340dc..e79db5a7576a 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -138,10 +138,11 @@ ENDPROC(cpu_do_resume) * - pgd_phys - physical address of new TTB */ ENTRY(cpu_do_switch_mm) - pre_ttbr0_update_workaround x0, x2, x3 + phys_to_ttbr x0, x2 + pre_ttbr0_update_workaround x2, x3, x4 mmid x1, x1 // get mm->context.id - bfi x0, x1, #48, #16 // set the ASID - msr ttbr0_el1, x0 // set TTBR0 + bfi x2, x1, #48, #16 // set the ASID + msr ttbr0_el1, x2 // set TTBR0 isb post_ttbr0_update_workaround ret @@ -158,14 +159,16 @@ ENTRY(idmap_cpu_replace_ttbr1) save_and_disable_daif flags=x2 adrp x1, empty_zero_page - msr ttbr1_el1, x1 + phys_to_ttbr x1, x3 + msr ttbr1_el1, x3 isb tlbi vmalle1 dsb nsh isb - msr ttbr1_el1, x0 + phys_to_ttbr x0, x3 + msr ttbr1_el1, x3 isb restore_daif x2 diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 6b60c98a6e22..c8d49879307f 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -509,7 +509,7 @@ static void update_vttbr(struct kvm *kvm) pgd_phys = virt_to_phys(kvm->arch.pgd); BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); - kvm->arch.vttbr = pgd_phys | vmid; + kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid; spin_unlock(&kvm_vmid_lock); } From e6d588a8e3da24ea51321279a064b97feb502ef0 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:19 +0000 Subject: [PATCH 024/104] arm64: head.S: handle 52-bit PAs in PTEs in early page table setup The top 4 bits of a 52-bit physical address are positioned at bits 12..15 in page table entries. Introduce a macro to move the bits there, and change the early ID map and swapper table setup code to use it. Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Reviewed-by: Marc Zyngier Tested-by: Bob Picco Reviewed-by: Bob Picco Signed-off-by: Kristina Martsenko [catalin.marinas@arm.com: additional comments for clarification] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 6 ++++ arch/arm64/kernel/head.S | 40 ++++++++++++++++++++------ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index f92be11a209a..5513ccd687f4 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -168,6 +168,12 @@ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ #define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */ +#ifdef CONFIG_ARM64_PA_BITS_52 +#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) +#define PTE_ADDR_HIGH (_AT(pteval_t, 0xf) << 12) +#define PTE_ADDR_MASK_52 (PTE_ADDR_LOW | PTE_ADDR_HIGH) +#endif + /* * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers). */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 0addea3760a6..bb06223691ba 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -147,6 +147,26 @@ preserve_boot_args: b __inval_dcache_area // tail call ENDPROC(preserve_boot_args) +/* + * Macro to arrange a physical address in a page table entry, taking care of + * 52-bit addresses. + * + * Preserves: phys + * Returns: pte + */ + .macro phys_to_pte, phys, pte +#ifdef CONFIG_ARM64_PA_BITS_52 + /* + * We assume \phys is 64K aligned and this is guaranteed by only + * supporting this configuration with 64K pages. + */ + orr \pte, \phys, \phys, lsr #36 + and \pte, \pte, #PTE_ADDR_MASK_52 +#else + mov \pte, \phys +#endif + .endm + /* * Macro to create a table entry to the next page. * @@ -160,10 +180,11 @@ ENDPROC(preserve_boot_args) * Returns: tbl -> next level table page address */ .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2 + add \tmp1, \tbl, #PAGE_SIZE + phys_to_pte \tmp1, \tmp2 + orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type lsr \tmp1, \virt, #\shift and \tmp1, \tmp1, #\ptrs - 1 // table index - add \tmp2, \tbl, #PAGE_SIZE - orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type str \tmp2, [\tbl, \tmp1, lsl #3] add \tbl, \tbl, #PAGE_SIZE // next level table page .endm @@ -190,16 +211,17 @@ ENDPROC(preserve_boot_args) * virtual range (inclusive). * * Preserves: tbl, flags - * Corrupts: phys, start, end, pstate + * Corrupts: phys, start, end, tmp, pstate */ - .macro create_block_map, tbl, flags, phys, start, end - lsr \phys, \phys, #SWAPPER_BLOCK_SHIFT + .macro create_block_map, tbl, flags, phys, start, end, tmp lsr \start, \start, #SWAPPER_BLOCK_SHIFT and \start, \start, #PTRS_PER_PTE - 1 // table index - orr \phys, \flags, \phys, lsl #SWAPPER_BLOCK_SHIFT // table entry + bic \phys, \phys, #SWAPPER_BLOCK_SIZE - 1 lsr \end, \end, #SWAPPER_BLOCK_SHIFT and \end, \end, #PTRS_PER_PTE - 1 // table end index -9999: str \phys, [\tbl, \start, lsl #3] // store the entry +9999: phys_to_pte \phys, \tmp + orr \tmp, \tmp, \flags // table entry + str \tmp, [\tbl, \start, lsl #3] // store the entry add \start, \start, #1 // next entry add \phys, \phys, #SWAPPER_BLOCK_SIZE // next block cmp \start, \end @@ -286,7 +308,7 @@ __create_page_tables: create_pgd_entry x0, x3, x5, x6 mov x5, x3 // __pa(__idmap_text_start) adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - create_block_map x0, x7, x3, x5, x6 + create_block_map x0, x7, x3, x5, x6, x4 /* * Map the kernel image (starting with PHYS_OFFSET). @@ -299,7 +321,7 @@ __create_page_tables: adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text add x6, x6, x5 // runtime __va(_end) - create_block_map x0, x7, x3, x5, x6 + create_block_map x0, x7, x3, x5, x6, x4 /* * Since the page tables have been populated with non-cacheable From 193383043f14a398393dc18bae8380f7fe665ec3 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:20 +0000 Subject: [PATCH 025/104] arm64: don't open code page table entry creation Instead of open coding the generation of page table entries, use the macros/functions that exist for this - pfn_p*d and p*d_populate. Most code in the kernel already uses these macros, this patch tries to fix up the few places that don't. This is useful for the next patch in this series, which needs to change the page table entry logic, and it's better to have that logic in one place. The KVM extended ID map is special, since we're creating a level above CONFIG_PGTABLE_LEVELS and the required function isn't available. Leave it as is and add a comment to explain it. (The normal kernel ID map code doesn't need this change because its page tables are created in assembly (__create_page_tables)). Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Reviewed-by: Marc Zyngier Tested-by: Bob Picco Reviewed-by: Bob Picco Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_mmu.h | 5 +++++ arch/arm64/include/asm/pgtable.h | 1 + arch/arm64/kernel/hibernate.c | 3 +-- arch/arm64/mm/mmu.c | 14 +++++++++----- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 747bfff92948..9810ebf949b3 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -276,6 +276,11 @@ static inline bool __kvm_cpu_uses_extended_idmap(void) return __cpu_uses_extended_idmap(); } +/* + * Can't use pgd_populate here, because the extended idmap adds an extra level + * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended + * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4. + */ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, pgd_t *hyp_pgd, pgd_t *merged_hyp_pgd, diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 93677b9db947..5d9554fb2692 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -355,6 +355,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pud_write(pud) pte_write(pud_pte(pud)) #define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT) +#define pfn_pud(pfn,prot) (__pud(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index efbf6dbd93c8..f20cf7e99249 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -247,8 +247,7 @@ static int create_safe_exec_page(void *src_start, size_t length, } pte = pte_offset_kernel(pmd, dst_addr); - set_pte(pte, __pte(virt_to_phys((void *)dst) | - pgprot_val(PAGE_KERNEL_EXEC))); + set_pte(pte, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC)); /* * Load our new page tables. A strict BBM approach requires that we diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 267d2b79d52d..0c631a17ae1d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -570,8 +570,8 @@ static void __init map_kernel(pgd_t *pgd) * entry instead. */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START), - __pud(__pa_symbol(bm_pmd) | PUD_TYPE_TABLE)); + pud_populate(&init_mm, pud_set_fixmap_offset(pgd, FIXADDR_START), + lm_alias(bm_pmd)); pud_clear_fixmap(); } else { BUG(); @@ -686,7 +686,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) if (!p) return -ENOMEM; - set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL)); + pmd_set_huge(pmd, __pa(p), __pgprot(PROT_SECT_NORMAL)); } else vmemmap_verify((pte_t *)pmd, node, addr, next); } while (addr = next, addr != end); @@ -879,15 +879,19 @@ int __init arch_ioremap_pmd_supported(void) int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) { + pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT | + pgprot_val(mk_sect_prot(prot))); BUG_ON(phys & ~PUD_MASK); - set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + set_pud(pud, pfn_pud(__phys_to_pfn(phys), sect_prot)); return 1; } int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) { + pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT | + pgprot_val(mk_sect_prot(prot))); BUG_ON(phys & ~PMD_MASK); - set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + set_pmd(pmd, pfn_pmd(__phys_to_pfn(phys), sect_prot)); return 1; } From 75387b92635e7dca410c1ef92cfe510019248f76 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:21 +0000 Subject: [PATCH 026/104] arm64: handle 52-bit physical addresses in page table entries The top 4 bits of a 52-bit physical address are positioned at bits 12..15 of a page table entry. Introduce macros to convert between a physical address and its placement in a table entry, and change all macros/functions that access PTEs to use them. Reviewed-by: Marc Zyngier Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Tested-by: Bob Picco Reviewed-by: Bob Picco Signed-off-by: Kristina Martsenko [catalin.marinas@arm.com: some long lines wrapped] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_mmu.h | 7 ++-- arch/arm64/include/asm/pgalloc.h | 6 ++-- arch/arm64/include/asm/pgtable-hwdef.h | 6 ++-- arch/arm64/include/asm/pgtable.h | 50 +++++++++++++++++++------- arch/arm64/kernel/head.S | 2 +- 5 files changed, 51 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 9810ebf949b3..b3f7b68b042d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -287,6 +287,7 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, unsigned long hyp_idmap_start) { int idmap_idx; + u64 pgd_addr; /* * Use the first entry to access the HYP mappings. It is @@ -294,7 +295,8 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, * extended idmap. */ VM_BUG_ON(pgd_val(merged_hyp_pgd[0])); - merged_hyp_pgd[0] = __pgd(__pa(hyp_pgd) | PMD_TYPE_TABLE); + pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd)); + merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE); /* * Create another extended level entry that points to the boot HYP map, @@ -304,7 +306,8 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, */ idmap_idx = hyp_idmap_start >> VA_BITS; VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx])); - merged_hyp_pgd[idmap_idx] = __pgd(__pa(boot_hyp_pgd) | PMD_TYPE_TABLE); + pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd)); + merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE); } static inline unsigned int kvm_get_vmid_bits(void) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 5ca6a573a701..e9d9f1b006ef 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -44,7 +44,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) { - set_pud(pud, __pud(pmd | prot)); + set_pud(pud, __pud(__phys_to_pud_val(pmd) | prot)); } static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) @@ -73,7 +73,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) { - set_pgd(pgdp, __pgd(pud | prot)); + set_pgd(pgdp, __pgd(__phys_to_pgd_val(pud) | prot)); } static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) @@ -129,7 +129,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte, pmdval_t prot) { - set_pmd(pmdp, __pmd(pte | prot)); + set_pmd(pmdp, __pmd(__phys_to_pmd_val(pte) | prot)); } /* diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5513ccd687f4..85069f37ae37 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -168,10 +168,12 @@ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ #define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */ -#ifdef CONFIG_ARM64_PA_BITS_52 #define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) +#ifdef CONFIG_ARM64_PA_BITS_52 #define PTE_ADDR_HIGH (_AT(pteval_t, 0xf) << 12) -#define PTE_ADDR_MASK_52 (PTE_ADDR_LOW | PTE_ADDR_HIGH) +#define PTE_ADDR_MASK (PTE_ADDR_LOW | PTE_ADDR_HIGH) +#else +#define PTE_ADDR_MASK PTE_ADDR_LOW #endif /* diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 5d9554fb2692..4fd8af303e2c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -57,9 +57,22 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) -#define pte_pfn(pte) ((pte_val(pte) & PHYS_MASK) >> PAGE_SHIFT) +/* + * Macros to convert between a physical address and its placement in a + * page table entry, taking care of 52-bit addresses. + */ +#ifdef CONFIG_ARM64_PA_BITS_52 +#define __pte_to_phys(pte) \ + ((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 36)) +#define __phys_to_pte_val(phys) (((phys) | ((phys) >> 36)) & PTE_ADDR_MASK) +#else +#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_MASK) +#define __phys_to_pte_val(phys) (phys) +#endif -#define pfn_pte(pfn,prot) (__pte(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) +#define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT) +#define pfn_pte(pfn,prot) \ + __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pte_none(pte) (!pte_val(pte)) #define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) @@ -284,6 +297,11 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) #define __HAVE_ARCH_PTE_SPECIAL +static inline pte_t pgd_pte(pgd_t pgd) +{ + return __pte(pgd_val(pgd)); +} + static inline pte_t pud_pte(pud_t pud) { return __pte(pud_val(pud)); @@ -349,16 +367,24 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) -#define pmd_pfn(pmd) (((pmd_val(pmd) & PMD_MASK) & PHYS_MASK) >> PAGE_SHIFT) -#define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) +#define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) +#define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) +#define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) +#define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) #define pud_write(pud) pte_write(pud_pte(pud)) -#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT) -#define pfn_pud(pfn,prot) (__pud(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) + +#define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) +#define __phys_to_pud_val(phys) __phys_to_pte_val(phys) +#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) +#define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) +#define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd)) +#define __phys_to_pgd_val(phys) __phys_to_pte_val(phys) + #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) @@ -409,7 +435,7 @@ static inline void pmd_clear(pmd_t *pmdp) static inline phys_addr_t pmd_page_paddr(pmd_t pmd) { - return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK; + return __pmd_to_phys(pmd); } /* Find an entry in the third-level page table. */ @@ -427,7 +453,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) #define pte_clear_fixmap() clear_fixmap(FIX_PTE) -#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) +#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(__pmd_to_phys(pmd))) /* use ONLY for statically allocated translation tables */ #define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) @@ -460,7 +486,7 @@ static inline void pud_clear(pud_t *pudp) static inline phys_addr_t pud_page_paddr(pud_t pud) { - return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK; + return __pud_to_phys(pud); } /* Find an entry in the second-level page table. */ @@ -473,7 +499,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) #define pmd_clear_fixmap() clear_fixmap(FIX_PMD) -#define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +#define pud_page(pud) pfn_to_page(__phys_to_pfn(__pud_to_phys(pud))) /* use ONLY for statically allocated translation tables */ #define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) @@ -512,7 +538,7 @@ static inline void pgd_clear(pgd_t *pgdp) static inline phys_addr_t pgd_page_paddr(pgd_t pgd) { - return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK; + return __pgd_to_phys(pgd); } /* Find an entry in the frst-level page table. */ @@ -525,7 +551,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) #define pud_clear_fixmap() clear_fixmap(FIX_PUD) -#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd))) /* use ONLY for statically allocated translation tables */ #define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index bb06223691ba..eeec0001e204 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -161,7 +161,7 @@ ENDPROC(preserve_boot_args) * supporting this configuration with 64K pages. */ orr \pte, \phys, \phys, lsr #36 - and \pte, \pte, #PTE_ADDR_MASK_52 + and \pte, \pte, #PTE_ADDR_MASK #else mov \pte, \phys #endif From fa2a8445b1d3810c52f2a6b3a006456bd1aacb7e Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:24 +0000 Subject: [PATCH 027/104] arm64: allow ID map to be extended to 52 bits Currently, when using VA_BITS < 48, if the ID map text happens to be placed in physical memory above VA_BITS, we increase the VA size (up to 48) and create a new table level, in order to map in the ID map text. This is okay because the system always supports 48 bits of VA. This patch extends the code such that if the system supports 52 bits of VA, and the ID map text is placed that high up, then we increase the VA size accordingly, up to 52. One difference from the current implementation is that so far the condition of VA_BITS < 48 has meant that the top level table is always "full", with the maximum number of entries, and an extra table level is always needed. Now, when VA_BITS = 48 (and using 64k pages), the top level table is not full, and we simply need to increase the number of entries in it, instead of creating a new table level. Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Reviewed-by: Marc Zyngier Tested-by: Bob Picco Reviewed-by: Bob Picco Signed-off-by: Kristina Martsenko [catalin.marinas@arm.com: reduce arguments to __create_hyp_mappings()] [catalin.marinas@arm.com: reworked/renamed __cpu_uses_extended_idmap_level()] Signed-off-by: Catalin Marinas --- arch/arm/include/asm/kvm_mmu.h | 5 ++ arch/arm64/include/asm/assembler.h | 2 - arch/arm64/include/asm/kvm_mmu.h | 7 ++- arch/arm64/include/asm/mmu_context.h | 10 ++++ arch/arm64/kernel/head.S | 80 ++++++++++++++++------------ arch/arm64/kvm/hyp-init.S | 17 +++--- arch/arm64/mm/mmu.c | 1 + virt/kvm/arm/mmu.c | 10 +++- 8 files changed, 85 insertions(+), 47 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 8dbec683638b..8c5643e2eea4 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -211,6 +211,11 @@ static inline bool __kvm_cpu_uses_extended_idmap(void) return false; } +static inline unsigned long __kvm_idmap_ptrs_per_pgd(void) +{ + return PTRS_PER_PGD; +} + static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, pgd_t *hyp_pgd, pgd_t *merged_hyp_pgd, diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 49ea3def4bd1..942fdb5ef0ad 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -344,10 +344,8 @@ alternative_endif * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map */ .macro tcr_set_idmap_t0sz, valreg, tmpreg -#ifndef CONFIG_ARM64_VA_BITS_48 ldr_l \tmpreg, idmap_t0sz bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH -#endif .endm /* diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b3f7b68b042d..b33bdb5eeb3d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -273,7 +273,12 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); static inline bool __kvm_cpu_uses_extended_idmap(void) { - return __cpu_uses_extended_idmap(); + return __cpu_uses_extended_idmap_level(); +} + +static inline unsigned long __kvm_idmap_ptrs_per_pgd(void) +{ + return idmap_ptrs_per_pgd; } /* diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index accc2ff32a0e..88451d47036b 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -63,6 +63,7 @@ static inline void cpu_set_reserved_ttbr0(void) * physical memory, in which case it will be smaller. */ extern u64 idmap_t0sz; +extern u64 idmap_ptrs_per_pgd; static inline bool __cpu_uses_extended_idmap(void) { @@ -70,6 +71,15 @@ static inline bool __cpu_uses_extended_idmap(void) unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS))); } +/* + * True if the extended ID map requires an extra level of translation table + * to be configured. + */ +static inline bool __cpu_uses_extended_idmap_level(void) +{ + return ARM64_HW_PGTABLE_LEVELS((64 - idmap_t0sz)) > CONFIG_PGTABLE_LEVELS; +} + /* * Set TCR.T0SZ to its default value (based on VA_BITS) */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index eeec0001e204..66f01869e97c 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -176,7 +176,7 @@ ENDPROC(preserve_boot_args) * ptrs: #imm pointers per table page * * Preserves: virt - * Corrupts: tmp1, tmp2 + * Corrupts: ptrs, tmp1, tmp2 * Returns: tbl -> next level table page address */ .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2 @@ -184,7 +184,8 @@ ENDPROC(preserve_boot_args) phys_to_pte \tmp1, \tmp2 orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type lsr \tmp1, \virt, #\shift - and \tmp1, \tmp1, #\ptrs - 1 // table index + sub \ptrs, \ptrs, #1 + and \tmp1, \tmp1, \ptrs // table index str \tmp2, [\tbl, \tmp1, lsl #3] add \tbl, \tbl, #PAGE_SIZE // next level table page .endm @@ -194,15 +195,17 @@ ENDPROC(preserve_boot_args) * block entry in the next level (tbl) for the given virtual address. * * Preserves: tbl, next, virt - * Corrupts: tmp1, tmp2 + * Corrupts: ptrs_per_pgd, tmp1, tmp2 */ - .macro create_pgd_entry, tbl, virt, tmp1, tmp2 - create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2 + .macro create_pgd_entry, tbl, virt, ptrs_per_pgd, tmp1, tmp2 + create_table_entry \tbl, \virt, PGDIR_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2 #if SWAPPER_PGTABLE_LEVELS > 3 - create_table_entry \tbl, \virt, PUD_SHIFT, PTRS_PER_PUD, \tmp1, \tmp2 + mov \ptrs_per_pgd, PTRS_PER_PUD + create_table_entry \tbl, \virt, PUD_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2 #endif #if SWAPPER_PGTABLE_LEVELS > 2 - create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2 + mov \ptrs_per_pgd, PTRS_PER_PTE + create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2 #endif .endm @@ -266,16 +269,34 @@ __create_page_tables: adrp x0, idmap_pg_dir adrp x3, __idmap_text_start // __pa(__idmap_text_start) -#ifndef CONFIG_ARM64_VA_BITS_48 + /* + * VA_BITS may be too small to allow for an ID mapping to be created + * that covers system RAM if that is located sufficiently high in the + * physical address space. So for the ID map, use an extended virtual + * range in that case, and configure an additional translation level + * if needed. + * + * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the + * entire ID map region can be mapped. As T0SZ == (64 - #bits used), + * this number conveniently equals the number of leading zeroes in + * the physical address of __idmap_text_end. + */ + adrp x5, __idmap_text_end + clz x5, x5 + cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? + b.ge 1f // .. then skip VA range extension + + adr_l x6, idmap_t0sz + str x5, [x6] + dmb sy + dc ivac, x6 // Invalidate potentially stale cache line + +#if (VA_BITS < 48) #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) -#define EXTRA_PTRS (1 << (48 - EXTRA_SHIFT)) +#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT)) /* - * If VA_BITS < 48, it may be too small to allow for an ID mapping to be - * created that covers system RAM if that is located sufficiently high - * in the physical address space. So for the ID map, use an extended - * virtual range in that case, by configuring an additional translation - * level. + * If VA_BITS < 48, we have to configure an additional table level. * First, we have to verify our assumption that the current value of * VA_BITS was chosen such that all translation levels are fully * utilised, and that lowering T0SZ will always result in an additional @@ -285,27 +306,19 @@ __create_page_tables: #error "Mismatch between VA_BITS and page size/number of translation levels" #endif + mov x4, EXTRA_PTRS + create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6 +#else /* - * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the - * entire ID map region can be mapped. As T0SZ == (64 - #bits used), - * this number conveniently equals the number of leading zeroes in - * the physical address of __idmap_text_end. + * If VA_BITS == 48, we don't have to configure an additional + * translation level, but the top-level table has more entries. */ - adrp x5, __idmap_text_end - clz x5, x5 - cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? - b.ge 1f // .. then skip additional level - - adr_l x6, idmap_t0sz - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line - - create_table_entry x0, x3, EXTRA_SHIFT, EXTRA_PTRS, x5, x6 -1: + mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT) + str_l x4, idmap_ptrs_per_pgd, x5 #endif - - create_pgd_entry x0, x3, x5, x6 +1: + ldr_l x4, idmap_ptrs_per_pgd + create_pgd_entry x0, x3, x4, x5, x6 mov x5, x3 // __pa(__idmap_text_start) adr_l x6, __idmap_text_end // __pa(__idmap_text_end) create_block_map x0, x7, x3, x5, x6, x4 @@ -316,7 +329,8 @@ __create_page_tables: adrp x0, swapper_pg_dir mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) add x5, x5, x23 // add KASLR displacement - create_pgd_entry x0, x5, x3, x6 + mov x4, PTRS_PER_PGD + create_pgd_entry x0, x5, x4, x3, x6 adrp x6, _end // runtime __pa(_end) adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index f9681cc00973..33c40b3eea01 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -72,24 +72,23 @@ __do_hyp_init: mov x5, #TCR_EL2_RES1 orr x4, x4, x5 -#ifndef CONFIG_ARM64_VA_BITS_48 /* - * If we are running with VA_BITS < 48, we may be running with an extra - * level of translation in the ID map. This is only the case if system - * RAM is out of range for the currently configured page size and number - * of translation levels, in which case we will also need the extra - * level for the HYP ID map, or we won't be able to enable the EL2 MMU. + * The ID map may be configured to use an extended virtual address + * range. This is only the case if system RAM is out of range for the + * currently configured page size and VA_BITS, in which case we will + * also need the extended virtual range for the HYP ID map, or we won't + * be able to enable the EL2 MMU. * * However, at EL2, there is only one TTBR register, and we can't switch * between translation tables *and* update TCR_EL2.T0SZ at the same - * time. Bottom line: we need the extra level in *both* our translation - * tables. + * time. Bottom line: we need to use the extended range with *both* our + * translation tables. * * So use the same T0SZ value we use for the ID map. */ ldr_l x5, idmap_t0sz bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH -#endif + /* * Set the PS bits in TCR_EL2. */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0c631a17ae1d..baa34418c3bf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -50,6 +50,7 @@ #define NO_CONT_MAPPINGS BIT(1) u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index b36945d49986..761787befd3b 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -629,14 +629,20 @@ static int __create_hyp_mappings(pgd_t *pgdp, { pgd_t *pgd; pud_t *pud; - unsigned long addr, next; + unsigned long addr, next, ptrs_per_pgd = PTRS_PER_PGD; int err = 0; + /* + * If it's not the hyp_pgd, fall back to the kvm idmap layout. + */ + if (pgdp != hyp_pgd) + ptrs_per_pgd = __kvm_idmap_ptrs_per_pgd(); + mutex_lock(&kvm_hyp_pgd_mutex); addr = start & PAGE_MASK; end = PAGE_ALIGN(end); do { - pgd = pgdp + pgd_index(addr); + pgd = pgdp + ((addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1)); if (pgd_none(*pgd)) { pud = pud_alloc_one(NULL, addr); From f77d281713d4188973bb34ecb10e51ae39ce6946 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 13 Dec 2017 17:07:25 +0000 Subject: [PATCH 028/104] arm64: enable 52-bit physical address support Now that 52-bit physical address support is in place, add the kconfig symbol to enable it. As described in ARMv8.2, the larger addresses are only supported with the 64k granule. Also ensure that PAN is configured (or TTBR0 PAN is not), as explained in an earlier patch in this series. Tested-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Acked-by: Marc Zyngier Tested-by: Bob Picco Reviewed-by: Bob Picco Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8dc937823eeb..337a696c9b02 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -656,11 +656,24 @@ choice config ARM64_PA_BITS_48 bool "48-bit" +config ARM64_PA_BITS_52 + bool "52-bit (ARMv8.2)" + depends on ARM64_64K_PAGES + depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN + help + Enable support for a 52-bit physical address space, introduced as + part of the ARMv8.2-LPA extension. + + With this enabled, the kernel will also continue to work on CPUs that + do not support ARMv8.2-LPA, but with some added memory overhead (and + minor performance overhead). + endchoice config ARM64_PA_BITS int default 48 if ARM64_PA_BITS_48 + default 52 if ARM64_PA_BITS_52 config CPU_BIG_ENDIAN bool "Build big-endian kernel" From db50a74d8193944dd1ee488fd2a813a364fbbaa7 Mon Sep 17 00:00:00 2001 From: Prashanth Prakash Date: Wed, 15 Nov 2017 10:11:49 -0700 Subject: [PATCH 029/104] cpuidle: Add new macro to enter a retention idle state If a CPU is entering a low power idle state where it doesn't lose any context, then there is no need to call cpu_pm_enter()/cpu_pm_exit(). Add a new macro(CPU_PM_CPU_IDLE_ENTER_RETENTION) to be used by cpuidle drivers when they are entering retention state. By not calling cpu_pm_enter and cpu_pm_exit we reduce the latency involved in entering and exiting the retention idle states. CPU_PM_CPU_IDLE_ENTER_RETENTION assumes that no state is lost and hence CPU PM notifiers will not be called. We may need a broader change if we need to support partial retention states effeciently. On ARM64 based Qualcomm Server Platform we measured below overhead for for calling cpu_pm_enter and cpu_pm_exit for retention states. workload: stress --hdd #CPUs --hdd-bytes 32M -t 30 Average overhead of cpu_pm_enter - 1.2us Average overhead of cpu_pm_exit - 3.1us Acked-by: Rafael J. Wysocki Acked-by: Sudeep Holla Signed-off-by: Prashanth Prakash Signed-off-by: Catalin Marinas --- include/linux/cpuidle.h | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 8f7788d23b57..871f9e21810c 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -257,22 +257,30 @@ static inline int cpuidle_register_governor(struct cpuidle_governor *gov) {return 0;} #endif -#define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx) \ -({ \ - int __ret; \ - \ - if (!idx) { \ - cpu_do_idle(); \ - return idx; \ - } \ - \ - __ret = cpu_pm_enter(); \ - if (!__ret) { \ - __ret = low_level_idle_enter(idx); \ - cpu_pm_exit(); \ - } \ - \ - __ret ? -1 : idx; \ +#define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, is_retention) \ +({ \ + int __ret = 0; \ + \ + if (!idx) { \ + cpu_do_idle(); \ + return idx; \ + } \ + \ + if (!is_retention) \ + __ret = cpu_pm_enter(); \ + if (!__ret) { \ + __ret = low_level_idle_enter(idx); \ + if (!is_retention) \ + cpu_pm_exit(); \ + } \ + \ + __ret ? -1 : idx; \ }) +#define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx) \ + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, 0) + +#define CPU_PM_CPU_IDLE_ENTER_RETENTION(low_level_idle_enter, idx) \ + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, 1) + #endif /* _LINUX_CPUIDLE_H */ From 8b9951ed7e5b517bdd5743d71ac662885d3c7bfc Mon Sep 17 00:00:00 2001 From: Prashanth Prakash Date: Wed, 15 Nov 2017 10:11:50 -0700 Subject: [PATCH 030/104] ARM64 / cpuidle: Use new cpuidle macro for entering retention state CPU_PM_CPU_IDLE_ENTER_RETENTION skips calling cpu_pm_enter() and cpu_pm_exit(). By not calling cpu_pm functions in idle entry/exit paths we can reduce the latency involved in entering and exiting the low power idle state. On ARM64 based Qualcomm server platform we measured below overhead for calling cpu_pm_enter and cpu_pm_exit for retention states. workload: stress --hdd #CPUs --hdd-bytes 32M -t 30 Average overhead of cpu_pm_enter - 1.2us Average overhead of cpu_pm_exit - 3.1us Acked-by: Sudeep Holla Signed-off-by: Prashanth Prakash Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpuidle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index fd691087dc9a..f2d13810daa8 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -47,6 +47,8 @@ int arm_cpuidle_suspend(int index) #include +#define ARM64_LPI_IS_RETENTION_STATE(arch_flags) (!(arch_flags)) + int acpi_processor_ffh_lpi_probe(unsigned int cpu) { return arm_cpuidle_init(cpu); @@ -54,6 +56,10 @@ int acpi_processor_ffh_lpi_probe(unsigned int cpu) int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) { - return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index); + if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags)) + return CPU_PM_CPU_IDLE_ENTER_RETENTION(arm_cpuidle_suspend, + lpi->index); + else + return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index); } #endif From f5ed22e2647cc89dd0a57936b38a582546527509 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 7 Nov 2017 11:24:04 +0900 Subject: [PATCH 031/104] arm64: make label allocation style consistent in tishift This is entirely cosmetic, but somehow it was missed when sending differing versions of this patch. This just makes the file a bit more uniform. Signed-off-by: Jason A. Donenfeld Signed-off-by: Catalin Marinas --- arch/arm64/lib/tishift.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/lib/tishift.S b/arch/arm64/lib/tishift.S index 0179a43cc045..d3db9b2cd479 100644 --- a/arch/arm64/lib/tishift.S +++ b/arch/arm64/lib/tishift.S @@ -38,19 +38,19 @@ ENTRY(__ashlti3) ENDPROC(__ashlti3) ENTRY(__ashrti3) - cbz x2, 3f + cbz x2, 1f mov x3, #64 sub x3, x3, x2 cmp x3, #0 - b.le 4f + b.le 2f lsr x0, x0, x2 lsl x3, x1, x3 asr x2, x1, x2 orr x0, x0, x3 mov x1, x2 -3: +1: ret -4: +2: neg w0, w3 asr x2, x1, #63 asr x0, x1, x0 From 82975c46da8275a2a212cd94049bbef9bb961da2 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:26 +0000 Subject: [PATCH 032/104] perf: Export perf_event_update_userpage Export perf_event_update_userpage() so that PMU driver using them, can be built as modules. Acked-by: Peter Zilstra Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b695bf0d..64ec2c9d2c44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4904,6 +4904,7 @@ void perf_event_update_userpage(struct perf_event *event) unlock: rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(perf_event_update_userpage); static int perf_mmap_fault(struct vm_fault *vmf) { From a0e71cd9b1a0195f96500b2268759873ff8ff819 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:27 +0000 Subject: [PATCH 033/104] of: Add helper for mapping device node to logical CPU number Add a helper to map a device node to a logical CPU number to avoid duplication. Currently this is open coded in different places (e.g gic-v3, coresight). The helper tries to map device node to a "possible" logical CPU id, which may not be online yet. It is the responsibility of the user to make sure that the CPU is online. The helper uses of_cpu_device_node_get() to retrieve the device node for a given CPU (which uses per_cpu data if available else falls back to slower of_get_cpu_node()). Cc: devicetree@vger.kernel.org Cc: Frank Rowand Cc: Mark Rutland Cc: Sudeep Holla Reviewed-by: Marc Zyngier Reviewed-by: Rob Herring Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- drivers/of/base.c | 26 ++++++++++++++++++++++++++ include/linux/of.h | 7 +++++++ 2 files changed, 33 insertions(+) diff --git a/drivers/of/base.c b/drivers/of/base.c index 26618ba8f92a..a9d6fe86585b 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -315,6 +315,32 @@ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread) } EXPORT_SYMBOL(of_get_cpu_node); +/** + * of_cpu_node_to_id: Get the logical CPU number for a given device_node + * + * @cpu_node: Pointer to the device_node for CPU. + * + * Returns the logical CPU number of the given CPU device_node. + * Returns -ENODEV if the CPU is not found. + */ +int of_cpu_node_to_id(struct device_node *cpu_node) +{ + int cpu; + bool found = false; + struct device_node *np; + + for_each_possible_cpu(cpu) { + np = of_cpu_device_node_get(cpu); + found = (cpu_node == np); + of_node_put(np); + if (found) + return cpu; + } + + return -ENODEV; +} +EXPORT_SYMBOL(of_cpu_node_to_id); + /** * __of_device_is_compatible() - Check if the node matches given constraints * @device: pointer to node diff --git a/include/linux/of.h b/include/linux/of.h index d3dea1d1e3a9..173102dafb07 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -544,6 +544,8 @@ const char *of_prop_next_string(struct property *prop, const char *cur); bool of_console_check(struct device_node *dn, char *name, int index); +extern int of_cpu_node_to_id(struct device_node *np); + #else /* CONFIG_OF */ static inline void of_core_init(void) @@ -916,6 +918,11 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag { } +static inline int of_cpu_node_to_id(struct device_node *np) +{ + return -ENODEV; +} + #define of_match_ptr(_ptr) NULL #define of_match_node(_matches, _node) NULL #endif /* CONFIG_OF */ From 29198e3844caaaacb0e21eff634c5f1cfd7f89be Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:28 +0000 Subject: [PATCH 034/104] coresight: of: Use of_cpu_node_to_id helper Reuse the new generic helper, of_cpu_node_to_id() to map a given CPU phandle to a logical CPU number. Acked-by: Mathieu Poirier Tested-by: Leo Yan Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- drivers/hwtracing/coresight/of_coresight.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/hwtracing/coresight/of_coresight.c b/drivers/hwtracing/coresight/of_coresight.c index a18794128bf8..7c375443ede6 100644 --- a/drivers/hwtracing/coresight/of_coresight.c +++ b/drivers/hwtracing/coresight/of_coresight.c @@ -104,26 +104,17 @@ static int of_coresight_alloc_memory(struct device *dev, int of_coresight_get_cpu(const struct device_node *node) { int cpu; - bool found; - struct device_node *dn, *np; + struct device_node *dn; dn = of_parse_phandle(node, "cpu", 0); - /* Affinity defaults to CPU0 */ if (!dn) return 0; - - for_each_possible_cpu(cpu) { - np = of_cpu_device_node_get(cpu); - found = (dn == np); - of_node_put(np); - if (found) - break; - } + cpu = of_cpu_node_to_id(dn); of_node_put(dn); /* Affinity to CPU0 if no cpu nodes are found */ - return found ? cpu : 0; + return (cpu < 0) ? 0 : cpu; } EXPORT_SYMBOL_GPL(of_coresight_get_cpu); From c08ec7da75f393c0dbaba9455f4f6b2a5e709355 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:29 +0000 Subject: [PATCH 035/104] irqchip: gic-v3: Use of_cpu_node_to_id helper Use the new generic helper of_cpu_node_to_id() instead of using our own version to map a device node to logical CPU number. Acked-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- drivers/irqchip/irq-gic-v3.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index b56c3e23f0af..9a7a15049903 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1070,31 +1070,6 @@ static int __init gic_validate_dist_version(void __iomem *dist_base) return 0; } -static int get_cpu_number(struct device_node *dn) -{ - const __be32 *cell; - u64 hwid; - int cpu; - - cell = of_get_property(dn, "reg", NULL); - if (!cell) - return -1; - - hwid = of_read_number(cell, of_n_addr_cells(dn)); - - /* - * Non affinity bits must be set to 0 in the DT - */ - if (hwid & ~MPIDR_HWID_BITMASK) - return -1; - - for_each_possible_cpu(cpu) - if (cpu_logical_map(cpu) == hwid) - return cpu; - - return -1; -} - /* Create all possible partitions at boot time */ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) { @@ -1145,8 +1120,8 @@ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) if (WARN_ON(!cpu_node)) continue; - cpu = get_cpu_number(cpu_node); - if (WARN_ON(cpu == -1)) + cpu = of_cpu_node_to_id(cpu_node); + if (WARN_ON(cpu < 0)) continue; pr_cont("%pOF[%d] ", cpu_node, cpu); From 52cac1103af3283c7e4386b796da9cc92c3320e0 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:30 +0000 Subject: [PATCH 036/104] arm64: Use of_cpu_node_to_id helper for CPU topology parsing Make use of the new generic helper to convert an of_node of a CPU to the logical CPU id in parsing the topology. Cc: Catalin Marinas Cc: Leo Yan Cc: Will Deacon Acked-by: Mark Rutland Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 8d48b233e6ce..21868530018e 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -37,18 +37,14 @@ static int __init get_cpu_for_node(struct device_node *node) if (!cpu_node) return -1; - for_each_possible_cpu(cpu) { - if (of_get_cpu_node(cpu, NULL) == cpu_node) { - topology_parse_cpu_capacity(cpu_node, cpu); - of_node_put(cpu_node); - return cpu; - } - } - - pr_crit("Unable to find CPU node for %pOF\n", cpu_node); + cpu = of_cpu_node_to_id(cpu_node); + if (cpu >= 0) + topology_parse_cpu_capacity(cpu_node, cpu); + else + pr_crit("Unable to find CPU node for %pOF\n", cpu_node); of_node_put(cpu_node); - return -1; + return cpu; } static int __init parse_core(struct device_node *core, int cluster_id, From 66582787314ed0bd1dcac3f7a2b98ff71f3fb653 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:31 +0000 Subject: [PATCH 037/104] arm_pmu: Use of_cpu_node_to_id helper Use the new generic helper, of_cpu_node_to_id(), to map a a phandle to the logical CPU number while parsing the PMU irq affinity. Cc: Will Deacon Acked-by: Mark Rutland Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- drivers/perf/arm_pmu_platform.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c index 91b224eced18..46501cc79fd7 100644 --- a/drivers/perf/arm_pmu_platform.c +++ b/drivers/perf/arm_pmu_platform.c @@ -82,19 +82,10 @@ static int pmu_parse_irq_affinity(struct device_node *node, int i) return -EINVAL; } - /* Now look up the logical CPU number */ - for_each_possible_cpu(cpu) { - struct device_node *cpu_dn; - - cpu_dn = of_cpu_device_node_get(cpu); - of_node_put(cpu_dn); - - if (dn == cpu_dn) - break; - } - - if (cpu >= nr_cpu_ids) { + cpu = of_cpu_node_to_id(dn); + if (cpu < 0) { pr_warn("failed to find logical CPU for %s\n", dn->name); + cpu = nr_cpu_ids; } of_node_put(dn); From 9249dee611d6624bc9044fdf3877ace67d6143ab Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:32 +0000 Subject: [PATCH 038/104] dt-bindings: Document devicetree binding for ARM DSU PMU This patch documents the devicetree bindings for ARM DSU PMU. Cc: Mark Rutland Cc: Will Deacon Cc: devicetree@vger.kernel.org Cc: frowand.list@gmail.com Acked-by: Rob Herring Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- .../devicetree/bindings/arm/arm-dsu-pmu.txt | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt diff --git a/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt b/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt new file mode 100644 index 000000000000..6efabba530f1 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt @@ -0,0 +1,27 @@ +* ARM DynamIQ Shared Unit (DSU) Performance Monitor Unit (PMU) + +ARM DyanmIQ Shared Unit (DSU) integrates one or more CPU cores +with a shared L3 memory system, control logic and external interfaces to +form a multicore cluster. The PMU enables to gather various statistics on +the operations of the DSU. The PMU provides independent 32bit counters that +can count any of the supported events, along with a 64bit cycle counter. +The PMU is accessed via CPU system registers and has no MMIO component. + +** DSU PMU required properties: + +- compatible : should be one of : + + "arm,dsu-pmu" + +- interrupts : Exactly 1 SPI must be listed. + +- cpus : List of phandles for the CPUs connected to this DSU instance. + + +** Example: + +dsu-pmu-0 { + compatible = "arm,dsu-pmu"; + interrupts = ; + cpus = <&cpu_0>, <&cpu_1>; +}; From 7520fa99246dade7ab6dde1573a146beed632abd Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:33 +0000 Subject: [PATCH 039/104] perf: ARM DynamIQ Shared Unit PMU support Add support for the Cluster PMU part of the ARM DynamIQ Shared Unit (DSU). The DSU integrates one or more cores with an L3 memory system, control logic, and external interfaces to form a multicore cluster. The PMU allows counting the various events related to L3, SCU etc, along with providing a cycle counter. The PMU can be accessed via system registers, which are common to the cores in the same cluster. The PMU registers follow the semantics of the ARMv8 PMU, mostly, with the exception that the counters record the cluster wide events. This driver is mostly based on the ARMv8 and CCI PMU drivers. The driver only supports ARM64 at the moment. It can be extended to support ARM32 by providing register accessors like we do in arch/arm64/include/arm_dsu_pmu.h. Cc: Mark Rutland Cc: Will Deacon Reviewed-by: Jonathan Cameron Reviewed-by: Mark Rutland Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- Documentation/perf/arm_dsu_pmu.txt | 28 + arch/arm64/include/asm/arm_dsu_pmu.h | 129 ++++ drivers/perf/Kconfig | 9 + drivers/perf/Makefile | 1 + drivers/perf/arm_dsu_pmu.c | 843 +++++++++++++++++++++++++++ 5 files changed, 1010 insertions(+) create mode 100644 Documentation/perf/arm_dsu_pmu.txt create mode 100644 arch/arm64/include/asm/arm_dsu_pmu.h create mode 100644 drivers/perf/arm_dsu_pmu.c diff --git a/Documentation/perf/arm_dsu_pmu.txt b/Documentation/perf/arm_dsu_pmu.txt new file mode 100644 index 000000000000..d611e15f5add --- /dev/null +++ b/Documentation/perf/arm_dsu_pmu.txt @@ -0,0 +1,28 @@ +ARM DynamIQ Shared Unit (DSU) PMU +================================== + +ARM DynamIQ Shared Unit integrates one or more cores with an L3 memory system, +control logic and external interfaces to form a multicore cluster. The PMU +allows counting the various events related to the L3 cache, Snoop Control Unit +etc, using 32bit independent counters. It also provides a 64bit cycle counter. + +The PMU can only be accessed via CPU system registers and are common to the +cores connected to the same DSU. Like most of the other uncore PMUs, DSU +PMU doesn't support process specific events and cannot be used in sampling mode. + +The DSU provides a bitmap for a subset of implemented events via hardware +registers. There is no way for the driver to determine if the other events +are available or not. Hence the driver exposes only those events advertised +by the DSU, in "events" directory under : + + /sys/bus/event_sources/devices/arm_dsu_/ + +The user should refer to the TRM of the product to figure out the supported events +and use the raw event code for the unlisted events. + +The driver also exposes the CPUs connected to the DSU instance in "associated_cpus". + + +e.g usage : + + perf stat -a -e arm_dsu_0/cycles/ diff --git a/arch/arm64/include/asm/arm_dsu_pmu.h b/arch/arm64/include/asm/arm_dsu_pmu.h new file mode 100644 index 000000000000..82e5cc3356bf --- /dev/null +++ b/arch/arm64/include/asm/arm_dsu_pmu.h @@ -0,0 +1,129 @@ +/* + * ARM DynamIQ Shared Unit (DSU) PMU Low level register access routines. + * + * Copyright (C) ARM Limited, 2017. + * + * Author: Suzuki K Poulose + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2, as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + + +#define CLUSTERPMCR_EL1 sys_reg(3, 0, 15, 5, 0) +#define CLUSTERPMCNTENSET_EL1 sys_reg(3, 0, 15, 5, 1) +#define CLUSTERPMCNTENCLR_EL1 sys_reg(3, 0, 15, 5, 2) +#define CLUSTERPMOVSSET_EL1 sys_reg(3, 0, 15, 5, 3) +#define CLUSTERPMOVSCLR_EL1 sys_reg(3, 0, 15, 5, 4) +#define CLUSTERPMSELR_EL1 sys_reg(3, 0, 15, 5, 5) +#define CLUSTERPMINTENSET_EL1 sys_reg(3, 0, 15, 5, 6) +#define CLUSTERPMINTENCLR_EL1 sys_reg(3, 0, 15, 5, 7) +#define CLUSTERPMCCNTR_EL1 sys_reg(3, 0, 15, 6, 0) +#define CLUSTERPMXEVTYPER_EL1 sys_reg(3, 0, 15, 6, 1) +#define CLUSTERPMXEVCNTR_EL1 sys_reg(3, 0, 15, 6, 2) +#define CLUSTERPMMDCR_EL1 sys_reg(3, 0, 15, 6, 3) +#define CLUSTERPMCEID0_EL1 sys_reg(3, 0, 15, 6, 4) +#define CLUSTERPMCEID1_EL1 sys_reg(3, 0, 15, 6, 5) + +static inline u32 __dsu_pmu_read_pmcr(void) +{ + return read_sysreg_s(CLUSTERPMCR_EL1); +} + +static inline void __dsu_pmu_write_pmcr(u32 val) +{ + write_sysreg_s(val, CLUSTERPMCR_EL1); + isb(); +} + +static inline u32 __dsu_pmu_get_reset_overflow(void) +{ + u32 val = read_sysreg_s(CLUSTERPMOVSCLR_EL1); + /* Clear the bit */ + write_sysreg_s(val, CLUSTERPMOVSCLR_EL1); + isb(); + return val; +} + +static inline void __dsu_pmu_select_counter(int counter) +{ + write_sysreg_s(counter, CLUSTERPMSELR_EL1); + isb(); +} + +static inline u64 __dsu_pmu_read_counter(int counter) +{ + __dsu_pmu_select_counter(counter); + return read_sysreg_s(CLUSTERPMXEVCNTR_EL1); +} + +static inline void __dsu_pmu_write_counter(int counter, u64 val) +{ + __dsu_pmu_select_counter(counter); + write_sysreg_s(val, CLUSTERPMXEVCNTR_EL1); + isb(); +} + +static inline void __dsu_pmu_set_event(int counter, u32 event) +{ + __dsu_pmu_select_counter(counter); + write_sysreg_s(event, CLUSTERPMXEVTYPER_EL1); + isb(); +} + +static inline u64 __dsu_pmu_read_pmccntr(void) +{ + return read_sysreg_s(CLUSTERPMCCNTR_EL1); +} + +static inline void __dsu_pmu_write_pmccntr(u64 val) +{ + write_sysreg_s(val, CLUSTERPMCCNTR_EL1); + isb(); +} + +static inline void __dsu_pmu_disable_counter(int counter) +{ + write_sysreg_s(BIT(counter), CLUSTERPMCNTENCLR_EL1); + isb(); +} + +static inline void __dsu_pmu_enable_counter(int counter) +{ + write_sysreg_s(BIT(counter), CLUSTERPMCNTENSET_EL1); + isb(); +} + +static inline void __dsu_pmu_counter_interrupt_enable(int counter) +{ + write_sysreg_s(BIT(counter), CLUSTERPMINTENSET_EL1); + isb(); +} + +static inline void __dsu_pmu_counter_interrupt_disable(int counter) +{ + write_sysreg_s(BIT(counter), CLUSTERPMINTENCLR_EL1); + isb(); +} + + +static inline u32 __dsu_pmu_read_pmceid(int n) +{ + switch (n) { + case 0: + return read_sysreg_s(CLUSTERPMCEID0_EL1); + case 1: + return read_sysreg_s(CLUSTERPMCEID1_EL1); + default: + BUILD_BUG(); + return 0; + } +} diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index b8f44b068fc6..da5724cd89cf 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -17,6 +17,15 @@ config ARM_PMU_ACPI depends on ARM_PMU && ACPI def_bool y +config ARM_DSU_PMU + tristate "ARM DynamIQ Shared Unit (DSU) PMU" + depends on ARM64 + help + Provides support for performance monitor unit in ARM DynamIQ Shared + Unit (DSU). The DSU integrates one or more cores with an L3 memory + system, control logic. The PMU allows counting various events related + to DSU. + config HISI_PMU bool "HiSilicon SoC PMU" depends on ARM64 && ACPI diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 710a0135bd61..c2f27419bdf0 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o obj-$(CONFIG_HISI_PMU) += hisilicon/ diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c new file mode 100644 index 000000000000..37c0526c93d5 --- /dev/null +++ b/drivers/perf/arm_dsu_pmu.c @@ -0,0 +1,843 @@ +/* + * ARM DynamIQ Shared Unit (DSU) PMU driver + * + * Copyright (C) ARM Limited, 2017. + * + * Based on ARM CCI-PMU, ARMv8 PMU-v3 drivers. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#define PMUNAME "arm_dsu" +#define DRVNAME PMUNAME "_pmu" +#define pr_fmt(fmt) DRVNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* PMU event codes */ +#define DSU_PMU_EVT_CYCLES 0x11 +#define DSU_PMU_EVT_CHAIN 0x1e + +#define DSU_PMU_MAX_COMMON_EVENTS 0x40 + +#define DSU_PMU_MAX_HW_CNTRS 32 +#define DSU_PMU_HW_COUNTER_MASK (DSU_PMU_MAX_HW_CNTRS - 1) + +#define CLUSTERPMCR_E BIT(0) +#define CLUSTERPMCR_P BIT(1) +#define CLUSTERPMCR_C BIT(2) +#define CLUSTERPMCR_N_SHIFT 11 +#define CLUSTERPMCR_N_MASK 0x1f +#define CLUSTERPMCR_IDCODE_SHIFT 16 +#define CLUSTERPMCR_IDCODE_MASK 0xff +#define CLUSTERPMCR_IMP_SHIFT 24 +#define CLUSTERPMCR_IMP_MASK 0xff +#define CLUSTERPMCR_RES_MASK 0x7e8 +#define CLUSTERPMCR_RES_VAL 0x40 + +#define DSU_ACTIVE_CPU_MASK 0x0 +#define DSU_ASSOCIATED_CPU_MASK 0x1 + +/* + * We use the index of the counters as they appear in the counter + * bit maps in the PMU registers (e.g CLUSTERPMSELR). + * i.e, + * counter 0 - Bit 0 + * counter 1 - Bit 1 + * ... + * Cycle counter - Bit 31 + */ +#define DSU_PMU_IDX_CYCLE_COUNTER 31 + +/* All event counters are 32bit, with a 64bit Cycle counter */ +#define DSU_PMU_COUNTER_WIDTH(idx) \ + (((idx) == DSU_PMU_IDX_CYCLE_COUNTER) ? 64 : 32) + +#define DSU_PMU_COUNTER_MASK(idx) \ + GENMASK_ULL((DSU_PMU_COUNTER_WIDTH((idx)) - 1), 0) + +#define DSU_EXT_ATTR(_name, _func, _config) \ + (&((struct dev_ext_attribute[]) { \ + { \ + .attr = __ATTR(_name, 0444, _func, NULL), \ + .var = (void *)_config \ + } \ + })[0].attr.attr) + +#define DSU_EVENT_ATTR(_name, _config) \ + DSU_EXT_ATTR(_name, dsu_pmu_sysfs_event_show, (unsigned long)_config) + +#define DSU_FORMAT_ATTR(_name, _config) \ + DSU_EXT_ATTR(_name, dsu_pmu_sysfs_format_show, (char *)_config) + +#define DSU_CPUMASK_ATTR(_name, _config) \ + DSU_EXT_ATTR(_name, dsu_pmu_cpumask_show, (unsigned long)_config) + +struct dsu_hw_events { + DECLARE_BITMAP(used_mask, DSU_PMU_MAX_HW_CNTRS); + struct perf_event *events[DSU_PMU_MAX_HW_CNTRS]; +}; + +/* + * struct dsu_pmu - DSU PMU descriptor + * + * @pmu_lock : Protects accesses to DSU PMU register from normal vs + * interrupt handler contexts. + * @hw_events : Holds the event counter state. + * @associated_cpus : CPUs attached to the DSU. + * @active_cpu : CPU to which the PMU is bound for accesses. + * @cpuhp_node : Node for CPU hotplug notifier link. + * @num_counters : Number of event counters implemented by the PMU, + * excluding the cycle counter. + * @irq : Interrupt line for counter overflow. + * @cpmceid_bitmap : Bitmap for the availability of architected common + * events (event_code < 0x40). + */ +struct dsu_pmu { + struct pmu pmu; + struct device *dev; + raw_spinlock_t pmu_lock; + struct dsu_hw_events hw_events; + cpumask_t associated_cpus; + cpumask_t active_cpu; + struct hlist_node cpuhp_node; + u8 num_counters; + int irq; + DECLARE_BITMAP(cpmceid_bitmap, DSU_PMU_MAX_COMMON_EVENTS); +}; + +static unsigned long dsu_pmu_cpuhp_state; + +static inline struct dsu_pmu *to_dsu_pmu(struct pmu *pmu) +{ + return container_of(pmu, struct dsu_pmu, pmu); +} + +static ssize_t dsu_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *eattr = container_of(attr, + struct dev_ext_attribute, attr); + return snprintf(buf, PAGE_SIZE, "event=0x%lx\n", + (unsigned long)eattr->var); +} + +static ssize_t dsu_pmu_sysfs_format_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *eattr = container_of(attr, + struct dev_ext_attribute, attr); + return snprintf(buf, PAGE_SIZE, "%s\n", (char *)eattr->var); +} + +static ssize_t dsu_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu); + struct dev_ext_attribute *eattr = container_of(attr, + struct dev_ext_attribute, attr); + unsigned long mask_id = (unsigned long)eattr->var; + const cpumask_t *cpumask; + + switch (mask_id) { + case DSU_ACTIVE_CPU_MASK: + cpumask = &dsu_pmu->active_cpu; + break; + case DSU_ASSOCIATED_CPU_MASK: + cpumask = &dsu_pmu->associated_cpus; + break; + default: + return 0; + } + return cpumap_print_to_pagebuf(true, buf, cpumask); +} + +static struct attribute *dsu_pmu_format_attrs[] = { + DSU_FORMAT_ATTR(event, "config:0-31"), + NULL, +}; + +static const struct attribute_group dsu_pmu_format_attr_group = { + .name = "format", + .attrs = dsu_pmu_format_attrs, +}; + +static struct attribute *dsu_pmu_event_attrs[] = { + DSU_EVENT_ATTR(cycles, 0x11), + DSU_EVENT_ATTR(bus_access, 0x19), + DSU_EVENT_ATTR(memory_error, 0x1a), + DSU_EVENT_ATTR(bus_cycles, 0x1d), + DSU_EVENT_ATTR(l3d_cache_allocate, 0x29), + DSU_EVENT_ATTR(l3d_cache_refill, 0x2a), + DSU_EVENT_ATTR(l3d_cache, 0x2b), + DSU_EVENT_ATTR(l3d_cache_wb, 0x2c), + NULL, +}; + +static umode_t +dsu_pmu_event_attr_is_visible(struct kobject *kobj, struct attribute *attr, + int unused) +{ + struct pmu *pmu = dev_get_drvdata(kobj_to_dev(kobj)); + struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu); + struct dev_ext_attribute *eattr = container_of(attr, + struct dev_ext_attribute, attr.attr); + unsigned long evt = (unsigned long)eattr->var; + + return test_bit(evt, dsu_pmu->cpmceid_bitmap) ? attr->mode : 0; +} + +static const struct attribute_group dsu_pmu_events_attr_group = { + .name = "events", + .attrs = dsu_pmu_event_attrs, + .is_visible = dsu_pmu_event_attr_is_visible, +}; + +static struct attribute *dsu_pmu_cpumask_attrs[] = { + DSU_CPUMASK_ATTR(cpumask, DSU_ACTIVE_CPU_MASK), + DSU_CPUMASK_ATTR(associated_cpus, DSU_ASSOCIATED_CPU_MASK), + NULL, +}; + +static const struct attribute_group dsu_pmu_cpumask_attr_group = { + .attrs = dsu_pmu_cpumask_attrs, +}; + +static const struct attribute_group *dsu_pmu_attr_groups[] = { + &dsu_pmu_cpumask_attr_group, + &dsu_pmu_events_attr_group, + &dsu_pmu_format_attr_group, + NULL, +}; + +static int dsu_pmu_get_online_cpu_any_but(struct dsu_pmu *dsu_pmu, int cpu) +{ + struct cpumask online_supported; + + cpumask_and(&online_supported, + &dsu_pmu->associated_cpus, cpu_online_mask); + return cpumask_any_but(&online_supported, cpu); +} + +static inline bool dsu_pmu_counter_valid(struct dsu_pmu *dsu_pmu, u32 idx) +{ + return (idx < dsu_pmu->num_counters) || + (idx == DSU_PMU_IDX_CYCLE_COUNTER); +} + +static inline u64 dsu_pmu_read_counter(struct perf_event *event) +{ + u64 val; + unsigned long flags; + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + int idx = event->hw.idx; + + if (WARN_ON(!cpumask_test_cpu(smp_processor_id(), + &dsu_pmu->associated_cpus))) + return 0; + + if (!dsu_pmu_counter_valid(dsu_pmu, idx)) { + dev_err(event->pmu->dev, + "Trying reading invalid counter %d\n", idx); + return 0; + } + + raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags); + if (idx == DSU_PMU_IDX_CYCLE_COUNTER) + val = __dsu_pmu_read_pmccntr(); + else + val = __dsu_pmu_read_counter(idx); + raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags); + + return val; +} + +static void dsu_pmu_write_counter(struct perf_event *event, u64 val) +{ + unsigned long flags; + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + int idx = event->hw.idx; + + if (WARN_ON(!cpumask_test_cpu(smp_processor_id(), + &dsu_pmu->associated_cpus))) + return; + + if (!dsu_pmu_counter_valid(dsu_pmu, idx)) { + dev_err(event->pmu->dev, + "writing to invalid counter %d\n", idx); + return; + } + + raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags); + if (idx == DSU_PMU_IDX_CYCLE_COUNTER) + __dsu_pmu_write_pmccntr(val); + else + __dsu_pmu_write_counter(idx, val); + raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags); +} + +static int dsu_pmu_get_event_idx(struct dsu_hw_events *hw_events, + struct perf_event *event) +{ + int idx; + unsigned long evtype = event->attr.config; + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + unsigned long *used_mask = hw_events->used_mask; + + if (evtype == DSU_PMU_EVT_CYCLES) { + if (test_and_set_bit(DSU_PMU_IDX_CYCLE_COUNTER, used_mask)) + return -EAGAIN; + return DSU_PMU_IDX_CYCLE_COUNTER; + } + + idx = find_first_zero_bit(used_mask, dsu_pmu->num_counters); + if (idx >= dsu_pmu->num_counters) + return -EAGAIN; + set_bit(idx, hw_events->used_mask); + return idx; +} + +static void dsu_pmu_enable_counter(struct dsu_pmu *dsu_pmu, int idx) +{ + __dsu_pmu_counter_interrupt_enable(idx); + __dsu_pmu_enable_counter(idx); +} + +static void dsu_pmu_disable_counter(struct dsu_pmu *dsu_pmu, int idx) +{ + __dsu_pmu_disable_counter(idx); + __dsu_pmu_counter_interrupt_disable(idx); +} + +static inline void dsu_pmu_set_event(struct dsu_pmu *dsu_pmu, + struct perf_event *event) +{ + int idx = event->hw.idx; + unsigned long flags; + + if (!dsu_pmu_counter_valid(dsu_pmu, idx)) { + dev_err(event->pmu->dev, + "Trying to set invalid counter %d\n", idx); + return; + } + + raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags); + __dsu_pmu_set_event(idx, event->hw.config_base); + raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags); +} + +static void dsu_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 delta, prev_count, new_count; + + do { + /* We may also be called from the irq handler */ + prev_count = local64_read(&hwc->prev_count); + new_count = dsu_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev_count, new_count) != + prev_count); + delta = (new_count - prev_count) & DSU_PMU_COUNTER_MASK(hwc->idx); + local64_add(delta, &event->count); +} + +static void dsu_pmu_read(struct perf_event *event) +{ + dsu_pmu_event_update(event); +} + +static inline u32 dsu_pmu_get_reset_overflow(void) +{ + return __dsu_pmu_get_reset_overflow(); +} + +/** + * dsu_pmu_set_event_period: Set the period for the counter. + * + * All DSU PMU event counters, except the cycle counter are 32bit + * counters. To handle cases of extreme interrupt latency, we program + * the counter with half of the max count for the counters. + */ +static void dsu_pmu_set_event_period(struct perf_event *event) +{ + int idx = event->hw.idx; + u64 val = DSU_PMU_COUNTER_MASK(idx) >> 1; + + local64_set(&event->hw.prev_count, val); + dsu_pmu_write_counter(event, val); +} + +static irqreturn_t dsu_pmu_handle_irq(int irq_num, void *dev) +{ + int i; + bool handled = false; + struct dsu_pmu *dsu_pmu = dev; + struct dsu_hw_events *hw_events = &dsu_pmu->hw_events; + unsigned long overflow; + + overflow = dsu_pmu_get_reset_overflow(); + if (!overflow) + return IRQ_NONE; + + for_each_set_bit(i, &overflow, DSU_PMU_MAX_HW_CNTRS) { + struct perf_event *event = hw_events->events[i]; + + if (!event) + continue; + dsu_pmu_event_update(event); + dsu_pmu_set_event_period(event); + handled = true; + } + + return IRQ_RETVAL(handled); +} + +static void dsu_pmu_start(struct perf_event *event, int pmu_flags) +{ + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + + /* We always reprogram the counter */ + if (pmu_flags & PERF_EF_RELOAD) + WARN_ON(!(event->hw.state & PERF_HES_UPTODATE)); + dsu_pmu_set_event_period(event); + if (event->hw.idx != DSU_PMU_IDX_CYCLE_COUNTER) + dsu_pmu_set_event(dsu_pmu, event); + event->hw.state = 0; + dsu_pmu_enable_counter(dsu_pmu, event->hw.idx); +} + +static void dsu_pmu_stop(struct perf_event *event, int pmu_flags) +{ + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + + if (event->hw.state & PERF_HES_STOPPED) + return; + dsu_pmu_disable_counter(dsu_pmu, event->hw.idx); + dsu_pmu_event_update(event); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int dsu_pmu_add(struct perf_event *event, int flags) +{ + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + struct dsu_hw_events *hw_events = &dsu_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx; + + if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &dsu_pmu->associated_cpus))) + return -ENOENT; + + idx = dsu_pmu_get_event_idx(hw_events, event); + if (idx < 0) + return idx; + + hwc->idx = idx; + hw_events->events[idx] = event; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + if (flags & PERF_EF_START) + dsu_pmu_start(event, PERF_EF_RELOAD); + + perf_event_update_userpage(event); + return 0; +} + +static void dsu_pmu_del(struct perf_event *event, int flags) +{ + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + struct dsu_hw_events *hw_events = &dsu_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + dsu_pmu_stop(event, PERF_EF_UPDATE); + hw_events->events[idx] = NULL; + clear_bit(idx, hw_events->used_mask); + perf_event_update_userpage(event); +} + +static void dsu_pmu_enable(struct pmu *pmu) +{ + u32 pmcr; + unsigned long flags; + struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu); + + /* If no counters are added, skip enabling the PMU */ + if (bitmap_empty(dsu_pmu->hw_events.used_mask, DSU_PMU_MAX_HW_CNTRS)) + return; + + raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags); + pmcr = __dsu_pmu_read_pmcr(); + pmcr |= CLUSTERPMCR_E; + __dsu_pmu_write_pmcr(pmcr); + raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags); +} + +static void dsu_pmu_disable(struct pmu *pmu) +{ + u32 pmcr; + unsigned long flags; + struct dsu_pmu *dsu_pmu = to_dsu_pmu(pmu); + + raw_spin_lock_irqsave(&dsu_pmu->pmu_lock, flags); + pmcr = __dsu_pmu_read_pmcr(); + pmcr &= ~CLUSTERPMCR_E; + __dsu_pmu_write_pmcr(pmcr); + raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags); +} + +static bool dsu_pmu_validate_event(struct pmu *pmu, + struct dsu_hw_events *hw_events, + struct perf_event *event) +{ + if (is_software_event(event)) + return true; + /* Reject groups spanning multiple HW PMUs. */ + if (event->pmu != pmu) + return false; + return dsu_pmu_get_event_idx(hw_events, event) >= 0; +} + +/* + * Make sure the group of events can be scheduled at once + * on the PMU. + */ +static bool dsu_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct dsu_hw_events fake_hw; + + if (event->group_leader == event) + return true; + + memset(fake_hw.used_mask, 0, sizeof(fake_hw.used_mask)); + if (!dsu_pmu_validate_event(event->pmu, &fake_hw, leader)) + return false; + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (!dsu_pmu_validate_event(event->pmu, &fake_hw, sibling)) + return false; + } + return dsu_pmu_validate_event(event->pmu, &fake_hw, event); +} + +static int dsu_pmu_event_init(struct perf_event *event) +{ + struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* We don't support sampling */ + if (is_sampling_event(event)) { + dev_dbg(dsu_pmu->pmu.dev, "Can't support sampling events\n"); + return -EOPNOTSUPP; + } + + /* We cannot support task bound events */ + if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(dsu_pmu->pmu.dev, "Can't support per-task counters\n"); + return -EINVAL; + } + + if (has_branch_stack(event) || + event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest) { + dev_dbg(dsu_pmu->pmu.dev, "Can't support filtering\n"); + return -EINVAL; + } + + if (!cpumask_test_cpu(event->cpu, &dsu_pmu->associated_cpus)) { + dev_dbg(dsu_pmu->pmu.dev, + "Requested cpu is not associated with the DSU\n"); + return -EINVAL; + } + /* + * Choose the current active CPU to read the events. We don't want + * to migrate the event contexts, irq handling etc to the requested + * CPU. As long as the requested CPU is within the same DSU, we + * are fine. + */ + event->cpu = cpumask_first(&dsu_pmu->active_cpu); + if (event->cpu >= nr_cpu_ids) + return -EINVAL; + if (!dsu_pmu_validate_group(event)) + return -EINVAL; + + event->hw.config_base = event->attr.config; + return 0; +} + +static struct dsu_pmu *dsu_pmu_alloc(struct platform_device *pdev) +{ + struct dsu_pmu *dsu_pmu; + + dsu_pmu = devm_kzalloc(&pdev->dev, sizeof(*dsu_pmu), GFP_KERNEL); + if (!dsu_pmu) + return ERR_PTR(-ENOMEM); + + raw_spin_lock_init(&dsu_pmu->pmu_lock); + /* + * Initialise the number of counters to -1, until we probe + * the real number on a connected CPU. + */ + dsu_pmu->num_counters = -1; + return dsu_pmu; +} + +/** + * dsu_pmu_dt_get_cpus: Get the list of CPUs in the cluster. + */ +static int dsu_pmu_dt_get_cpus(struct device_node *dev, cpumask_t *mask) +{ + int i = 0, n, cpu; + struct device_node *cpu_node; + + n = of_count_phandle_with_args(dev, "cpus", NULL); + if (n <= 0) + return -ENODEV; + for (; i < n; i++) { + cpu_node = of_parse_phandle(dev, "cpus", i); + if (!cpu_node) + break; + cpu = of_cpu_node_to_id(cpu_node); + of_node_put(cpu_node); + /* + * We have to ignore the failures here and continue scanning + * the list to handle cases where the nr_cpus could be capped + * in the running kernel. + */ + if (cpu < 0) + continue; + cpumask_set_cpu(cpu, mask); + } + return 0; +} + +/* + * dsu_pmu_probe_pmu: Probe the PMU details on a CPU in the cluster. + */ +static void dsu_pmu_probe_pmu(struct dsu_pmu *dsu_pmu) +{ + u64 num_counters; + u32 cpmceid[2]; + + num_counters = (__dsu_pmu_read_pmcr() >> CLUSTERPMCR_N_SHIFT) & + CLUSTERPMCR_N_MASK; + /* We can only support up to 31 independent counters */ + if (WARN_ON(num_counters > 31)) + num_counters = 31; + dsu_pmu->num_counters = num_counters; + if (!dsu_pmu->num_counters) + return; + cpmceid[0] = __dsu_pmu_read_pmceid(0); + cpmceid[1] = __dsu_pmu_read_pmceid(1); + bitmap_from_u32array(dsu_pmu->cpmceid_bitmap, + DSU_PMU_MAX_COMMON_EVENTS, + cpmceid, + ARRAY_SIZE(cpmceid)); +} + +static void dsu_pmu_set_active_cpu(int cpu, struct dsu_pmu *dsu_pmu) +{ + cpumask_set_cpu(cpu, &dsu_pmu->active_cpu); + if (irq_set_affinity_hint(dsu_pmu->irq, &dsu_pmu->active_cpu)) + pr_warn("Failed to set irq affinity to %d\n", cpu); +} + +/* + * dsu_pmu_init_pmu: Initialise the DSU PMU configurations if + * we haven't done it already. + */ +static void dsu_pmu_init_pmu(struct dsu_pmu *dsu_pmu) +{ + if (dsu_pmu->num_counters == -1) + dsu_pmu_probe_pmu(dsu_pmu); + /* Reset the interrupt overflow mask */ + dsu_pmu_get_reset_overflow(); +} + +static int dsu_pmu_device_probe(struct platform_device *pdev) +{ + int irq, rc; + struct dsu_pmu *dsu_pmu; + char *name; + static atomic_t pmu_idx = ATOMIC_INIT(-1); + + dsu_pmu = dsu_pmu_alloc(pdev); + if (IS_ERR(dsu_pmu)) + return PTR_ERR(dsu_pmu); + + rc = dsu_pmu_dt_get_cpus(pdev->dev.of_node, &dsu_pmu->associated_cpus); + if (rc) { + dev_warn(&pdev->dev, "Failed to parse the CPUs\n"); + return rc; + } + + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_warn(&pdev->dev, "Failed to find IRQ\n"); + return -EINVAL; + } + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s_%d", + PMUNAME, atomic_inc_return(&pmu_idx)); + if (!name) + return -ENOMEM; + rc = devm_request_irq(&pdev->dev, irq, dsu_pmu_handle_irq, + IRQF_NOBALANCING, name, dsu_pmu); + if (rc) { + dev_warn(&pdev->dev, "Failed to request IRQ %d\n", irq); + return rc; + } + + dsu_pmu->irq = irq; + platform_set_drvdata(pdev, dsu_pmu); + rc = cpuhp_state_add_instance(dsu_pmu_cpuhp_state, + &dsu_pmu->cpuhp_node); + if (rc) + return rc; + + dsu_pmu->pmu = (struct pmu) { + .task_ctx_nr = perf_invalid_context, + .module = THIS_MODULE, + .pmu_enable = dsu_pmu_enable, + .pmu_disable = dsu_pmu_disable, + .event_init = dsu_pmu_event_init, + .add = dsu_pmu_add, + .del = dsu_pmu_del, + .start = dsu_pmu_start, + .stop = dsu_pmu_stop, + .read = dsu_pmu_read, + + .attr_groups = dsu_pmu_attr_groups, + }; + + rc = perf_pmu_register(&dsu_pmu->pmu, name, -1); + if (rc) { + cpuhp_state_remove_instance(dsu_pmu_cpuhp_state, + &dsu_pmu->cpuhp_node); + irq_set_affinity_hint(dsu_pmu->irq, NULL); + } + + return rc; +} + +static int dsu_pmu_device_remove(struct platform_device *pdev) +{ + struct dsu_pmu *dsu_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&dsu_pmu->pmu); + cpuhp_state_remove_instance(dsu_pmu_cpuhp_state, &dsu_pmu->cpuhp_node); + irq_set_affinity_hint(dsu_pmu->irq, NULL); + + return 0; +} + +static const struct of_device_id dsu_pmu_of_match[] = { + { .compatible = "arm,dsu-pmu", }, + {}, +}; + +static struct platform_driver dsu_pmu_driver = { + .driver = { + .name = DRVNAME, + .of_match_table = of_match_ptr(dsu_pmu_of_match), + }, + .probe = dsu_pmu_device_probe, + .remove = dsu_pmu_device_remove, +}; + +static int dsu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct dsu_pmu *dsu_pmu = hlist_entry_safe(node, struct dsu_pmu, + cpuhp_node); + + if (!cpumask_test_cpu(cpu, &dsu_pmu->associated_cpus)) + return 0; + + /* If the PMU is already managed, there is nothing to do */ + if (!cpumask_empty(&dsu_pmu->active_cpu)) + return 0; + + dsu_pmu_init_pmu(dsu_pmu); + dsu_pmu_set_active_cpu(cpu, dsu_pmu); + + return 0; +} + +static int dsu_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + int dst; + struct dsu_pmu *dsu_pmu = hlist_entry_safe(node, struct dsu_pmu, + cpuhp_node); + + if (!cpumask_test_and_clear_cpu(cpu, &dsu_pmu->active_cpu)) + return 0; + + dst = dsu_pmu_get_online_cpu_any_but(dsu_pmu, cpu); + /* If there are no active CPUs in the DSU, leave IRQ disabled */ + if (dst >= nr_cpu_ids) { + irq_set_affinity_hint(dsu_pmu->irq, NULL); + return 0; + } + + perf_pmu_migrate_context(&dsu_pmu->pmu, cpu, dst); + dsu_pmu_set_active_cpu(dst, dsu_pmu); + + return 0; +} + +static int __init dsu_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + DRVNAME, + dsu_pmu_cpu_online, + dsu_pmu_cpu_teardown); + if (ret < 0) + return ret; + dsu_pmu_cpuhp_state = ret; + return platform_driver_register(&dsu_pmu_driver); +} + +static void __exit dsu_pmu_exit(void) +{ + platform_driver_unregister(&dsu_pmu_driver); + cpuhp_remove_multi_state(dsu_pmu_cpuhp_state); +} + +module_init(dsu_pmu_init); +module_exit(dsu_pmu_exit); + +MODULE_DEVICE_TABLE(of, dsu_pmu_of_match); +MODULE_DESCRIPTION("Perf driver for ARM DynamIQ Shared Unit"); +MODULE_AUTHOR("Suzuki K Poulose "); +MODULE_LICENSE("GPL v2"); From a8ffaaa060b8d4da6138e0958cb0f45b73e1cb78 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Dec 2017 15:12:56 +0000 Subject: [PATCH 040/104] arm64: asid: Do not replace active_asids if already 0 Under some uncommon timing conditions, a generation check and xchg(active_asids, A1) in check_and_switch_context() on P1 can race with an ASID roll-over on P2. If P2 has not seen the update to active_asids[P1], it can re-allocate A1 to a new task T2 on P2. P1 ends up waiting on the spinlock since the xchg() returned 0 while P2 can go through a second ASID roll-over with (T2,A1,G2) active on P2. This roll-over copies active_asids[P1] == A1,G1 into reserved_asids[P1] and active_asids[P2] == A1,G2 into reserved_asids[P2]. A subsequent scheduling of T1 on P1 and T2 on P2 would match reserved_asids and get their generation bumped to G3: P1 P2 -- -- TTBR0.BADDR = T0 TTBR0.ASID = A0 asid_generation = G1 check_and_switch_context(T1,A1,G1) generation match check_and_switch_context(T2,A0,G0) new_context() ASID roll-over asid_generation = G2 flush_context() active_asids[P1] = 0 asid_map[A1] = 0 reserved_asids[P1] = A0,G0 xchg(active_asids, A1) active_asids[P1] = A1,G1 xchg returns 0 spin_lock_irqsave() allocated ASID (T2,A1,G2) asid_map[A1] = 1 active_asids[P2] = A1,G2 ... check_and_switch_context(T3,A0,G0) new_context() ASID roll-over asid_generation = G3 flush_context() active_asids[P1] = 0 asid_map[A1] = 1 reserved_asids[P1] = A1,G1 reserved_asids[P2] = A1,G2 allocated ASID (T3,A2,G3) asid_map[A2] = 1 active_asids[P2] = A2,G3 new_context() check_update_reserved_asid(A1,G1) matches reserved_asid[P1] reserved_asid[P1] = A1,G3 updated T1 ASID to (T1,A1,G3) check_and_switch_context(T2,A1,G2) new_context() check_and_switch_context(A1,G2) matches reserved_asids[P2] reserved_asids[P2] = A1,G3 updated T2 ASID to (T2,A1,G3) At this point, we have two tasks, T1 and T2 both using ASID A1 with the latest generation G3. Any of them is allowed to be scheduled on the other CPU leading to two different tasks with the same ASID on the same CPU. This patch changes the xchg to cmpxchg so that the active_asids is only updated if non-zero to avoid a race with an ASID roll-over on a different CPU. The ASID allocation algorithm has been formally verified using the TLA+ model checker (see https://git.kernel.org/pub/scm/linux/kernel/git/cmarinas/kernel-tla.git/tree/asidalloc.tla for the spec). Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/context.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 1cb3bc92ae5c..1fe71b9fcf35 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -194,26 +194,29 @@ set_asid: void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) { unsigned long flags; - u64 asid; + u64 asid, old_active_asid; asid = atomic64_read(&mm->context.id); /* * The memory ordering here is subtle. - * If our ASID matches the current generation, then we update - * our active_asids entry with a relaxed xchg. Racing with a - * concurrent rollover means that either: + * If our active_asids is non-zero and the ASID matches the current + * generation, then we update the active_asids entry with a relaxed + * cmpxchg. Racing with a concurrent rollover means that either: * - * - We get a zero back from the xchg and end up waiting on the + * - We get a zero back from the cmpxchg and end up waiting on the * lock. Taking the lock synchronises with the rollover and so * we are forced to see the updated generation. * - * - We get a valid ASID back from the xchg, which means the + * - We get a valid ASID back from the cmpxchg, which means the * relaxed xchg in flush_context will treat us as reserved * because atomic RmWs are totally ordered for a given location. */ - if (!((asid ^ atomic64_read(&asid_generation)) >> asid_bits) - && atomic64_xchg_relaxed(&per_cpu(active_asids, cpu), asid)) + old_active_asid = atomic64_read(&per_cpu(active_asids, cpu)); + if (old_active_asid && + !((asid ^ atomic64_read(&asid_generation)) >> asid_bits) && + atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu), + old_active_asid, asid)) goto switch_mm_fastpath; raw_spin_lock_irqsave(&cpu_asid_lock, flags); From 3b3b681097fae73b7f5dcdd42db6cfdf32943d4c Mon Sep 17 00:00:00 2001 From: Dongjiu Geng Date: Wed, 13 Dec 2017 18:13:56 +0800 Subject: [PATCH 041/104] arm64: v8.4: Support for new floating point multiplication instructions ARM v8.4 extensions add new neon instructions for performing a multiplication of each FP16 element of one vector with the corresponding FP16 element of a second vector, and to add or subtract this without an intermediate rounding to the corresponding FP32 element in a third vector. This patch detects this feature and let the userspace know about it via a HWCAP bit and MRS emulation. Cc: Dave Martin Reviewed-by: Suzuki K Poulose Signed-off-by: Dongjiu Geng Reviewed-by: Dave Martin Signed-off-by: Catalin Marinas --- Documentation/arm64/cpu-feature-registers.txt | 4 +++- Documentation/arm64/elf_hwcaps.txt | 4 ++++ arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/cpuinfo.c | 1 + 6 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/cpu-feature-registers.txt b/Documentation/arm64/cpu-feature-registers.txt index bd9b3faab2c4..a70090b28b07 100644 --- a/Documentation/arm64/cpu-feature-registers.txt +++ b/Documentation/arm64/cpu-feature-registers.txt @@ -110,7 +110,9 @@ infrastructure: x--------------------------------------------------x | Name | bits | visible | |--------------------------------------------------| - | RES0 | [63-48] | n | + | RES0 | [63-52] | n | + |--------------------------------------------------| + | FHM | [51-48] | y | |--------------------------------------------------| | DP | [47-44] | y | |--------------------------------------------------| diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt index 89edba12a9e0..57324ee55ecc 100644 --- a/Documentation/arm64/elf_hwcaps.txt +++ b/Documentation/arm64/elf_hwcaps.txt @@ -158,3 +158,7 @@ HWCAP_SHA512 HWCAP_SVE Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001. + +HWCAP_ASIMDFHM + + Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001. diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index ec144f480b39..ab637886a6f9 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -419,6 +419,7 @@ #define SCTLR_EL1_CP15BEN (1 << 5) /* id_aa64isar0 */ +#define ID_AA64ISAR0_FHM_SHIFT 48 #define ID_AA64ISAR0_DP_SHIFT 44 #define ID_AA64ISAR0_SM4_SHIFT 40 #define ID_AA64ISAR0_SM3_SHIFT 36 diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index cda76fa8b9b2..f018c3deea3b 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -43,5 +43,6 @@ #define HWCAP_ASIMDDP (1 << 20) #define HWCAP_SHA512 (1 << 21) #define HWCAP_SVE (1 << 22) +#define HWCAP_ASIMDFHM (1 << 23) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f0545dfe497..1f024049eb30 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -123,6 +123,7 @@ cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) * sync with the documentation of the CPU feature register ABI. */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM4_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM3_SHIFT, 4, 0), @@ -1032,6 +1033,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 1e2554543506..7f94623df8a5 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -76,6 +76,7 @@ static const char *const hwcap_str[] = { "asimddp", "sha512", "sve", + "asimdfhm", NULL }; From be04a6d1126b02c6a28741155b899d648739fc5b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 16:15:59 +0000 Subject: [PATCH 042/104] arm64: use RET instruction for exiting the trampoline Speculation attacks against the entry trampoline can potentially resteer the speculative instruction stream through the indirect branch and into arbitrary gadgets within the kernel. This patch defends against these attacks by forcing a misprediction through the return stack: a dummy BL instruction loads an entry into the stack, so that the predicted program flow of the subsequent RET instruction is to a branch-to-self instruction which is finally resolved as a branch to the kernel vectors with speculation suppressed. Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 031392ee5f47..6ceed4877daf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1029,6 +1029,14 @@ alternative_else_nop_endif .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + /* + * Defend against branch aliasing attacks by pushing a dummy + * entry onto the return stack and using a RET instruction to + * enter the full-fat kernel vectors. + */ + bl 2f + b . +2: tramp_map_kernel x30 #ifdef CONFIG_RANDOMIZE_BASE adr x30, tramp_vectors + PAGE_SIZE @@ -1041,7 +1049,7 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 msr vbar_el1, x30 add x30, x30, #(1b - tramp_vectors) isb - br x30 + ret .endm .macro tramp_exit, regsize = 64 From 0617052ddde355ee663b2f048e67dd381e5ebd6a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 16:19:39 +0000 Subject: [PATCH 043/104] arm64: Kconfig: Reword UNMAP_KERNEL_AT_EL0 kconfig entry Although CONFIG_UNMAP_KERNEL_AT_EL0 does make KASLR more robust, it's actually more useful as a mitigation against speculation attacks that can leak arbitrary kernel data to userspace through speculation. Reword the Kconfig help message to reflect this, and make the option depend on EXPERT so that it is on by default for the majority of users. Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9b1330806f39..cb7a70e686cb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -863,15 +863,14 @@ config FORCE_MAX_ZONEORDER 4M allocations matching the default size used by generic code. config UNMAP_KERNEL_AT_EL0 - bool "Unmap kernel when running in userspace (aka \"KAISER\")" + bool "Unmap kernel when running in userspace (aka \"KAISER\")" if EXPERT default y help - Some attacks against KASLR make use of the timing difference between - a permission fault which could arise from a page table entry that is - present in the TLB, and a translation fault which always requires a - page table walk. This option defends against these attacks by unmapping - the kernel whilst running in userspace, therefore forcing translation - faults for all of kernel space. + Speculation attacks against some high-performance processors can + be used to bypass MMU permission checks and leak kernel data to + userspace. This can be defended against by unmapping the kernel + when running in userspace, mapping it back in on exception entry + via a trampoline page in the vector table. If unsure, say Y. From 179a56f6f9fbda28f6ca07db1fc3dfad6bc7343c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 27 Nov 2017 18:29:30 +0000 Subject: [PATCH 044/104] arm64: Take into account ID_AA64PFR0_EL1.CSV3 For non-KASLR kernels where the KPTI behaviour has not been overridden on the command line we can use ID_AA64PFR0_EL1.CSV3 to determine whether or not we should unmap the kernel whilst running at EL0. Reviewed-by: Suzuki K Poulose Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kernel/cpufeature.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index ab637886a6f9..262ae18f0e05 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -438,6 +438,7 @@ #define ID_AA64ISAR1_DPB_SHIFT 0 /* id_aa64pfr0 */ +#define ID_AA64PFR0_CSV3_SHIFT 60 #define ID_AA64PFR0_SVE_SHIFT 32 #define ID_AA64PFR0_GIC_SHIFT 24 #define ID_AA64PFR0_ASIMD_SHIFT 20 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1f024049eb30..0f3c96172f0f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -146,6 +146,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), @@ -852,6 +853,8 @@ static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, int __unused) { + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + /* Forced on command line? */ if (__kpti_forced) { pr_info_once("kernel page table isolation forced %s by command line option\n", @@ -863,7 +866,9 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) return true; - return false; + /* Defer to CPU feature registers */ + return !cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV3_SHIFT); } static int __init parse_kpti(char *str) @@ -968,6 +973,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { }, #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 { + .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, .def_scope = SCOPE_SYSTEM, .matches = unmap_kernel_at_el0, From 0a0d111d40fd1dc588cc590fab6b55d86ddc71d3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jan 2018 21:37:25 +0000 Subject: [PATCH 045/104] arm64: cpufeature: Pass capability structure to ->enable callback In order to invoke the CPU capability ->matches callback from the ->enable callback for applying local-CPU workarounds, we need a handle on the capability structure. This patch passes a pointer to the capability structure to the ->enable callback. Reviewed-by: Suzuki K Poulose Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0f3c96172f0f..991922b45d3d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1154,7 +1154,7 @@ void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) * uses an IPI, giving us a PSTATE that disappears when * we return. */ - stop_machine(caps->enable, NULL, cpu_online_mask); + stop_machine(caps->enable, (void *)caps, cpu_online_mask); } } } @@ -1197,7 +1197,7 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) cpu_die_early(); } if (caps->enable) - caps->enable(NULL); + caps->enable((void *)caps); } } From d68e3ba5303f7e1099f51fdcd155f5263da8569b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jan 2018 21:45:41 +0000 Subject: [PATCH 046/104] drivers/firmware: Expose psci_get_version through psci_ops structure Entry into recent versions of ARM Trusted Firmware will invalidate the CPU branch predictor state in order to protect against aliasing attacks. This patch exposes the PSCI "VERSION" function via psci_ops, so that it can be invoked outside of the PSCI driver where necessary. Acked-by: Lorenzo Pieralisi Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- drivers/firmware/psci.c | 2 ++ include/linux/psci.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c index d687ca3d5049..8b25d31e8401 100644 --- a/drivers/firmware/psci.c +++ b/drivers/firmware/psci.c @@ -496,6 +496,8 @@ static void __init psci_init_migrate(void) static void __init psci_0_2_set_functions(void) { pr_info("Using standard PSCI v0.2 function IDs\n"); + psci_ops.get_version = psci_get_version; + psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_FN_NATIVE(0_2, CPU_SUSPEND); psci_ops.cpu_suspend = psci_cpu_suspend; diff --git a/include/linux/psci.h b/include/linux/psci.h index bdea1cb5e1db..6306ab10af18 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -26,6 +26,7 @@ int psci_cpu_init_idle(unsigned int cpu); int psci_cpu_suspend_enter(unsigned long index); struct psci_operations { + u32 (*get_version)(void); int (*cpu_suspend)(u32 state, unsigned long entry_point); int (*cpu_off)(u32 state); int (*cpu_on)(unsigned long cpuid, unsigned long entry_point); From 95e3de3590e3f2358bb13f013911bc1bfa5d3f53 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 Jan 2018 18:19:39 +0000 Subject: [PATCH 047/104] arm64: Move post_ttbr_update_workaround to C code We will soon need to invoke a CPU-specific function pointer after changing page tables, so move post_ttbr_update_workaround out into C code to make this possible. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 13 ------------- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/context.c | 9 +++++++++ arch/arm64/mm/proc.S | 3 +-- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 215a49213507..a6f90b648655 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -492,19 +492,6 @@ alternative_endif mrs \rd, sp_el0 .endm -/* - * Errata workaround post TTBRx_EL1 update. - */ - .macro post_ttbr_update_workaround -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -alternative_if ARM64_WORKAROUND_CAVIUM_27456 - ic iallu - dsb nsh - isb -alternative_else_nop_endif -#endif - .endm - /* * Arrange a physical address in a TTBR register, taking care of 52-bit * addresses. diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6ceed4877daf..80b539845da6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -277,7 +277,7 @@ alternative_else_nop_endif * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache * corruption). */ - post_ttbr_update_workaround + bl post_ttbr_update_workaround .endif 1: .if \el != 0 diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 1fe71b9fcf35..511bd1e79b69 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -242,6 +242,15 @@ switch_mm_fastpath: cpu_switch_mm(mm->pgd, mm); } +/* Errata workaround post TTBRx_EL1 update. */ +asmlinkage void post_ttbr_update_workaround(void) +{ + asm(ALTERNATIVE("nop; nop; nop", + "ic iallu; dsb nsh; isb", + ARM64_WORKAROUND_CAVIUM_27456, + CONFIG_CAVIUM_ERRATUM_27456)); +} + static int asids_init(void) { asid_bits = get_cpu_asid_bits(); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index bc334588f234..bc86f7ef8620 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -146,8 +146,7 @@ ENTRY(cpu_do_switch_mm) phys_to_ttbr x0, x2 msr ttbr0_el1, x2 // now update TTBR0 isb - post_ttbr_update_workaround - ret + b post_ttbr_update_workaround // Back to C code... ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From 0f15adbb2861ce6f75ccfc5a92b19eae0ef327d0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 3 Jan 2018 11:17:58 +0000 Subject: [PATCH 048/104] arm64: Add skeleton to harden the branch predictor against aliasing attacks Aliasing attacks against CPU branch predictors can allow an attacker to redirect speculative control flow on some CPUs and potentially divulge information from one context to another. This patch adds initial skeleton code behind a new Kconfig option to enable implementation-specific mitigations against these attacks for CPUs that are affected. Co-developed-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 17 ++++++++ arch/arm64/include/asm/cpucaps.h | 3 +- arch/arm64/include/asm/mmu.h | 37 ++++++++++++++++ arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kernel/Makefile | 4 ++ arch/arm64/kernel/bpi.S | 55 ++++++++++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 74 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 7 ++- arch/arm64/mm/context.c | 2 + arch/arm64/mm/fault.c | 17 ++++++++ 11 files changed, 215 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/kernel/bpi.S diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cb7a70e686cb..664fadc2aa2e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -874,6 +874,23 @@ config UNMAP_KERNEL_AT_EL0 If unsure, say Y. +config HARDEN_BRANCH_PREDICTOR + bool "Harden the branch predictor against aliasing attacks" if EXPERT + default y + help + Speculation attacks against some high-performance processors rely on + being able to manipulate the branch predictor for a victim context by + executing aliasing branches in the attacker context. Such attacks + can be partially mitigated against by clearing internal branch + predictor state and limiting the prediction logic in some situations. + + This config option will take CPU-specific actions to harden the + branch predictor against aliasing attacks and may rely on specific + instruction sequences or control bits being set by the system + firmware. + + If unsure, say Y. + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" depends on COMPAT diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index b4537ffd1018..51616e77fe6b 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -42,7 +42,8 @@ #define ARM64_HAS_DCPOP 21 #define ARM64_SVE 22 #define ARM64_UNMAP_KERNEL_AT_EL0 23 +#define ARM64_HARDEN_BRANCH_PREDICTOR 24 -#define ARM64_NCAPS 24 +#define ARM64_NCAPS 25 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 6f7bdb89817f..6dd83d75b82a 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -41,6 +41,43 @@ static inline bool arm64_kernel_unmapped_at_el0(void) cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0); } +typedef void (*bp_hardening_cb_t)(void); + +struct bp_hardening_data { + int hyp_vectors_slot; + bp_hardening_cb_t fn; +}; + +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[]; + +DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) +{ + return this_cpu_ptr(&bp_hardening_data); +} + +static inline void arm64_apply_bp_hardening(void) +{ + struct bp_hardening_data *d; + + if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) + return; + + d = arm64_get_bp_hardening_data(); + if (d->fn) + d->fn(); +} +#else +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) +{ + return NULL; +} + +static inline void arm64_apply_bp_hardening(void) { } +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + extern void paging_init(void); extern void bootmem_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 262ae18f0e05..54e99af043c6 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -439,6 +439,7 @@ /* id_aa64pfr0 */ #define ID_AA64PFR0_CSV3_SHIFT 60 +#define ID_AA64PFR0_CSV2_SHIFT 56 #define ID_AA64PFR0_SVE_SHIFT 32 #define ID_AA64PFR0_GIC_SHIFT 24 #define ID_AA64PFR0_ASIMD_SHIFT 20 diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 067baace74a0..0c760db04858 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -53,6 +53,10 @@ arm64-obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +ifeq ($(CONFIG_KVM),y) +arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o +endif + obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) head-y := head.o diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S new file mode 100644 index 000000000000..06a931eb2673 --- /dev/null +++ b/arch/arm64/kernel/bpi.S @@ -0,0 +1,55 @@ +/* + * Contains CPU specific branch predictor invalidation sequences + * + * Copyright (C) 2018 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +.macro ventry target + .rept 31 + nop + .endr + b \target +.endm + +.macro vectors target + ventry \target + 0x000 + ventry \target + 0x080 + ventry \target + 0x100 + ventry \target + 0x180 + + ventry \target + 0x200 + ventry \target + 0x280 + ventry \target + 0x300 + ventry \target + 0x380 + + ventry \target + 0x400 + ventry \target + 0x480 + ventry \target + 0x500 + ventry \target + 0x580 + + ventry \target + 0x600 + ventry \target + 0x680 + ventry \target + 0x700 + ventry \target + 0x780 +.endm + + .align 11 +ENTRY(__bp_harden_hyp_vecs_start) + .rept 4 + vectors __kvm_hyp_vector + .endr +ENTRY(__bp_harden_hyp_vecs_end) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 0e27f86ee709..16ea5c6f314e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -46,6 +46,80 @@ static int cpu_enable_trap_ctr_access(void *__unused) return 0; } +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +#include +#include + +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +#ifdef CONFIG_KVM +static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + void *dst = lm_alias(__bp_harden_hyp_vecs_start + slot * SZ_2K); + int i; + + for (i = 0; i < SZ_2K; i += 0x80) + memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); + + flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); +} + +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + static int last_slot = -1; + static DEFINE_SPINLOCK(bp_lock); + int cpu, slot = -1; + + spin_lock(&bp_lock); + for_each_possible_cpu(cpu) { + if (per_cpu(bp_hardening_data.fn, cpu) == fn) { + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); + break; + } + } + + if (slot == -1) { + last_slot++; + BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start) + / SZ_2K) <= last_slot); + slot = last_slot; + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); + } + + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); + __this_cpu_write(bp_hardening_data.fn, fn); + spin_unlock(&bp_lock); +} +#else +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + __this_cpu_write(bp_hardening_data.fn, fn); +} +#endif /* CONFIG_KVM */ + +static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, + bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + u64 pfr0; + + if (!entry->matches(entry, SCOPE_LOCAL_CPU)) + return; + + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) + return; + + __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); +} +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + #define MIDR_RANGE(model, min, max) \ .def_scope = SCOPE_LOCAL_CPU, \ .matches = is_affected_midr_range, \ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 991922b45d3d..da6722db50b0 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -147,6 +147,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 80b539845da6..07a7d4db8ec4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -721,12 +721,15 @@ el0_ia: * Instruction abort handling */ mrs x26, far_el1 - enable_daif + enable_da_f +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif ct_user_exit mov x0, x26 mov x1, x25 mov x2, sp - bl do_mem_abort + bl do_el0_ia_bp_hardening b ret_to_user el0_fpsimd_acc: /* diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 511bd1e79b69..ff99a880a730 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -249,6 +249,8 @@ asmlinkage void post_ttbr_update_workaround(void) "ic iallu; dsb nsh; isb", ARM64_WORKAROUND_CAVIUM_27456, CONFIG_CAVIUM_ERRATUM_27456)); + + arm64_apply_bp_hardening(); } static int asids_init(void) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 22168cd0dde7..0e671ddf4855 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -708,6 +708,23 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, arm64_notify_die("", regs, &info, esr); } +asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr, + unsigned int esr, + struct pt_regs *regs) +{ + /* + * We've taken an instruction abort from userspace and not yet + * re-enabled IRQs. If the address is a kernel address, apply + * BP hardening prior to enabling IRQs and pre-emption. + */ + if (addr > TASK_SIZE) + arm64_apply_bp_hardening(); + + local_irq_enable(); + do_mem_abort(addr, esr, regs); +} + + asmlinkage void __exception do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) From 6840bdd73d07216ab4bc46f5a8768c37ea519038 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 3 Jan 2018 16:38:35 +0000 Subject: [PATCH 049/104] arm64: KVM: Use per-CPU vector when BP hardening is enabled Now that we have per-CPU vectors, let's plug then in the KVM/arm64 code. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm/include/asm/kvm_mmu.h | 10 +++++++++ arch/arm64/include/asm/kvm_mmu.h | 38 ++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/switch.c | 2 +- virt/kvm/arm/arm.c | 8 ++++++- 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 8c5643e2eea4..a2d176a308bd 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -226,6 +226,16 @@ static inline unsigned int kvm_get_vmid_bits(void) return 8; } +static inline void *kvm_get_hyp_vector(void) +{ + return kvm_ksym_ref(__kvm_hyp_vector); +} + +static inline int kvm_map_vectors(void) +{ + return 0; +} + #define kvm_phys_to_vttbr(addr) (addr) #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b33bdb5eeb3d..72e279dbae5f 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -322,6 +322,44 @@ static inline unsigned int kvm_get_vmid_bits(void) return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; } +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +#include + +static inline void *kvm_get_hyp_vector(void) +{ + struct bp_hardening_data *data = arm64_get_bp_hardening_data(); + void *vect = kvm_ksym_ref(__kvm_hyp_vector); + + if (data->fn) { + vect = __bp_harden_hyp_vecs_start + + data->hyp_vectors_slot * SZ_2K; + + if (!has_vhe()) + vect = lm_alias(vect); + } + + return vect; +} + +static inline int kvm_map_vectors(void) +{ + return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start), + kvm_ksym_ref(__bp_harden_hyp_vecs_end), + PAGE_HYP_EXEC); +} + +#else +static inline void *kvm_get_hyp_vector(void) +{ + return kvm_ksym_ref(__kvm_hyp_vector); +} + +static inline int kvm_map_vectors(void) +{ + return 0; +} +#endif + #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index f7c651f3a8c0..8d4f3c9d6dc4 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -52,7 +52,7 @@ static void __hyp_text __activate_traps_vhe(void) val &= ~(CPACR_EL1_FPEN | CPACR_EL1_ZEN); write_sysreg(val, cpacr_el1); - write_sysreg(__kvm_hyp_vector, vbar_el1); + write_sysreg(kvm_get_hyp_vector(), vbar_el1); } static void __hyp_text __activate_traps_nvhe(void) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index c8d49879307f..2df6a5c42f77 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1158,7 +1158,7 @@ static void cpu_init_hyp_mode(void *dummy) pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); + vector_ptr = (unsigned long)kvm_get_hyp_vector(); __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); __cpu_init_stage2(); @@ -1403,6 +1403,12 @@ static int init_hyp_mode(void) goto out_err; } + err = kvm_map_vectors(); + if (err) { + kvm_err("Cannot map vectors\n"); + goto out_err; + } + /* * Map the Hyp stack pages */ From 90348689d500410ca7a55624c667f956771dce7f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 3 Jan 2018 16:38:37 +0000 Subject: [PATCH 050/104] arm64: KVM: Make PSCI_VERSION a fast path For those CPUs that require PSCI to perform a BP invalidation, going all the way to the PSCI code for not much is a waste of precious cycles. Let's terminate that call as early as possible. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kvm/hyp/switch.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8d4f3c9d6dc4..4d273f6d0e69 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -341,6 +342,18 @@ again: if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu)) goto again; + if (exit_code == ARM_EXCEPTION_TRAP && + (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC64 || + kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC32) && + vcpu_get_reg(vcpu, 0) == PSCI_0_2_FN_PSCI_VERSION) { + u64 val = PSCI_RET_NOT_SUPPORTED; + if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features)) + val = 2; + + vcpu_set_reg(vcpu, 0, val); + goto again; + } + if (static_branch_unlikely(&vgic_v2_cpuif_trap) && exit_code == ARM_EXCEPTION_TRAP) { bool valid; From a65d219fe5dc7887fd5ca04c2ac3e9a34feb8dfc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 3 Jan 2018 11:19:34 +0000 Subject: [PATCH 051/104] arm64: cputype: Add missing MIDR values for Cortex-A72 and Cortex-A75 Hook up MIDR values for the Cortex-A72 and Cortex-A75 CPUs, since they will soon need MIDR matches for hardening the branch predictor. Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 235e77d98261..84385b94e70b 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -79,8 +79,10 @@ #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 #define ARM_CPU_PART_CORTEX_A57 0xD07 +#define ARM_CPU_PART_CORTEX_A72 0xD08 #define ARM_CPU_PART_CORTEX_A53 0xD03 #define ARM_CPU_PART_CORTEX_A73 0xD09 +#define ARM_CPU_PART_CORTEX_A75 0xD0A #define APM_CPU_PART_POTENZA 0x000 @@ -94,7 +96,9 @@ #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) +#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) From aa6acde65e03186b5add8151e1ffe36c3c62639b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 3 Jan 2018 12:46:21 +0000 Subject: [PATCH 052/104] arm64: Implement branch predictor hardening for affected Cortex-A CPUs Cortex-A57, A72, A73 and A75 are susceptible to branch predictor aliasing and can theoretically be attacked by malicious code. This patch implements a PSCI-based mitigation for these CPUs when available. The call into firmware will invalidate the branch predictor state, preventing any malicious entries from affecting other victim contexts. Co-developed-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/bpi.S | 24 +++++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S index 06a931eb2673..dec95bd82e31 100644 --- a/arch/arm64/kernel/bpi.S +++ b/arch/arm64/kernel/bpi.S @@ -53,3 +53,27 @@ ENTRY(__bp_harden_hyp_vecs_start) vectors __kvm_hyp_vector .endr ENTRY(__bp_harden_hyp_vecs_end) +ENTRY(__psci_hyp_bp_inval_start) + sub sp, sp, #(8 * 18) + stp x16, x17, [sp, #(16 * 0)] + stp x14, x15, [sp, #(16 * 1)] + stp x12, x13, [sp, #(16 * 2)] + stp x10, x11, [sp, #(16 * 3)] + stp x8, x9, [sp, #(16 * 4)] + stp x6, x7, [sp, #(16 * 5)] + stp x4, x5, [sp, #(16 * 6)] + stp x2, x3, [sp, #(16 * 7)] + stp x0, x1, [sp, #(16 * 8)] + mov x0, #0x84000000 + smc #0 + ldp x16, x17, [sp, #(16 * 0)] + ldp x14, x15, [sp, #(16 * 1)] + ldp x12, x13, [sp, #(16 * 2)] + ldp x10, x11, [sp, #(16 * 3)] + ldp x8, x9, [sp, #(16 * 4)] + ldp x6, x7, [sp, #(16 * 5)] + ldp x4, x5, [sp, #(16 * 6)] + ldp x2, x3, [sp, #(16 * 7)] + ldp x0, x1, [sp, #(16 * 8)] + add sp, sp, #(8 * 18) +ENTRY(__psci_hyp_bp_inval_end) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 16ea5c6f314e..cb0fb3796bb8 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -53,6 +53,8 @@ static int cpu_enable_trap_ctr_access(void *__unused) DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); #ifdef CONFIG_KVM +extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[]; + static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, const char *hyp_vecs_end) { @@ -94,6 +96,9 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn, spin_unlock(&bp_lock); } #else +#define __psci_hyp_bp_inval_start NULL +#define __psci_hyp_bp_inval_end NULL + static void __install_bp_hardening_cb(bp_hardening_cb_t fn, const char *hyp_vecs_start, const char *hyp_vecs_end) @@ -118,6 +123,21 @@ static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); } + +#include + +static int enable_psci_bp_hardening(void *data) +{ + const struct arm64_cpu_capabilities *entry = data; + + if (psci_ops.get_version) + install_bp_hardening_cb(entry, + (bp_hardening_cb_t)psci_ops.get_version, + __psci_hyp_bp_inval_start, + __psci_hyp_bp_inval_end); + + return 0; +} #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ #define MIDR_RANGE(model, min, max) \ @@ -260,6 +280,28 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_WORKAROUND_858921, MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), }, +#endif +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + .enable = enable_psci_bp_hardening, + }, #endif { } From ec82b567a74fbdffdf418d4bb381d55f6a9096af Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 5 Jan 2018 14:28:59 -0600 Subject: [PATCH 053/104] arm64: Implement branch predictor hardening for Falkor Falkor is susceptible to branch predictor aliasing and can theoretically be attacked by malicious code. This patch implements a mitigation for these attacks, preventing any malicious entries from affecting other victim contexts. Signed-off-by: Shanker Donthineni [will: fix label name when !CONFIG_KVM and remove references to MIDR_FALKOR] Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/kvm_asm.h | 2 ++ arch/arm64/kernel/bpi.S | 8 +++++++ arch/arm64/kernel/cpu_errata.c | 40 ++++++++++++++++++++++++++++++-- arch/arm64/kvm/hyp/entry.S | 12 ++++++++++ arch/arm64/kvm/hyp/switch.c | 8 +++++++ 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 51616e77fe6b..7049b4802587 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -43,7 +43,8 @@ #define ARM64_SVE 22 #define ARM64_UNMAP_KERNEL_AT_EL0 23 #define ARM64_HARDEN_BRANCH_PREDICTOR 24 +#define ARM64_HARDEN_BP_POST_GUEST_EXIT 25 -#define ARM64_NCAPS 25 +#define ARM64_NCAPS 26 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index ab4d0a926043..24961b732e65 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -68,6 +68,8 @@ extern u32 __kvm_get_mdcr_el2(void); extern u32 __init_stage2_translation(void); +extern void __qcom_hyp_sanitize_btac_predictors(void); + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S index dec95bd82e31..76225c2611ea 100644 --- a/arch/arm64/kernel/bpi.S +++ b/arch/arm64/kernel/bpi.S @@ -77,3 +77,11 @@ ENTRY(__psci_hyp_bp_inval_start) ldp x0, x1, [sp, #(16 * 8)] add sp, sp, #(8 * 18) ENTRY(__psci_hyp_bp_inval_end) + +ENTRY(__qcom_hyp_sanitize_link_stack_start) + stp x29, x30, [sp, #-16]! + .rept 16 + bl . + 4 + .endr + ldp x29, x30, [sp], #16 +ENTRY(__qcom_hyp_sanitize_link_stack_end) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index cb0fb3796bb8..70e5f1826fd9 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -54,6 +54,8 @@ DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); #ifdef CONFIG_KVM extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[]; +extern char __qcom_hyp_sanitize_link_stack_start[]; +extern char __qcom_hyp_sanitize_link_stack_end[]; static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, const char *hyp_vecs_end) @@ -96,8 +98,10 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn, spin_unlock(&bp_lock); } #else -#define __psci_hyp_bp_inval_start NULL -#define __psci_hyp_bp_inval_end NULL +#define __psci_hyp_bp_inval_start NULL +#define __psci_hyp_bp_inval_end NULL +#define __qcom_hyp_sanitize_link_stack_start NULL +#define __qcom_hyp_sanitize_link_stack_end NULL static void __install_bp_hardening_cb(bp_hardening_cb_t fn, const char *hyp_vecs_start, @@ -138,6 +142,29 @@ static int enable_psci_bp_hardening(void *data) return 0; } + +static void qcom_link_stack_sanitization(void) +{ + u64 tmp; + + asm volatile("mov %0, x30 \n" + ".rept 16 \n" + "bl . + 4 \n" + ".endr \n" + "mov x30, %0 \n" + : "=&r" (tmp)); +} + +static int qcom_enable_link_stack_sanitization(void *data) +{ + const struct arm64_cpu_capabilities *entry = data; + + install_bp_hardening_cb(entry, qcom_link_stack_sanitization, + __qcom_hyp_sanitize_link_stack_start, + __qcom_hyp_sanitize_link_stack_end); + + return 0; +} #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ #define MIDR_RANGE(model, min, max) \ @@ -302,6 +329,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), .enable = enable_psci_bp_hardening, }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), + .enable = qcom_enable_link_stack_sanitization, + }, + { + .capability = ARM64_HARDEN_BP_POST_GUEST_EXIT, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), + }, #endif { } diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 12ee62d6d410..9c45c6af1f58 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -196,3 +196,15 @@ alternative_endif eret ENDPROC(__fpsimd_guest_restore) + +ENTRY(__qcom_hyp_sanitize_btac_predictors) + /** + * Call SMC64 with Silicon provider serviceID 23<<8 (0xc2001700) + * 0xC2000000-0xC200FFFF: assigned to SiP Service Calls + * b15-b0: contains SiP functionID + */ + movz x0, #0x1700 + movk x0, #0xc200, lsl #16 + smc #0 + ret +ENDPROC(__qcom_hyp_sanitize_btac_predictors) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 4d273f6d0e69..170e1917f83c 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -406,6 +406,14 @@ again: /* 0 falls through to be handled out of EL2 */ } + if (cpus_have_const_cap(ARM64_HARDEN_BP_POST_GUEST_EXIT)) { + u32 midr = read_cpuid_id(); + + /* Apply BTAC predictors mitigation to all Falkor chips */ + if ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1) + __qcom_hyp_sanitize_btac_predictors(); + } + fp_enabled = __fpsimd_enabled(); __sysreg_save_guest_state(guest_ctxt); From 0d90718871fe80f019b7295ec9d2b23121e396fb Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Sun, 7 Jan 2018 22:53:35 -0800 Subject: [PATCH 054/104] arm64: cputype: Add MIDR values for Cavium ThunderX2 CPUs Add the older Broadcom ID as well as the new Cavium ID for ThunderX2 CPUs. Signed-off-by: Jayachandran C Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 84385b94e70b..cce5735a677c 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -89,6 +89,7 @@ #define CAVIUM_CPU_PART_THUNDERX 0x0A1 #define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 #define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3 +#define CAVIUM_CPU_PART_THUNDERX2 0x0AF #define BRCM_CPU_PART_VULCAN 0x516 @@ -102,6 +103,8 @@ #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) +#define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2) +#define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) #ifndef __ASSEMBLY__ From 32b03d1059667a39e089c45ee38ec9c16332430f Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:03 +0000 Subject: [PATCH 055/104] KVM: arm64: Store vcpu on the stack during __guest_enter() KVM uses tpidr_el2 as its private vcpu register, which makes sense for non-vhe world switch as only KVM can access this register. This means vhe Linux has to use tpidr_el1, which KVM has to save/restore as part of the host context. If the SDEI handler code runs behind KVMs back, it mustn't access any per-cpu variables. To allow this on systems with vhe we need to make the host use tpidr_el2, saving KVM from save/restoring it. __guest_enter() stores the host_ctxt on the stack, do the same with the vcpu. Signed-off-by: James Morse Reviewed-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm64/kvm/hyp/entry.S | 10 +++++++--- arch/arm64/kvm/hyp/hyp-entry.S | 6 +++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 9c45c6af1f58..fe4678f20a85 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -62,8 +62,8 @@ ENTRY(__guest_enter) // Store the host regs save_callee_saved_regs x1 - // Store the host_ctxt for use at exit time - str x1, [sp, #-16]! + // Store host_ctxt and vcpu for use at exit time + stp x1, x0, [sp, #-16]! add x18, x0, #VCPU_CONTEXT @@ -159,6 +159,10 @@ abort_guest_exit_end: ENDPROC(__guest_exit) ENTRY(__fpsimd_guest_restore) + // x0: esr + // x1: vcpu + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack stp x2, x3, [sp, #-16]! stp x4, lr, [sp, #-16]! @@ -173,7 +177,7 @@ alternative_else alternative_endif isb - mrs x3, tpidr_el2 + mov x3, x1 ldr x0, [x3, #VCPU_HOST_CONTEXT] kern_hyp_va x0 diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 5170ce1021da..fce7cc507e0a 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -104,6 +104,7 @@ el1_trap: /* * x0: ESR_EC */ + ldr x1, [sp, #16 + 8] // vcpu stored by __guest_enter /* * We trap the first access to the FP/SIMD to save the host context @@ -116,19 +117,18 @@ alternative_if_not ARM64_HAS_NO_FPSIMD b.eq __fpsimd_guest_restore alternative_else_nop_endif - mrs x1, tpidr_el2 mov x0, #ARM_EXCEPTION_TRAP b __guest_exit el1_irq: stp x0, x1, [sp, #-16]! - mrs x1, tpidr_el2 + ldr x1, [sp, #16 + 8] mov x0, #ARM_EXCEPTION_IRQ b __guest_exit el1_error: stp x0, x1, [sp, #-16]! - mrs x1, tpidr_el2 + ldr x1, [sp, #16 + 8] mov x0, #ARM_EXCEPTION_EL1_SERROR b __guest_exit From 36989e7fd386a9a5822c48691473863f8fbb404d Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:04 +0000 Subject: [PATCH 056/104] KVM: arm/arm64: Convert kvm_host_cpu_state to a static per-cpu allocation kvm_host_cpu_state is a per-cpu allocation made from kvm_arch_init() used to store the host EL1 registers when KVM switches to a guest. Make it easier for ASM to generate pointers into this per-cpu memory by making it a static allocation. Signed-off-by: James Morse Acked-by: Christoffer Dall Signed-off-by: Catalin Marinas --- virt/kvm/arm/arm.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 2df6a5c42f77..2fc6009a766c 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -53,8 +53,8 @@ __asm__(".arch_extension virt"); #endif +DEFINE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -static kvm_cpu_context_t __percpu *kvm_host_cpu_state; /* Per-CPU variable containing the currently running vcpu. */ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); @@ -354,7 +354,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } vcpu->cpu = cpu; - vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); + vcpu->arch.host_cpu_context = this_cpu_ptr(&kvm_host_cpu_state); kvm_arm_set_running_vcpu(vcpu); kvm_vgic_load(vcpu); @@ -1272,19 +1272,8 @@ static inline void hyp_cpu_pm_exit(void) } #endif -static void teardown_common_resources(void) -{ - free_percpu(kvm_host_cpu_state); -} - static int init_common_resources(void) { - kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t); - if (!kvm_host_cpu_state) { - kvm_err("Cannot allocate host CPU state\n"); - return -ENOMEM; - } - /* set size of VMID supported by CPU */ kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); @@ -1426,7 +1415,7 @@ static int init_hyp_mode(void) for_each_possible_cpu(cpu) { kvm_cpu_context_t *cpu_ctxt; - cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu); + cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu); err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); if (err) { @@ -1550,7 +1539,6 @@ out_hyp: if (!in_hyp_mode) teardown_hyp_mode(); out_err: - teardown_common_resources(); return err; } From c97e166e54b662717d20ec2e36761758d2b6a7c2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:05 +0000 Subject: [PATCH 057/104] KVM: arm64: Change hyp_panic()s dependency on tpidr_el2 Make tpidr_el2 a cpu-offset for per-cpu variables in the same way the host uses tpidr_el1. This lets tpidr_el{1,2} have the same value, and on VHE they can be the same register. KVM calls hyp_panic() when anything unexpected happens. This may occur while a guest owns the EL1 registers. KVM stashes the vcpu pointer in tpidr_el2, which it uses to find the host context in order to restore the host EL1 registers before parachuting into the host's panic(). The host context is a struct kvm_cpu_context allocated in the per-cpu area, and mapped to hyp. Given the per-cpu offset for this CPU, this is easy to find. Change hyp_panic() to take a pointer to the struct kvm_cpu_context. Wrap these calls with an asm function that retrieves the struct kvm_cpu_context from the host's per-cpu area. Copy the per-cpu offset from the hosts tpidr_el1 into tpidr_el2 during kvm init. (Later patches will make this unnecessary for VHE hosts) We print out the vcpu pointer as part of the panic message. Add a back reference to the 'running vcpu' in the host cpu context to preserve this. Signed-off-by: James Morse Reviewed-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/hyp/hyp-entry.S | 12 ++++++++++++ arch/arm64/kvm/hyp/s2-setup.c | 3 +++ arch/arm64/kvm/hyp/switch.c | 25 +++++++++++++------------ 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ea6cb5b24258..7ee72b402907 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -192,6 +192,8 @@ struct kvm_cpu_context { u64 sys_regs[NR_SYS_REGS]; u32 copro[NR_COPRO_REGS]; }; + + struct kvm_vcpu *__hyp_running_vcpu; }; typedef struct kvm_cpu_context kvm_cpu_context_t; diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index fce7cc507e0a..e4f37b9dd47c 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -163,6 +163,18 @@ ENTRY(__hyp_do_panic) eret ENDPROC(__hyp_do_panic) +ENTRY(__hyp_panic) + /* + * '=kvm_host_cpu_state' is a host VA from the constant pool, it may + * not be accessible by this address from EL2, hyp_panic() converts + * it with kern_hyp_va() before use. + */ + ldr x0, =kvm_host_cpu_state + mrs x1, tpidr_el2 + add x0, x0, x1 + b hyp_panic +ENDPROC(__hyp_panic) + .macro invalid_vector label, target = __hyp_panic .align 2 \label: diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index 603e1ee83e89..74306c33a6de 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -86,5 +86,8 @@ u32 __hyp_text __init_stage2_translation(void) write_sysreg(val, vtcr_el2); + /* copy tpidr_el1 into tpidr_el2 for use by HYP */ + write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); + return parange; } diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 170e1917f83c..324f4202cdd5 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -306,9 +306,9 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) u64 exit_code; vcpu = kern_hyp_va(vcpu); - write_sysreg(vcpu, tpidr_el2); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); + host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; __sysreg_save_host_state(host_ctxt); @@ -443,7 +443,8 @@ again: static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; -static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par) +static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par, + struct kvm_vcpu *vcpu) { unsigned long str_va; @@ -457,35 +458,35 @@ static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par) __hyp_do_panic(str_va, spsr, elr, read_sysreg(esr_el2), read_sysreg_el2(far), - read_sysreg(hpfar_el2), par, - (void *)read_sysreg(tpidr_el2)); + read_sysreg(hpfar_el2), par, vcpu); } -static void __hyp_text __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par) +static void __hyp_text __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par, + struct kvm_vcpu *vcpu) { panic(__hyp_panic_string, spsr, elr, read_sysreg_el2(esr), read_sysreg_el2(far), - read_sysreg(hpfar_el2), par, - (void *)read_sysreg(tpidr_el2)); + read_sysreg(hpfar_el2), par, vcpu); } static hyp_alternate_select(__hyp_call_panic, __hyp_call_panic_nvhe, __hyp_call_panic_vhe, ARM64_HAS_VIRT_HOST_EXTN); -void __hyp_text __noreturn __hyp_panic(void) +void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *__host_ctxt) { + struct kvm_vcpu *vcpu = NULL; + u64 spsr = read_sysreg_el2(spsr); u64 elr = read_sysreg_el2(elr); u64 par = read_sysreg(par_el1); if (read_sysreg(vttbr_el2)) { - struct kvm_vcpu *vcpu; struct kvm_cpu_context *host_ctxt; - vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2); - host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); + host_ctxt = kern_hyp_va(__host_ctxt); + vcpu = host_ctxt->__hyp_running_vcpu; __timer_disable_traps(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); @@ -493,7 +494,7 @@ void __hyp_text __noreturn __hyp_panic(void) } /* Call panic for real */ - __hyp_call_panic()(spsr, elr, par); + __hyp_call_panic()(spsr, elr, par, vcpu); unreachable(); } From 6d99b68933fbcf51f84fcbba49246ce1209ec193 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:06 +0000 Subject: [PATCH 058/104] arm64: alternatives: use tpidr_el2 on VHE hosts Now that KVM uses tpidr_el2 in the same way as Linux's cpu_offset in tpidr_el1, merge the two. This saves KVM from save/restoring tpidr_el1 on VHE hosts, and allows future code to blindly access per-cpu variables without triggering world-switch. Signed-off-by: James Morse Reviewed-by: Christoffer Dall Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/alternative.h | 2 ++ arch/arm64/include/asm/assembler.h | 8 ++++++++ arch/arm64/include/asm/percpu.h | 11 +++++++++-- arch/arm64/kernel/alternative.c | 9 +++++---- arch/arm64/kernel/cpufeature.c | 17 +++++++++++++++++ arch/arm64/mm/proc.S | 8 ++++++++ 6 files changed, 49 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 4a85c6952a22..669028172fd6 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -12,6 +12,8 @@ #include #include +extern int alternatives_applied; + struct alt_instr { s32 orig_offset; /* offset to original instruction */ s32 alt_offset; /* offset to replacement instruction */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index a6f90b648655..5dc4856f3bb9 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -254,7 +254,11 @@ lr .req x30 // link register #else adr_l \dst, \sym #endif +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN mrs \tmp, tpidr_el1 +alternative_else + mrs \tmp, tpidr_el2 +alternative_endif add \dst, \dst, \tmp .endm @@ -265,7 +269,11 @@ lr .req x30 // link register */ .macro ldr_this_cpu dst, sym, tmp adr_l \dst, \sym +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN mrs \tmp, tpidr_el1 +alternative_else + mrs \tmp, tpidr_el2 +alternative_endif ldr \dst, [\dst, \tmp] .endm diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 3bd498e4de4c..43393208229e 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -16,11 +16,15 @@ #ifndef __ASM_PERCPU_H #define __ASM_PERCPU_H +#include #include static inline void set_my_cpu_offset(unsigned long off) { - asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory"); + asm volatile(ALTERNATIVE("msr tpidr_el1, %0", + "msr tpidr_el2, %0", + ARM64_HAS_VIRT_HOST_EXTN) + :: "r" (off) : "memory"); } static inline unsigned long __my_cpu_offset(void) @@ -31,7 +35,10 @@ static inline unsigned long __my_cpu_offset(void) * We want to allow caching the value, so avoid using volatile and * instead use a fake stack read to hazard against barrier(). */ - asm("mrs %0, tpidr_el1" : "=r" (off) : + asm(ALTERNATIVE("mrs %0, tpidr_el1", + "mrs %0, tpidr_el2", + ARM64_HAS_VIRT_HOST_EXTN) + : "=r" (off) : "Q" (*(const unsigned long *)current_stack_pointer)); return off; diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 6dd0a3a3e5c9..414288a558c8 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -32,6 +32,8 @@ #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) +int alternatives_applied; + struct alt_region { struct alt_instr *begin; struct alt_instr *end; @@ -143,7 +145,6 @@ static void __apply_alternatives(void *alt_region, bool use_linear_alias) */ static int __apply_alternatives_multi_stop(void *unused) { - static int patched = 0; struct alt_region region = { .begin = (struct alt_instr *)__alt_instructions, .end = (struct alt_instr *)__alt_instructions_end, @@ -151,14 +152,14 @@ static int __apply_alternatives_multi_stop(void *unused) /* We always have a CPU 0 at this point (__init) */ if (smp_processor_id()) { - while (!READ_ONCE(patched)) + while (!READ_ONCE(alternatives_applied)) cpu_relax(); isb(); } else { - BUG_ON(patched); + BUG_ON(alternatives_applied); __apply_alternatives(®ion, true); /* Barriers provided by the cache flushing */ - WRITE_ONCE(patched, 1); + WRITE_ONCE(alternatives_applied, 1); } return 0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index da6722db50b0..9ef84d0def9a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -886,6 +886,22 @@ static int __init parse_kpti(char *str) __setup("kpti=", parse_kpti); #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ +static int cpu_copy_el2regs(void *__unused) +{ + /* + * Copy register values that aren't redirected by hardware. + * + * Before code patching, we only set tpidr_el1, all CPUs need to copy + * this value to tpidr_el2 before we patch the code. Once we've done + * that, freshly-onlined CPUs will set tpidr_el2, so we don't need to + * do anything here. + */ + if (!alternatives_applied) + write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); + + return 0; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -955,6 +971,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_VIRT_HOST_EXTN, .def_scope = SCOPE_SYSTEM, .matches = runs_at_el2, + .enable = cpu_copy_el2regs, }, { .desc = "32-bit EL0 Support", diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index bc86f7ef8620..5a59eea49395 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -70,7 +70,11 @@ ENTRY(cpu_do_suspend) mrs x8, mdscr_el1 mrs x9, oslsr_el1 mrs x10, sctlr_el1 +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN mrs x11, tpidr_el1 +alternative_else + mrs x11, tpidr_el2 +alternative_endif mrs x12, sp_el0 stp x2, x3, [x0] stp x4, xzr, [x0, #16] @@ -116,7 +120,11 @@ ENTRY(cpu_do_resume) msr mdscr_el1, x10 msr sctlr_el1, x12 +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN msr tpidr_el1, x13 +alternative_else + msr tpidr_el2, x13 +alternative_endif msr sp_el0, x14 /* * Restore oslsr_el1 by writing oslar_el1 From 1f742679c33bc083722cb0b442a95d458c491b56 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:07 +0000 Subject: [PATCH 059/104] KVM: arm64: Stop save/restoring host tpidr_el1 on VHE Now that a VHE host uses tpidr_el2 for the cpu offset we no longer need KVM to save/restore tpidr_el1. Move this from the 'common' code into the non-vhe code. While we're at it, on VHE we don't need to save the ELR or SPSR as kernel_entry in entry.S will have pushed these onto the kernel stack, and will restore them from there. Move these to the non-vhe code as we need them to get back to the host. Finally remove the always-copy-tpidr we hid in the stage2 setup code, cpufeature's enable callback will do this for VHE, we only need KVM to do it for non-vhe. Add the copy into kvm-init instead. Signed-off-by: James Morse Reviewed-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm64/kvm/hyp-init.S | 4 ++++ arch/arm64/kvm/hyp/s2-setup.c | 3 --- arch/arm64/kvm/hyp/sysreg-sr.c | 16 ++++++++-------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 33c40b3eea01..8a00de187e56 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -120,6 +120,10 @@ CPU_BE( orr x4, x4, #SCTLR_ELx_EE) kern_hyp_va x2 msr vbar_el2, x2 + /* copy tpidr_el1 into tpidr_el2 for use by HYP */ + mrs x1, tpidr_el1 + msr tpidr_el2, x1 + /* Hello, World! */ eret ENDPROC(__kvm_hyp_init) diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index 74306c33a6de..603e1ee83e89 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -86,8 +86,5 @@ u32 __hyp_text __init_stage2_translation(void) write_sysreg(val, vtcr_el2); - /* copy tpidr_el1 into tpidr_el2 for use by HYP */ - write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); - return parange; } diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 934137647837..c54cc2afb92b 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -27,8 +27,8 @@ static void __hyp_text __sysreg_do_nothing(struct kvm_cpu_context *ctxt) { } /* * Non-VHE: Both host and guest must save everything. * - * VHE: Host must save tpidr*_el[01], actlr_el1, mdscr_el1, sp0, pc, - * pstate, and guest must save everything. + * VHE: Host must save tpidr*_el0, actlr_el1, mdscr_el1, sp_el0, + * and guest must save everything. */ static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt) @@ -36,11 +36,8 @@ static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt) ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); ctxt->sys_regs[TPIDR_EL0] = read_sysreg(tpidr_el0); ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0); - ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1); ctxt->gp_regs.regs.sp = read_sysreg(sp_el0); - ctxt->gp_regs.regs.pc = read_sysreg_el2(elr); - ctxt->gp_regs.regs.pstate = read_sysreg_el2(spsr); } static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) @@ -62,10 +59,13 @@ static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) ctxt->sys_regs[AMAIR_EL1] = read_sysreg_el1(amair); ctxt->sys_regs[CNTKCTL_EL1] = read_sysreg_el1(cntkctl); ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1); + ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1); ctxt->gp_regs.elr_el1 = read_sysreg_el1(elr); ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr); + ctxt->gp_regs.regs.pc = read_sysreg_el2(elr); + ctxt->gp_regs.regs.pstate = read_sysreg_el2(spsr); } static hyp_alternate_select(__sysreg_call_save_host_state, @@ -89,11 +89,8 @@ static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctx write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1); write_sysreg(ctxt->sys_regs[TPIDR_EL0], tpidr_el0); write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0); - write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1); write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1); write_sysreg(ctxt->gp_regs.regs.sp, sp_el0); - write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); - write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); } static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt) @@ -115,10 +112,13 @@ static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt->sys_regs[AMAIR_EL1], amair); write_sysreg_el1(ctxt->sys_regs[CNTKCTL_EL1], cntkctl); write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1); + write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1); write_sysreg(ctxt->gp_regs.sp_el1, sp_el1); write_sysreg_el1(ctxt->gp_regs.elr_el1, elr); write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr); + write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); + write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); } static hyp_alternate_select(__sysreg_call_restore_host_state, From 86f04f640058143388f56c048f86e66ea5204ae2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:08 +0000 Subject: [PATCH 060/104] Docs: dt: add devicetree binding for describing arm64 SDEI firmware The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications, or from an IRQ that has been promoted to a firmware-assisted NMI. Add a new devicetree binding to describe the SDE firmware interface. Signed-off-by: James Morse Acked-by: Rob Herring Signed-off-by: Catalin Marinas --- .../devicetree/bindings/arm/firmware/sdei.txt | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 Documentation/devicetree/bindings/arm/firmware/sdei.txt diff --git a/Documentation/devicetree/bindings/arm/firmware/sdei.txt b/Documentation/devicetree/bindings/arm/firmware/sdei.txt new file mode 100644 index 000000000000..ee3f0ff49889 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/firmware/sdei.txt @@ -0,0 +1,42 @@ +* Software Delegated Exception Interface (SDEI) + +Firmware implementing the SDEI functions described in ARM document number +ARM DEN 0054A ("Software Delegated Exception Interface") can be used by +Linux to receive notification of events such as those generated by +firmware-first error handling, or from an IRQ that has been promoted to +a firmware-assisted NMI. + +The interface provides a number of API functions for registering callbacks +and enabling/disabling events. Functions are invoked by trapping to the +privilege level of the SDEI firmware (specified as part of the binding +below) and passing arguments in a manner specified by the "SMC Calling +Convention (ARM DEN 0028B): + + r0 => 32-bit Function ID / return value + {r1 - r3} => Parameters + +Note that the immediate field of the trapping instruction must be set +to #0. + +The SDEI_EVENT_REGISTER function registers a callback in the kernel +text to handle the specified event number. + +The sdei node should be a child node of '/firmware' and have required +properties: + + - compatible : should contain: + * "arm,sdei-1.0" : For implementations complying to SDEI version 1.x. + + - method : The method of calling the SDEI firmware. Permitted + values are: + * "smc" : SMC #0, with the register assignments specified in this + binding. + * "hvc" : HVC #0, with the register assignments specified in this + binding. +Example: + firmware { + sdei { + compatible = "arm,sdei-1.0"; + method = "smc"; + }; + }; From ad6eb31ef90355993eb55ff77e0e855ae7d91e4c Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:09 +0000 Subject: [PATCH 061/104] firmware: arm_sdei: Add driver for Software Delegated Exceptions The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement firmware notifications (such as firmware-first RAS) or promote an IRQ that has been promoted to a firmware-assisted NMI. Add the code for detecting the SDEI version and the framework for registering and unregistering events. Subsequent patches will add the arch-specific backend code and the necessary power management hooks. Only shared events are supported, power management, private events and discovery for ACPI systems will be added by later patches. Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- MAINTAINERS | 9 + arch/arm64/include/asm/sdei.h | 8 + drivers/firmware/Kconfig | 8 + drivers/firmware/Makefile | 1 + drivers/firmware/arm_sdei.c | 619 ++++++++++++++++++++++++++++++++++ include/linux/arm_sdei.h | 79 +++++ include/uapi/linux/arm_sdei.h | 73 ++++ 7 files changed, 797 insertions(+) create mode 100644 arch/arm64/include/asm/sdei.h create mode 100644 drivers/firmware/arm_sdei.c create mode 100644 include/linux/arm_sdei.h create mode 100644 include/uapi/linux/arm_sdei.h diff --git a/MAINTAINERS b/MAINTAINERS index 82ad0eabce4f..c06885c4c0f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12618,6 +12618,15 @@ L: linux-media@vger.kernel.org S: Supported F: drivers/media/pci/solo6x10/ +SOFTWARE DELEGATED EXCEPTION INTERFACE (SDEI) +M: James Morse +L: linux-arm-kernel@lists.infradead.org +S: Maintained +F: Documentation/devicetree/bindings/arm/firmware/sdei.txt +F: drivers/firmware/arm_sdei.c +F: include/linux/sdei.h +F: include/uapi/linux/sdei.h + SOFTWARE RAID (Multiple Disks) SUPPORT M: Shaohua Li L: linux-raid@vger.kernel.org diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h new file mode 100644 index 000000000000..59f26b6e673d --- /dev/null +++ b/arch/arm64/include/asm/sdei.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Arm Ltd. +#ifndef __ASM_SDEI_H +#define __ASM_SDEI_H + +/* Later patches add the arch specific bits */ + +#endif /* __ASM_SDEI_H */ diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index fa87a055905e..e77f77caa0f3 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -48,6 +48,14 @@ config ARM_SCPI_POWER_DOMAIN This enables support for the SCPI power domains which can be enabled or disabled via the SCP firmware +config ARM_SDE_INTERFACE + bool "ARM Software Delegated Exception Interface (SDEI)" + depends on ARM64 + help + The Software Delegated Exception Interface (SDEI) is an ARM + standard for registering callbacks from the platform firmware + into the OS. This is typically used to implement RAS notifications. + config EDD tristate "BIOS Enhanced Disk Drive calls determine boot disk" depends on X86 diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index feaa890197f3..b248238ddc6a 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_ARM_PSCI_FW) += psci.o obj-$(CONFIG_ARM_PSCI_CHECKER) += psci_checker.o obj-$(CONFIG_ARM_SCPI_PROTOCOL) += arm_scpi.o obj-$(CONFIG_ARM_SCPI_POWER_DOMAIN) += scpi_pm_domain.o +obj-$(CONFIG_ARM_SDE_INTERFACE) += arm_sdei.o obj-$(CONFIG_DMI) += dmi_scan.o obj-$(CONFIG_DMI_SYSFS) += dmi-sysfs.o obj-$(CONFIG_EDD) += edd.o diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c new file mode 100644 index 000000000000..8da173cc7e43 --- /dev/null +++ b/drivers/firmware/arm_sdei.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Arm Ltd. +#define pr_fmt(fmt) "sdei: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The call to use to reach the firmware. + */ +static asmlinkage void (*sdei_firmware_call)(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, struct arm_smccc_res *res); + +/* entry point from firmware to arch asm code */ +static unsigned long sdei_entry_point; + +struct sdei_event { + struct list_head list; + u32 event_num; + u8 type; + u8 priority; + + /* This pointer is handed to firmware as the event argument. */ + struct sdei_registered_event *registered; +}; + +/* Take the mutex for any API call or modification. Take the mutex first. */ +static DEFINE_MUTEX(sdei_events_lock); + +/* and then hold this when modifying the list */ +static DEFINE_SPINLOCK(sdei_list_lock); +static LIST_HEAD(sdei_list); + +static int sdei_to_linux_errno(unsigned long sdei_err) +{ + switch (sdei_err) { + case SDEI_NOT_SUPPORTED: + return -EOPNOTSUPP; + case SDEI_INVALID_PARAMETERS: + return -EINVAL; + case SDEI_DENIED: + return -EPERM; + case SDEI_PENDING: + return -EINPROGRESS; + case SDEI_OUT_OF_RESOURCE: + return -ENOMEM; + } + + /* Not an error value ... */ + return sdei_err; +} + +/* + * If x0 is any of these values, then the call failed, use sdei_to_linux_errno() + * to translate. + */ +static int sdei_is_err(struct arm_smccc_res *res) +{ + switch (res->a0) { + case SDEI_NOT_SUPPORTED: + case SDEI_INVALID_PARAMETERS: + case SDEI_DENIED: + case SDEI_PENDING: + case SDEI_OUT_OF_RESOURCE: + return true; + } + + return false; +} + +static int invoke_sdei_fn(unsigned long function_id, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + u64 *result) +{ + int err = 0; + struct arm_smccc_res res; + + if (sdei_firmware_call) { + sdei_firmware_call(function_id, arg0, arg1, arg2, arg3, arg4, + &res); + if (sdei_is_err(&res)) + err = sdei_to_linux_errno(res.a0); + } else { + /* + * !sdei_firmware_call means we failed to probe or called + * sdei_mark_interface_broken(). -EIO is not an error returned + * by sdei_to_linux_errno() and is used to suppress messages + * from this driver. + */ + err = -EIO; + res.a0 = SDEI_NOT_SUPPORTED; + } + + if (result) + *result = res.a0; + + return err; +} + +static struct sdei_event *sdei_event_find(u32 event_num) +{ + struct sdei_event *e, *found = NULL; + + lockdep_assert_held(&sdei_events_lock); + + spin_lock(&sdei_list_lock); + list_for_each_entry(e, &sdei_list, list) { + if (e->event_num == event_num) { + found = e; + break; + } + } + spin_unlock(&sdei_list_lock); + + return found; +} + +int sdei_api_event_context(u32 query, u64 *result) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_CONTEXT, query, 0, 0, 0, 0, + result); +} +NOKPROBE_SYMBOL(sdei_api_event_context); + +static int sdei_api_event_get_info(u32 event, u32 info, u64 *result) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_GET_INFO, event, info, 0, + 0, 0, result); +} + +static struct sdei_event *sdei_event_create(u32 event_num, + sdei_event_callback *cb, + void *cb_arg) +{ + int err; + u64 result; + struct sdei_event *event; + struct sdei_registered_event *reg; + + lockdep_assert_held(&sdei_events_lock); + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&event->list); + event->event_num = event_num; + + err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_PRIORITY, + &result); + if (err) { + kfree(event); + return ERR_PTR(err); + } + event->priority = result; + + err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_TYPE, + &result); + if (err) { + kfree(event); + return ERR_PTR(err); + } + event->type = result; + + if (event->type == SDEI_EVENT_TYPE_SHARED) { + reg = kzalloc(sizeof(*reg), GFP_KERNEL); + if (!reg) { + kfree(event); + return ERR_PTR(-ENOMEM); + } + + reg->event_num = event_num; + reg->priority = event->priority; + + reg->callback = cb; + reg->callback_arg = cb_arg; + event->registered = reg; + } + + if (sdei_event_find(event_num)) { + kfree(event->registered); + kfree(event); + event = ERR_PTR(-EBUSY); + } else { + spin_lock(&sdei_list_lock); + list_add(&event->list, &sdei_list); + spin_unlock(&sdei_list_lock); + } + + return event; +} + +static void sdei_event_destroy(struct sdei_event *event) +{ + lockdep_assert_held(&sdei_events_lock); + + spin_lock(&sdei_list_lock); + list_del(&event->list); + spin_unlock(&sdei_list_lock); + + if (event->type == SDEI_EVENT_TYPE_SHARED) + kfree(event->registered); + + kfree(event); +} + +static int sdei_api_get_version(u64 *version) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_VERSION, 0, 0, 0, 0, 0, version); +} + +int sdei_mask_local_cpu(void) +{ + int err; + + WARN_ON_ONCE(preemptible()); + + err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_MASK, 0, 0, 0, 0, 0, NULL); + if (err && err != -EIO) { + pr_warn_once("failed to mask CPU[%u]: %d\n", + smp_processor_id(), err); + return err; + } + + return 0; +} + +static void _ipi_mask_cpu(void *ignored) +{ + sdei_mask_local_cpu(); +} + +int sdei_unmask_local_cpu(void) +{ + int err; + + WARN_ON_ONCE(preemptible()); + + err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PE_UNMASK, 0, 0, 0, 0, 0, NULL); + if (err && err != -EIO) { + pr_warn_once("failed to unmask CPU[%u]: %d\n", + smp_processor_id(), err); + return err; + } + + return 0; +} + +static void _ipi_unmask_cpu(void *ignored) +{ + sdei_unmask_local_cpu(); +} + +static void _ipi_private_reset(void *ignored) +{ + int err; + + err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_PRIVATE_RESET, 0, 0, 0, 0, 0, + NULL); + if (err && err != -EIO) + pr_warn_once("failed to reset CPU[%u]: %d\n", + smp_processor_id(), err); +} + +static int sdei_api_shared_reset(void) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_SHARED_RESET, 0, 0, 0, 0, 0, + NULL); +} + +static void sdei_mark_interface_broken(void) +{ + pr_err("disabling SDEI firmware interface\n"); + on_each_cpu(&_ipi_mask_cpu, NULL, true); + sdei_firmware_call = NULL; +} + +static int sdei_platform_reset(void) +{ + int err; + + on_each_cpu(&_ipi_private_reset, NULL, true); + err = sdei_api_shared_reset(); + if (err) { + pr_err("Failed to reset platform: %d\n", err); + sdei_mark_interface_broken(); + } + + return err; +} + +static int sdei_api_event_enable(u32 event_num) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_ENABLE, event_num, 0, 0, 0, + 0, NULL); +} + +int sdei_event_enable(u32 event_num) +{ + int err = -EINVAL; + struct sdei_event *event; + + mutex_lock(&sdei_events_lock); + event = sdei_event_find(event_num); + if (!event) { + mutex_unlock(&sdei_events_lock); + return -ENOENT; + } + + if (event->type == SDEI_EVENT_TYPE_SHARED) + err = sdei_api_event_enable(event->event_num); + mutex_unlock(&sdei_events_lock); + + return err; +} +EXPORT_SYMBOL(sdei_event_enable); + +static int sdei_api_event_disable(u32 event_num) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_DISABLE, event_num, 0, 0, + 0, 0, NULL); +} + +int sdei_event_disable(u32 event_num) +{ + int err = -EINVAL; + struct sdei_event *event; + + mutex_lock(&sdei_events_lock); + event = sdei_event_find(event_num); + if (!event) { + mutex_unlock(&sdei_events_lock); + return -ENOENT; + } + + if (event->type == SDEI_EVENT_TYPE_SHARED) + err = sdei_api_event_disable(event->event_num); + mutex_unlock(&sdei_events_lock); + + return err; +} +EXPORT_SYMBOL(sdei_event_disable); + +static int sdei_api_event_unregister(u32 event_num) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_UNREGISTER, event_num, 0, + 0, 0, 0, NULL); +} + +static int _sdei_event_unregister(struct sdei_event *event) +{ + lockdep_assert_held(&sdei_events_lock); + + if (event->type == SDEI_EVENT_TYPE_SHARED) + return sdei_api_event_unregister(event->event_num); + + return -EINVAL; +} + +int sdei_event_unregister(u32 event_num) +{ + int err; + struct sdei_event *event; + + WARN_ON(in_nmi()); + + mutex_lock(&sdei_events_lock); + event = sdei_event_find(event_num); + do { + if (!event) { + pr_warn("Event %u not registered\n", event_num); + err = -ENOENT; + break; + } + + err = _sdei_event_unregister(event); + if (err) + break; + + sdei_event_destroy(event); + } while (0); + mutex_unlock(&sdei_events_lock); + + return err; +} +EXPORT_SYMBOL(sdei_event_unregister); + +static int sdei_api_event_register(u32 event_num, unsigned long entry_point, + void *arg, u64 flags, u64 affinity) +{ + return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_REGISTER, event_num, + (unsigned long)entry_point, (unsigned long)arg, + flags, affinity, NULL); +} + +static int _sdei_event_register(struct sdei_event *event) +{ + lockdep_assert_held(&sdei_events_lock); + + if (event->type == SDEI_EVENT_TYPE_SHARED) + return sdei_api_event_register(event->event_num, + sdei_entry_point, + event->registered, + SDEI_EVENT_REGISTER_RM_ANY, 0); + + return -EINVAL; +} + +int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg) +{ + int err; + struct sdei_event *event; + + WARN_ON(in_nmi()); + + mutex_lock(&sdei_events_lock); + do { + if (sdei_event_find(event_num)) { + pr_warn("Event %u already registered\n", event_num); + err = -EBUSY; + break; + } + + event = sdei_event_create(event_num, cb, arg); + if (IS_ERR(event)) { + err = PTR_ERR(event); + pr_warn("Failed to create event %u: %d\n", event_num, + err); + break; + } + + err = _sdei_event_register(event); + if (err) { + sdei_event_destroy(event); + pr_warn("Failed to register event %u: %d\n", event_num, + err); + } + } while (0); + mutex_unlock(&sdei_events_lock); + + return err; +} +EXPORT_SYMBOL(sdei_event_register); + +static void sdei_smccc_smc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, struct arm_smccc_res *res) +{ + arm_smccc_smc(function_id, arg0, arg1, arg2, arg3, arg4, 0, 0, res); +} + +static void sdei_smccc_hvc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, struct arm_smccc_res *res) +{ + arm_smccc_hvc(function_id, arg0, arg1, arg2, arg3, arg4, 0, 0, res); +} + +static int sdei_get_conduit(struct platform_device *pdev) +{ + const char *method; + struct device_node *np = pdev->dev.of_node; + + sdei_firmware_call = NULL; + if (np) { + if (of_property_read_string(np, "method", &method)) { + pr_warn("missing \"method\" property\n"); + return CONDUIT_INVALID; + } + + if (!strcmp("hvc", method)) { + sdei_firmware_call = &sdei_smccc_hvc; + return CONDUIT_HVC; + } else if (!strcmp("smc", method)) { + sdei_firmware_call = &sdei_smccc_smc; + return CONDUIT_SMC; + } + + pr_warn("invalid \"method\" property: %s\n", method); + } + + return CONDUIT_INVALID; +} + +static int sdei_probe(struct platform_device *pdev) +{ + int err; + u64 ver = 0; + int conduit; + + conduit = sdei_get_conduit(pdev); + if (!sdei_firmware_call) + return 0; + + err = sdei_api_get_version(&ver); + if (err == -EOPNOTSUPP) + pr_err("advertised but not implemented in platform firmware\n"); + if (err) { + pr_err("Failed to get SDEI version: %d\n", err); + sdei_mark_interface_broken(); + return err; + } + + pr_info("SDEIv%d.%d (0x%x) detected in firmware.\n", + (int)SDEI_VERSION_MAJOR(ver), (int)SDEI_VERSION_MINOR(ver), + (int)SDEI_VERSION_VENDOR(ver)); + + if (SDEI_VERSION_MAJOR(ver) != 1) { + pr_warn("Conflicting SDEI version detected.\n"); + sdei_mark_interface_broken(); + return -EINVAL; + } + + err = sdei_platform_reset(); + if (err) + return err; + + sdei_entry_point = sdei_arch_get_entry_point(conduit); + if (!sdei_entry_point) { + /* Not supported due to hardware or boot configuration */ + sdei_mark_interface_broken(); + return 0; + } + + on_each_cpu(&_ipi_unmask_cpu, NULL, false); + + return 0; +} + +static const struct of_device_id sdei_of_match[] = { + { .compatible = "arm,sdei-1.0" }, + {} +}; + +static struct platform_driver sdei_driver = { + .driver = { + .name = "sdei", + .of_match_table = sdei_of_match, + }, + .probe = sdei_probe, +}; + +static bool __init sdei_present_dt(void) +{ + struct platform_device *pdev; + struct device_node *np, *fw_np; + + fw_np = of_find_node_by_name(NULL, "firmware"); + if (!fw_np) + return false; + + np = of_find_matching_node(fw_np, sdei_of_match); + of_node_put(fw_np); + if (!np) + return false; + + pdev = of_platform_device_create(np, sdei_driver.driver.name, NULL); + of_node_put(np); + if (IS_ERR(pdev)) + return false; + + return true; +} + +static int __init sdei_init(void) +{ + if (sdei_present_dt()) + platform_driver_register(&sdei_driver); + + return 0; +} + +subsys_initcall_sync(sdei_init); + +int sdei_event_handler(struct pt_regs *regs, + struct sdei_registered_event *arg) +{ + int err; + mm_segment_t orig_addr_limit; + u32 event_num = arg->event_num; + + orig_addr_limit = get_fs(); + set_fs(USER_DS); + + err = arg->callback(event_num, regs, arg->callback_arg); + if (err) + pr_err_ratelimited("event %u on CPU %u failed with error: %d\n", + event_num, smp_processor_id(), err); + + set_fs(orig_addr_limit); + + return err; +} +NOKPROBE_SYMBOL(sdei_event_handler); diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h new file mode 100644 index 000000000000..942afbd544b7 --- /dev/null +++ b/include/linux/arm_sdei.h @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Arm Ltd. +#ifndef __LINUX_ARM_SDEI_H +#define __LINUX_ARM_SDEI_H + +#include + +enum sdei_conduit_types { + CONDUIT_INVALID = 0, + CONDUIT_SMC, + CONDUIT_HVC, +}; + +#include + +/* Arch code should override this to set the entry point from firmware... */ +#ifndef sdei_arch_get_entry_point +#define sdei_arch_get_entry_point(conduit) (0) +#endif + +/* + * When an event occurs sdei_event_handler() will call a user-provided callback + * like this in NMI context on the CPU that received the event. + */ +typedef int (sdei_event_callback)(u32 event, struct pt_regs *regs, void *arg); + +/* + * Register your callback to claim an event. The event must be described + * by firmware. + */ +int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg); + +/* + * Calls to sdei_event_unregister() may return EINPROGRESS. Keep calling + * it until it succeeds. + */ +int sdei_event_unregister(u32 event_num); + +int sdei_event_enable(u32 event_num); +int sdei_event_disable(u32 event_num); + +#ifdef CONFIG_ARM_SDE_INTERFACE +/* For use by arch code when CPU hotplug notifiers are not appropriate. */ +int sdei_mask_local_cpu(void); +int sdei_unmask_local_cpu(void); +#else +static inline int sdei_mask_local_cpu(void) { return 0; } +static inline int sdei_unmask_local_cpu(void) { return 0; } +#endif /* CONFIG_ARM_SDE_INTERFACE */ + + +/* + * This struct represents an event that has been registered. The driver + * maintains a list of all events, and which ones are registered. (Private + * events have one entry in the list, but are registered on each CPU). + * A pointer to this struct is passed to firmware, and back to the event + * handler. The event handler can then use this to invoke the registered + * callback, without having to walk the list. + * + * For CPU private events, this structure is per-cpu. + */ +struct sdei_registered_event { + /* For use by arch code: */ + struct pt_regs interrupted_regs; + + sdei_event_callback *callback; + void *callback_arg; + u32 event_num; + u8 priority; +}; + +/* The arch code entry point should then call this when an event arrives. */ +int notrace sdei_event_handler(struct pt_regs *regs, + struct sdei_registered_event *arg); + +/* arch code may use this to retrieve the extra registers. */ +int sdei_api_event_context(u32 query, u64 *result); + +#endif /* __LINUX_ARM_SDEI_H */ diff --git a/include/uapi/linux/arm_sdei.h b/include/uapi/linux/arm_sdei.h new file mode 100644 index 000000000000..af0630ba5437 --- /dev/null +++ b/include/uapi/linux/arm_sdei.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (C) 2017 Arm Ltd. */ +#ifndef _UAPI_LINUX_ARM_SDEI_H +#define _UAPI_LINUX_ARM_SDEI_H + +#define SDEI_1_0_FN_BASE 0xC4000020 +#define SDEI_1_0_MASK 0xFFFFFFE0 +#define SDEI_1_0_FN(n) (SDEI_1_0_FN_BASE + (n)) + +#define SDEI_1_0_FN_SDEI_VERSION SDEI_1_0_FN(0x00) +#define SDEI_1_0_FN_SDEI_EVENT_REGISTER SDEI_1_0_FN(0x01) +#define SDEI_1_0_FN_SDEI_EVENT_ENABLE SDEI_1_0_FN(0x02) +#define SDEI_1_0_FN_SDEI_EVENT_DISABLE SDEI_1_0_FN(0x03) +#define SDEI_1_0_FN_SDEI_EVENT_CONTEXT SDEI_1_0_FN(0x04) +#define SDEI_1_0_FN_SDEI_EVENT_COMPLETE SDEI_1_0_FN(0x05) +#define SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME SDEI_1_0_FN(0x06) +#define SDEI_1_0_FN_SDEI_EVENT_UNREGISTER SDEI_1_0_FN(0x07) +#define SDEI_1_0_FN_SDEI_EVENT_STATUS SDEI_1_0_FN(0x08) +#define SDEI_1_0_FN_SDEI_EVENT_GET_INFO SDEI_1_0_FN(0x09) +#define SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET SDEI_1_0_FN(0x0A) +#define SDEI_1_0_FN_SDEI_PE_MASK SDEI_1_0_FN(0x0B) +#define SDEI_1_0_FN_SDEI_PE_UNMASK SDEI_1_0_FN(0x0C) +#define SDEI_1_0_FN_SDEI_INTERRUPT_BIND SDEI_1_0_FN(0x0D) +#define SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE SDEI_1_0_FN(0x0E) +#define SDEI_1_0_FN_SDEI_PRIVATE_RESET SDEI_1_0_FN(0x11) +#define SDEI_1_0_FN_SDEI_SHARED_RESET SDEI_1_0_FN(0x12) + +#define SDEI_VERSION_MAJOR_SHIFT 48 +#define SDEI_VERSION_MAJOR_MASK 0x7fff +#define SDEI_VERSION_MINOR_SHIFT 32 +#define SDEI_VERSION_MINOR_MASK 0xffff +#define SDEI_VERSION_VENDOR_SHIFT 0 +#define SDEI_VERSION_VENDOR_MASK 0xffffffff + +#define SDEI_VERSION_MAJOR(x) (x>>SDEI_VERSION_MAJOR_SHIFT & SDEI_VERSION_MAJOR_MASK) +#define SDEI_VERSION_MINOR(x) (x>>SDEI_VERSION_MINOR_SHIFT & SDEI_VERSION_MINOR_MASK) +#define SDEI_VERSION_VENDOR(x) (x>>SDEI_VERSION_VENDOR_SHIFT & SDEI_VERSION_VENDOR_MASK) + +/* SDEI return values */ +#define SDEI_SUCCESS 0 +#define SDEI_NOT_SUPPORTED -1 +#define SDEI_INVALID_PARAMETERS -2 +#define SDEI_DENIED -3 +#define SDEI_PENDING -5 +#define SDEI_OUT_OF_RESOURCE -10 + +/* EVENT_REGISTER flags */ +#define SDEI_EVENT_REGISTER_RM_ANY 0 +#define SDEI_EVENT_REGISTER_RM_PE 1 + +/* EVENT_STATUS return value bits */ +#define SDEI_EVENT_STATUS_RUNNING 2 +#define SDEI_EVENT_STATUS_ENABLED 1 +#define SDEI_EVENT_STATUS_REGISTERED 0 + +/* EVENT_COMPLETE status values */ +#define SDEI_EV_HANDLED 0 +#define SDEI_EV_FAILED 1 + +/* GET_INFO values */ +#define SDEI_EVENT_INFO_EV_TYPE 0 +#define SDEI_EVENT_INFO_EV_SIGNALED 1 +#define SDEI_EVENT_INFO_EV_PRIORITY 2 +#define SDEI_EVENT_INFO_EV_ROUTING_MODE 3 +#define SDEI_EVENT_INFO_EV_ROUTING_AFF 4 + +/* and their results */ +#define SDEI_EVENT_TYPE_PRIVATE 0 +#define SDEI_EVENT_TYPE_SHARED 1 +#define SDEI_EVENT_PRIORITY_NORMAL 0 +#define SDEI_EVENT_PRIORITY_CRITICAL 1 + +#endif /* _UAPI_LINUX_ARM_SDEI_H */ From ed8b20d457d72e9e2a30533b436fdb4ea1c70b38 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:10 +0000 Subject: [PATCH 062/104] arm64: Add vmap_stack header file Today the arm64 arch code allocates an extra IRQ stack per-cpu. If we also have SDEI and VMAP stacks we need two extra per-cpu VMAP stacks. Move the VMAP stack allocation out to a helper in a new header file. This avoids missing THREADINFO_GFP, or getting the all-important alignment wrong. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/vmap_stack.h | 28 ++++++++++++++++++++++++++++ arch/arm64/kernel/irq.c | 13 ++----------- 2 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/vmap_stack.h diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h new file mode 100644 index 000000000000..0b5ec6e08c10 --- /dev/null +++ b/arch/arm64/include/asm/vmap_stack.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Arm Ltd. +#ifndef __ASM_VMAP_STACK_H +#define __ASM_VMAP_STACK_H + +#include +#include +#include +#include +#include +#include +#include + +/* + * To ensure that VMAP'd stack overflow detection works correctly, all VMAP'd + * stacks need to have the same alignment. + */ +static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) +{ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); + + return __vmalloc_node_range(stack_size, THREAD_ALIGN, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP, PAGE_KERNEL, 0, node, + __builtin_return_address(0)); +} + +#endif /* __ASM_VMAP_STACK_H */ diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 713561e5bcab..60e5fc661f74 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -29,6 +29,7 @@ #include #include #include +#include unsigned long irq_err_count; @@ -58,17 +59,7 @@ static void init_irq_stacks(void) unsigned long *p; for_each_possible_cpu(cpu) { - /* - * To ensure that VMAP'd stack overflow detection works - * correctly, the IRQ stacks need to have the same - * alignment as other stacks. - */ - p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, PAGE_KERNEL, - 0, cpu_to_node(cpu), - __builtin_return_address(0)); - + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } From e1281f56f114f3a945bf9ec30698bd3caa59d322 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:11 +0000 Subject: [PATCH 063/104] arm64: uaccess: Add PAN helper Add __uaccess_{en,dis}able_hw_pan() helpers to set/clear the PSTATE.PAN bit. Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/uaccess.h | 12 ++++++++++++ arch/arm64/kernel/suspend.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 6eadf55ebaf0..3821fab01d7d 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -168,6 +168,18 @@ static inline bool uaccess_ttbr0_enable(void) } #endif +static inline void __uaccess_disable_hw_pan(void) +{ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, + CONFIG_ARM64_PAN)); +} + +static inline void __uaccess_enable_hw_pan(void) +{ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, + CONFIG_ARM64_PAN)); +} + #define __uaccess_disable(alt) \ do { \ if (!uaccess_ttbr0_disable()) \ diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 3fe5ad884418..a307b9e13392 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -51,8 +52,7 @@ void notrace __cpu_suspend_exit(void) * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, - CONFIG_ARM64_PAN)); + __uaccess_enable_hw_pan(); uao_thread_switch(current); /* From f5df26961853d6809d704cedcaf082c57f635a64 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:12 +0000 Subject: [PATCH 064/104] arm64: kernel: Add arch-specific SDEI entry code and CPU masking The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications. Such notifications enter the kernel at the registered entry-point with the register values of the interrupted CPU context. Because this is not a CPU exception, it cannot reuse the existing entry code. (crucially we don't implicitly know which exception level we interrupted), Add the entry point to entry.S to set us up for calling into C code. If the event interrupted code that had interrupts masked, we always return to that location. Otherwise we pretend this was an IRQ, and use SDEI's complete_and_resume call to return to vbar_el1 + offset. This allows the kernel to deliver signals to user space processes. For KVM this triggers the world switch, a quick spin round vcpu_run, then back into the guest, unless there are pending signals. Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers the panic() code-path, which doesn't invoke cpuhotplug notifiers. Because we can interrupt entry-from/exit-to another EL, we can't trust the value in sp_el0 or x29, even if we interrupted the kernel, in this case the code in entry.S will save/restore sp_el0 and use the value in __entry_task. When we have VMAP stacks we can interrupt the stack-overflow test, which stirs x0 into sp, meaning we have to have our own VMAP stacks. For now these are allocated when we probe the interface. Future patches will add refcounting hooks to allow the arch code to allocate them lazily. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sdei.h | 47 +++++- arch/arm64/include/asm/stacktrace.h | 3 + arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/asm-offsets.c | 5 + arch/arm64/kernel/entry.S | 101 +++++++++++++ arch/arm64/kernel/sdei.c | 219 ++++++++++++++++++++++++++++ arch/arm64/kernel/smp.c | 11 +- 7 files changed, 384 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/kernel/sdei.c diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h index 59f26b6e673d..d58a31ab525a 100644 --- a/arch/arm64/include/asm/sdei.h +++ b/arch/arm64/include/asm/sdei.h @@ -3,6 +3,49 @@ #ifndef __ASM_SDEI_H #define __ASM_SDEI_H -/* Later patches add the arch specific bits */ +/* Values for sdei_exit_mode */ +#define SDEI_EXIT_HVC 0 +#define SDEI_EXIT_SMC 1 -#endif /* __ASM_SDEI_H */ +#define SDEI_STACK_SIZE IRQ_STACK_SIZE + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +extern unsigned long sdei_exit_mode; + +/* Software Delegated Exception entry point from firmware*/ +asmlinkage void __sdei_asm_handler(unsigned long event_num, unsigned long arg, + unsigned long pc, unsigned long pstate); + +/* + * The above entry point does the minimum to call C code. This function does + * anything else, before calling the driver. + */ +struct sdei_registered_event; +asmlinkage unsigned long __sdei_handler(struct pt_regs *regs, + struct sdei_registered_event *arg); + +unsigned long sdei_arch_get_entry_point(int conduit); +#define sdei_arch_get_entry_point(x) sdei_arch_get_entry_point(x) + +bool _on_sdei_stack(unsigned long sp); +static inline bool on_sdei_stack(unsigned long sp) +{ + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return false; + if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE)) + return false; + if (in_nmi()) + return _on_sdei_stack(sp); + + return false; +} + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_SDEI_H */ diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 6ad30776e984..472ef944e932 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -22,6 +22,7 @@ #include #include +#include struct stackframe { unsigned long fp; @@ -85,6 +86,8 @@ static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp return true; if (on_overflow_stack(sp)) return true; + if (on_sdei_stack(sp)) + return true; return false; } diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 0c760db04858..b87541360f43 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -52,6 +52,7 @@ arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ arm64-obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +arm64-obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o ifeq ($(CONFIG_KVM),y) arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index af247d10252f..1dcc493f5765 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -18,6 +18,7 @@ * along with this program. If not, see . */ +#include #include #include #include @@ -157,6 +158,10 @@ int main(void) BLANK(); #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 DEFINE(TRAMP_VALIAS, TRAMP_VALIAS); +#endif +#ifdef CONFIG_ARM_SDE_INTERFACE + DEFINE(SDEI_EVENT_INTREGS, offsetof(struct sdei_registered_event, interrupted_regs)); + DEFINE(SDEI_EVENT_PRIORITY, offsetof(struct sdei_registered_event, priority)); #endif return 0; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 07a7d4db8ec4..40bf5083d182 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1153,3 +1153,104 @@ ENTRY(ret_from_fork) b ret_to_user ENDPROC(ret_from_fork) NOKPROBE(ret_from_fork) + +#ifdef CONFIG_ARM_SDE_INTERFACE + +#include +#include + +/* + * Software Delegated Exception entry point. + * + * x0: Event number + * x1: struct sdei_registered_event argument from registration time. + * x2: interrupted PC + * x3: interrupted PSTATE + * + * Firmware has preserved x0->x17 for us, we must save/restore the rest to + * follow SMC-CC. We save (or retrieve) all the registers as the handler may + * want them. + */ +ENTRY(__sdei_asm_handler) + stp x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC] + stp x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2] + stp x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3] + stp x8, x9, [x1, #SDEI_EVENT_INTREGS + 16 * 4] + stp x10, x11, [x1, #SDEI_EVENT_INTREGS + 16 * 5] + stp x12, x13, [x1, #SDEI_EVENT_INTREGS + 16 * 6] + stp x14, x15, [x1, #SDEI_EVENT_INTREGS + 16 * 7] + stp x16, x17, [x1, #SDEI_EVENT_INTREGS + 16 * 8] + stp x18, x19, [x1, #SDEI_EVENT_INTREGS + 16 * 9] + stp x20, x21, [x1, #SDEI_EVENT_INTREGS + 16 * 10] + stp x22, x23, [x1, #SDEI_EVENT_INTREGS + 16 * 11] + stp x24, x25, [x1, #SDEI_EVENT_INTREGS + 16 * 12] + stp x26, x27, [x1, #SDEI_EVENT_INTREGS + 16 * 13] + stp x28, x29, [x1, #SDEI_EVENT_INTREGS + 16 * 14] + mov x4, sp + stp lr, x4, [x1, #SDEI_EVENT_INTREGS + S_LR] + + mov x19, x1 + +#ifdef CONFIG_VMAP_STACK + /* + * entry.S may have been using sp as a scratch register, find whether + * this is a normal or critical event and switch to the appropriate + * stack for this CPU. + */ + ldrb w4, [x19, #SDEI_EVENT_PRIORITY] + cbnz w4, 1f + ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 + b 2f +1: ldr_this_cpu dst=x5, sym=sdei_stack_critical_ptr, tmp=x6 +2: mov x6, #SDEI_STACK_SIZE + add x5, x5, x6 + mov sp, x5 +#endif + + /* + * We may have interrupted userspace, or a guest, or exit-from or + * return-to either of these. We can't trust sp_el0, restore it. + */ + mrs x28, sp_el0 + ldr_this_cpu dst=x0, sym=__entry_task, tmp=x1 + msr sp_el0, x0 + + /* If we interrupted the kernel point to the previous stack/frame. */ + and x0, x3, #0xc + mrs x1, CurrentEL + cmp x0, x1 + csel x29, x29, xzr, eq // fp, or zero + csel x4, x2, xzr, eq // elr, or zero + + stp x29, x4, [sp, #-16]! + mov x29, sp + + add x0, x19, #SDEI_EVENT_INTREGS + mov x1, x19 + bl __sdei_handler + + msr sp_el0, x28 + /* restore regs >x17 that we clobbered */ + ldp x28, x29, [x19, #SDEI_EVENT_INTREGS + 16 * 14] + ldp lr, x4, [x19, #SDEI_EVENT_INTREGS + S_LR] + mov sp, x4 + ldp x18, x19, [x19, #SDEI_EVENT_INTREGS + 16 * 9] + + mov x1, x0 // address to complete_and_resume + /* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */ + cmp x0, #1 + mov_q x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE + mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME + csel x0, x2, x3, ls + + /* On success, this call never returns... */ + ldr_l x2, sdei_exit_mode + cmp x2, #SDEI_EXIT_SMC + b.ne 1f + smc #0 + b . +1: hvc #0 + b . +ENDPROC(__sdei_asm_handler) +NOKPROBE(__sdei_asm_handler) +#endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c new file mode 100644 index 000000000000..f9dffacaa5d6 --- /dev/null +++ b/arch/arm64/kernel/sdei.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Arm Ltd. +#define pr_fmt(fmt) "sdei: " fmt + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +unsigned long sdei_exit_mode; + +/* + * VMAP'd stacks checking for stack overflow on exception using sp as a scratch + * register, meaning SDEI has to switch to its own stack. We need two stacks as + * a critical event may interrupt a normal event that has just taken a + * synchronous exception, and is using sp as scratch register. For a critical + * event interrupting a normal event, we can't reliably tell if we were on the + * sdei stack. + * For now, we allocate stacks when the driver is probed. + */ +DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); + +#ifdef CONFIG_VMAP_STACK +DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); +#endif + +static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu) +{ + unsigned long *p; + + p = per_cpu(*ptr, cpu); + if (p) { + per_cpu(*ptr, cpu) = NULL; + vfree(p); + } +} + +static void free_sdei_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + _free_sdei_stack(&sdei_stack_normal_ptr, cpu); + _free_sdei_stack(&sdei_stack_critical_ptr, cpu); + } +} + +static int _init_sdei_stack(unsigned long * __percpu *ptr, int cpu) +{ + unsigned long *p; + + p = arch_alloc_vmap_stack(SDEI_STACK_SIZE, cpu_to_node(cpu)); + if (!p) + return -ENOMEM; + per_cpu(*ptr, cpu) = p; + + return 0; +} + +static int init_sdei_stacks(void) +{ + int cpu; + int err = 0; + + for_each_possible_cpu(cpu) { + err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_stack(&sdei_stack_critical_ptr, cpu); + if (err) + break; + } + + if (err) + free_sdei_stacks(); + + return err; +} + +bool _on_sdei_stack(unsigned long sp) +{ + unsigned long low, high; + + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return false; + + low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); + high = low + SDEI_STACK_SIZE; + + if (low <= sp && sp < high) + return true; + + low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); + high = low + SDEI_STACK_SIZE; + + return (low <= sp && sp < high); +} + +unsigned long sdei_arch_get_entry_point(int conduit) +{ + /* + * SDEI works between adjacent exception levels. If we booted at EL1 we + * assume a hypervisor is marshalling events. If we booted at EL2 and + * dropped to EL1 because we don't support VHE, then we can't support + * SDEI. + */ + if (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) { + pr_err("Not supported on this hardware/boot configuration\n"); + return 0; + } + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + if (init_sdei_stacks()) + return 0; + } + + sdei_exit_mode = (conduit == CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; + return (unsigned long)__sdei_asm_handler; +} + +/* + * __sdei_handler() returns one of: + * SDEI_EV_HANDLED - success, return to the interrupted context. + * SDEI_EV_FAILED - failure, return this error code to firmare. + * virtual-address - success, return to this address. + */ +static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, + struct sdei_registered_event *arg) +{ + u32 mode; + int i, err = 0; + const int clobbered_registers = 4; + u64 elr = read_sysreg(elr_el1); + u32 kernel_mode = read_sysreg(CurrentEL) | 1; /* +SPSel */ + unsigned long vbar = read_sysreg(vbar_el1); + + /* Retrieve the missing registers values */ + for (i = 0; i < clobbered_registers; i++) { + /* from within the handler, this call always succeeds */ + sdei_api_event_context(i, ®s->regs[i]); + } + + /* + * We didn't take an exception to get here, set PAN. UAO will be cleared + * by sdei_event_handler()s set_fs(USER_DS) call. + */ + __uaccess_enable_hw_pan(); + + err = sdei_event_handler(regs, arg); + if (err) + return SDEI_EV_FAILED; + + if (elr != read_sysreg(elr_el1)) { + /* + * We took a synchronous exception from the SDEI handler. + * This could deadlock, and if you interrupt KVM it will + * hyp-panic instead. + */ + pr_warn("unsafe: exception during handler\n"); + } + + mode = regs->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK); + + /* + * If we interrupted the kernel with interrupts masked, we always go + * back to wherever we came from. + */ + if (mode == kernel_mode && !interrupts_enabled(regs)) + return SDEI_EV_HANDLED; + + /* + * Otherwise, we pretend this was an IRQ. This lets user space tasks + * receive signals before we return to them, and KVM to invoke it's + * world switch to do the same. + * + * See DDI0487B.a Table D1-7 'Vector offsets from vector table base + * address'. + */ + if (mode == kernel_mode) + return vbar + 0x280; + else if (mode & PSR_MODE32_BIT) + return vbar + 0x680; + + return vbar + 0x480; +} + + +asmlinkage __kprobes notrace unsigned long +__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) +{ + unsigned long ret; + bool do_nmi_exit = false; + + /* + * nmi_enter() deals with printk() re-entrance and use of RCU when + * RCU believed this CPU was idle. Because critical events can + * interrupt normal events, we may already be in_nmi(). + */ + if (!in_nmi()) { + nmi_enter(); + do_nmi_exit = true; + } + + ret = _sdei_handler(regs, arg); + + if (do_nmi_exit) + nmi_exit(); + + return ret; +} diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 551eb07c53b6..3b8ad7be9c33 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -836,6 +837,7 @@ static void ipi_cpu_stop(unsigned int cpu) set_cpu_online(cpu, false); local_daif_mask(); + sdei_mask_local_cpu(); while (1) cpu_relax(); @@ -853,6 +855,7 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) atomic_dec(&waiting_for_crash_ipi); local_irq_disable(); + sdei_mask_local_cpu(); #ifdef CONFIG_HOTPLUG_CPU if (cpu_ops[cpu]->cpu_die) @@ -972,6 +975,8 @@ void smp_send_stop(void) if (num_online_cpus() > 1) pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", cpumask_pr_args(cpu_online_mask)); + + sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE @@ -990,8 +995,10 @@ void crash_smp_send_stop(void) cpus_stopped = 1; - if (num_online_cpus() == 1) + if (num_online_cpus() == 1) { + sdei_mask_local_cpu(); return; + } cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); @@ -1009,6 +1016,8 @@ void crash_smp_send_stop(void) if (atomic_read(&waiting_for_crash_ipi) > 0) pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", cpumask_pr_args(&mask)); + + sdei_mask_local_cpu(); } bool smp_crash_stop_failed(void) From da351827240e1705cca64bb8ae526f0ce1068048 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:13 +0000 Subject: [PATCH 065/104] firmware: arm_sdei: Add support for CPU and system power states When a CPU enters an idle lower-power state or is powering off, we need to mask SDE events so that no events can be delivered while we are messing with the MMU as the registered entry points won't be valid. If the system reboots, we want to unregister all events and mask the CPUs. For kexec this allows us to hand a clean slate to the next kernel instead of relying on it to call sdei_{private,system}_data_reset(). For hibernate we unregister all events and re-register them on restore, in case we restored with the SDE code loaded at a different address. (e.g. KASLR). Add all the notifiers necessary to do this. We only support shared events so all events are left registered and enabled over CPU hotplug. Reviewed-by: Lorenzo Pieralisi Signed-off-by: James Morse [catalin.marinas@arm.com: added CPU_PM_ENTER_FAILED case] Signed-off-by: Catalin Marinas --- drivers/firmware/arm_sdei.c | 256 +++++++++++++++++++++++++++++++++++- include/linux/cpuhotplug.h | 1 + 2 files changed, 256 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 8da173cc7e43..c503bc5222ed 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -14,12 +16,15 @@ #include #include #include +#include #include #include #include #include +#include #include #include +#include #include #include #include @@ -37,7 +42,11 @@ static asmlinkage void (*sdei_firmware_call)(unsigned long function_id, static unsigned long sdei_entry_point; struct sdei_event { + /* These three are protected by the sdei_list_lock */ struct list_head list; + bool reregister; + bool reenable; + u32 event_num; u8 type; u8 priority; @@ -253,6 +262,11 @@ static void _ipi_mask_cpu(void *ignored) sdei_mask_local_cpu(); } +static int sdei_cpuhp_down(unsigned int ignored) +{ + return sdei_mask_local_cpu(); +} + int sdei_unmask_local_cpu(void) { int err; @@ -274,6 +288,11 @@ static void _ipi_unmask_cpu(void *ignored) sdei_unmask_local_cpu(); } +static int sdei_cpuhp_up(unsigned int ignored) +{ + return sdei_unmask_local_cpu(); +} + static void _ipi_private_reset(void *ignored) { int err; @@ -330,6 +349,10 @@ int sdei_event_enable(u32 event_num) return -ENOENT; } + spin_lock(&sdei_list_lock); + event->reenable = true; + spin_unlock(&sdei_list_lock); + if (event->type == SDEI_EVENT_TYPE_SHARED) err = sdei_api_event_enable(event->event_num); mutex_unlock(&sdei_events_lock); @@ -356,6 +379,10 @@ int sdei_event_disable(u32 event_num) return -ENOENT; } + spin_lock(&sdei_list_lock); + event->reenable = false; + spin_unlock(&sdei_list_lock); + if (event->type == SDEI_EVENT_TYPE_SHARED) err = sdei_api_event_disable(event->event_num); mutex_unlock(&sdei_events_lock); @@ -374,6 +401,11 @@ static int _sdei_event_unregister(struct sdei_event *event) { lockdep_assert_held(&sdei_events_lock); + spin_lock(&sdei_list_lock); + event->reregister = false; + event->reenable = false; + spin_unlock(&sdei_list_lock); + if (event->type == SDEI_EVENT_TYPE_SHARED) return sdei_api_event_unregister(event->event_num); @@ -408,6 +440,31 @@ int sdei_event_unregister(u32 event_num) } EXPORT_SYMBOL(sdei_event_unregister); +/* + * unregister events, but don't destroy them as they are re-registered by + * sdei_reregister_shared(). + */ +static int sdei_unregister_shared(void) +{ + int err = 0; + struct sdei_event *event; + + mutex_lock(&sdei_events_lock); + spin_lock(&sdei_list_lock); + list_for_each_entry(event, &sdei_list, list) { + if (event->type != SDEI_EVENT_TYPE_SHARED) + continue; + + err = _sdei_event_unregister(event); + if (err) + break; + } + spin_unlock(&sdei_list_lock); + mutex_unlock(&sdei_events_lock); + + return err; +} + static int sdei_api_event_register(u32 event_num, unsigned long entry_point, void *arg, u64 flags, u64 affinity) { @@ -465,6 +522,175 @@ int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg) } EXPORT_SYMBOL(sdei_event_register); +static int sdei_reregister_event(struct sdei_event *event) +{ + int err; + + lockdep_assert_held(&sdei_events_lock); + + err = _sdei_event_register(event); + if (err) { + pr_err("Failed to re-register event %u\n", event->event_num); + sdei_event_destroy(event); + return err; + } + + if (event->reenable) { + if (event->type == SDEI_EVENT_TYPE_SHARED) + err = sdei_api_event_enable(event->event_num); + } + + if (err) + pr_err("Failed to re-enable event %u\n", event->event_num); + + return err; +} + +static int sdei_reregister_shared(void) +{ + int err = 0; + struct sdei_event *event; + + mutex_lock(&sdei_events_lock); + spin_lock(&sdei_list_lock); + list_for_each_entry(event, &sdei_list, list) { + if (event->type != SDEI_EVENT_TYPE_SHARED) + continue; + + if (event->reregister) { + err = sdei_reregister_event(event); + if (err) + break; + } + } + spin_unlock(&sdei_list_lock); + mutex_unlock(&sdei_events_lock); + + return err; +} + +/* When entering idle, mask/unmask events for this cpu */ +static int sdei_pm_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + int rv; + + switch (action) { + case CPU_PM_ENTER: + rv = sdei_mask_local_cpu(); + break; + case CPU_PM_EXIT: + case CPU_PM_ENTER_FAILED: + rv = sdei_unmask_local_cpu(); + break; + default: + return NOTIFY_DONE; + } + + if (rv) + return notifier_from_errno(rv); + + return NOTIFY_OK; +} + +static struct notifier_block sdei_pm_nb = { + .notifier_call = sdei_pm_notifier, +}; + +static int sdei_device_suspend(struct device *dev) +{ + on_each_cpu(_ipi_mask_cpu, NULL, true); + + return 0; +} + +static int sdei_device_resume(struct device *dev) +{ + on_each_cpu(_ipi_unmask_cpu, NULL, true); + + return 0; +} + +/* + * We need all events to be reregistered when we resume from hibernate. + * + * The sequence is freeze->thaw. Reboot. freeze->restore. We unregister + * events during freeze, then re-register and re-enable them during thaw + * and restore. + */ +static int sdei_device_freeze(struct device *dev) +{ + int err; + + cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING); + + err = sdei_unregister_shared(); + if (err) + return err; + + return 0; +} + +static int sdei_device_thaw(struct device *dev) +{ + int err; + + /* re-register shared events */ + err = sdei_reregister_shared(); + if (err) { + pr_warn("Failed to re-register shared events...\n"); + sdei_mark_interface_broken(); + return err; + } + + err = cpuhp_setup_state(CPUHP_AP_ARM_SDEI_STARTING, "SDEI", + &sdei_cpuhp_up, &sdei_cpuhp_down); + if (err) + pr_warn("Failed to re-register CPU hotplug notifier...\n"); + + return err; +} + +static int sdei_device_restore(struct device *dev) +{ + int err; + + err = sdei_platform_reset(); + if (err) + return err; + + return sdei_device_thaw(dev); +} + +static const struct dev_pm_ops sdei_pm_ops = { + .suspend = sdei_device_suspend, + .resume = sdei_device_resume, + .freeze = sdei_device_freeze, + .thaw = sdei_device_thaw, + .restore = sdei_device_restore, +}; + +/* + * Mask all CPUs and unregister all events on panic, reboot or kexec. + */ +static int sdei_reboot_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + /* + * We are going to reset the interface, after this there is no point + * doing work when we take CPUs offline. + */ + cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING); + + sdei_platform_reset(); + + return NOTIFY_OK; +} + +static struct notifier_block sdei_reboot_nb = { + .notifier_call = sdei_reboot_notifier, +}; + static void sdei_smccc_smc(unsigned long function_id, unsigned long arg0, unsigned long arg1, unsigned long arg2, unsigned long arg3, @@ -547,9 +773,36 @@ static int sdei_probe(struct platform_device *pdev) return 0; } - on_each_cpu(&_ipi_unmask_cpu, NULL, false); + err = cpu_pm_register_notifier(&sdei_pm_nb); + if (err) { + pr_warn("Failed to register CPU PM notifier...\n"); + goto error; + } + + err = register_reboot_notifier(&sdei_reboot_nb); + if (err) { + pr_warn("Failed to register reboot notifier...\n"); + goto remove_cpupm; + } + + err = cpuhp_setup_state(CPUHP_AP_ARM_SDEI_STARTING, "SDEI", + &sdei_cpuhp_up, &sdei_cpuhp_down); + if (err) { + pr_warn("Failed to register CPU hotplug notifier...\n"); + goto remove_reboot; + } return 0; + +remove_reboot: + unregister_reboot_notifier(&sdei_reboot_nb); + +remove_cpupm: + cpu_pm_unregister_notifier(&sdei_pm_nb); + +error: + sdei_mark_interface_broken(); + return err; } static const struct of_device_id sdei_of_match[] = { @@ -560,6 +813,7 @@ static const struct of_device_id sdei_of_match[] = { static struct platform_driver sdei_driver = { .driver = { .name = "sdei", + .pm = &sdei_pm_ops, .of_match_table = sdei_of_match, }, .probe = sdei_probe, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 201ab7267986..87b505a48a94 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -109,6 +109,7 @@ enum cpuhp_state { CPUHP_AP_PERF_XTENSA_STARTING, CPUHP_AP_PERF_METAG_STARTING, CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, + CPUHP_AP_ARM_SDEI_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, From f92b5462a2f22d13a75dc663f7b2fac16a3e61cb Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:14 +0000 Subject: [PATCH 066/104] firmware: arm_sdei: add support for CPU private events Private SDE events are per-cpu, and need to be registered and enabled on each CPU. Hide this detail from the caller by adapting our {,un}register and {en,dis}able calls to send an IPI to each CPU if the event is private. CPU private events are unregistered when the CPU is powered-off, and re-registered when the CPU is brought back online. This saves bringing secondary cores back online to call private_reset() on shutdown, kexec and resume from hibernate. Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- drivers/firmware/arm_sdei.c | 206 +++++++++++++++++++++++++++++++++--- 1 file changed, 193 insertions(+), 13 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index c503bc5222ed..0c735363f54d 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -52,7 +54,13 @@ struct sdei_event { u8 priority; /* This pointer is handed to firmware as the event argument. */ - struct sdei_registered_event *registered; + union { + /* Shared events */ + struct sdei_registered_event *registered; + + /* CPU private events */ + struct sdei_registered_event __percpu *private_registered; + }; }; /* Take the mutex for any API call or modification. Take the mutex first. */ @@ -62,6 +70,34 @@ static DEFINE_MUTEX(sdei_events_lock); static DEFINE_SPINLOCK(sdei_list_lock); static LIST_HEAD(sdei_list); +/* Private events are registered/enabled via IPI passing one of these */ +struct sdei_crosscall_args { + struct sdei_event *event; + atomic_t errors; + int first_error; +}; + +#define CROSSCALL_INIT(arg, event) (arg.event = event, \ + arg.first_error = 0, \ + atomic_set(&arg.errors, 0)) + +static inline int sdei_do_cross_call(void *fn, struct sdei_event * event) +{ + struct sdei_crosscall_args arg; + + CROSSCALL_INIT(arg, event); + on_each_cpu(fn, &arg, true); + + return arg.first_error; +} + +static inline void +sdei_cross_call_return(struct sdei_crosscall_args *arg, int err) +{ + if (err && (atomic_inc_return(&arg->errors) == 1)) + arg->first_error = err; +} + static int sdei_to_linux_errno(unsigned long sdei_err) { switch (sdei_err) { @@ -207,6 +243,26 @@ static struct sdei_event *sdei_event_create(u32 event_num, reg->callback = cb; reg->callback_arg = cb_arg; event->registered = reg; + } else { + int cpu; + struct sdei_registered_event __percpu *regs; + + regs = alloc_percpu(struct sdei_registered_event); + if (!regs) { + kfree(event); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(cpu) { + reg = per_cpu_ptr(regs, cpu); + + reg->event_num = event->event_num; + reg->priority = event->priority; + reg->callback = cb; + reg->callback_arg = cb_arg; + } + + event->private_registered = regs; } if (sdei_event_find(event_num)) { @@ -232,6 +288,8 @@ static void sdei_event_destroy(struct sdei_event *event) if (event->type == SDEI_EVENT_TYPE_SHARED) kfree(event->registered); + else + free_percpu(event->private_registered); kfree(event); } @@ -262,11 +320,6 @@ static void _ipi_mask_cpu(void *ignored) sdei_mask_local_cpu(); } -static int sdei_cpuhp_down(unsigned int ignored) -{ - return sdei_mask_local_cpu(); -} - int sdei_unmask_local_cpu(void) { int err; @@ -288,11 +341,6 @@ static void _ipi_unmask_cpu(void *ignored) sdei_unmask_local_cpu(); } -static int sdei_cpuhp_up(unsigned int ignored) -{ - return sdei_unmask_local_cpu(); -} - static void _ipi_private_reset(void *ignored) { int err; @@ -337,6 +385,19 @@ static int sdei_api_event_enable(u32 event_num) 0, NULL); } +/* Called directly by the hotplug callbacks */ +static void _local_event_enable(void *data) +{ + int err; + struct sdei_crosscall_args *arg = data; + + WARN_ON_ONCE(preemptible()); + + err = sdei_api_event_enable(arg->event->event_num); + + sdei_cross_call_return(arg, err); +} + int sdei_event_enable(u32 event_num) { int err = -EINVAL; @@ -355,6 +416,8 @@ int sdei_event_enable(u32 event_num) if (event->type == SDEI_EVENT_TYPE_SHARED) err = sdei_api_event_enable(event->event_num); + else + err = sdei_do_cross_call(_local_event_enable, event); mutex_unlock(&sdei_events_lock); return err; @@ -367,6 +430,16 @@ static int sdei_api_event_disable(u32 event_num) 0, 0, NULL); } +static void _ipi_event_disable(void *data) +{ + int err; + struct sdei_crosscall_args *arg = data; + + err = sdei_api_event_disable(arg->event->event_num); + + sdei_cross_call_return(arg, err); +} + int sdei_event_disable(u32 event_num) { int err = -EINVAL; @@ -385,6 +458,8 @@ int sdei_event_disable(u32 event_num) if (event->type == SDEI_EVENT_TYPE_SHARED) err = sdei_api_event_disable(event->event_num); + else + err = sdei_do_cross_call(_ipi_event_disable, event); mutex_unlock(&sdei_events_lock); return err; @@ -397,6 +472,19 @@ static int sdei_api_event_unregister(u32 event_num) 0, 0, 0, NULL); } +/* Called directly by the hotplug callbacks */ +static void _local_event_unregister(void *data) +{ + int err; + struct sdei_crosscall_args *arg = data; + + WARN_ON_ONCE(preemptible()); + + err = sdei_api_event_unregister(arg->event->event_num); + + sdei_cross_call_return(arg, err); +} + static int _sdei_event_unregister(struct sdei_event *event) { lockdep_assert_held(&sdei_events_lock); @@ -409,7 +497,7 @@ static int _sdei_event_unregister(struct sdei_event *event) if (event->type == SDEI_EVENT_TYPE_SHARED) return sdei_api_event_unregister(event->event_num); - return -EINVAL; + return sdei_do_cross_call(_local_event_unregister, event); } int sdei_event_unregister(u32 event_num) @@ -473,17 +561,50 @@ static int sdei_api_event_register(u32 event_num, unsigned long entry_point, flags, affinity, NULL); } +/* Called directly by the hotplug callbacks */ +static void _local_event_register(void *data) +{ + int err; + struct sdei_registered_event *reg; + struct sdei_crosscall_args *arg = data; + + WARN_ON(preemptible()); + + reg = per_cpu_ptr(arg->event->private_registered, smp_processor_id()); + err = sdei_api_event_register(arg->event->event_num, sdei_entry_point, + reg, 0, 0); + + sdei_cross_call_return(arg, err); +} + static int _sdei_event_register(struct sdei_event *event) { + int err; + lockdep_assert_held(&sdei_events_lock); + spin_lock(&sdei_list_lock); + event->reregister = true; + spin_unlock(&sdei_list_lock); + if (event->type == SDEI_EVENT_TYPE_SHARED) return sdei_api_event_register(event->event_num, sdei_entry_point, event->registered, SDEI_EVENT_REGISTER_RM_ANY, 0); - return -EINVAL; + + err = sdei_do_cross_call(_local_event_register, event); + if (err) { + spin_lock(&sdei_list_lock); + event->reregister = false; + event->reenable = false; + spin_unlock(&sdei_list_lock); + + sdei_do_cross_call(_local_event_unregister, event); + } + + return err; } int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg) @@ -538,6 +659,8 @@ static int sdei_reregister_event(struct sdei_event *event) if (event->reenable) { if (event->type == SDEI_EVENT_TYPE_SHARED) err = sdei_api_event_enable(event->event_num); + else + err = sdei_do_cross_call(_local_event_enable, event); } if (err) @@ -569,6 +692,62 @@ static int sdei_reregister_shared(void) return err; } +static int sdei_cpuhp_down(unsigned int cpu) +{ + struct sdei_event *event; + struct sdei_crosscall_args arg; + + /* un-register private events */ + spin_lock(&sdei_list_lock); + list_for_each_entry(event, &sdei_list, list) { + if (event->type == SDEI_EVENT_TYPE_SHARED) + continue; + + CROSSCALL_INIT(arg, event); + /* call the cross-call function locally... */ + _local_event_unregister(&arg); + if (arg.first_error) + pr_err("Failed to unregister event %u: %d\n", + event->event_num, arg.first_error); + } + spin_unlock(&sdei_list_lock); + + return sdei_mask_local_cpu(); +} + +static int sdei_cpuhp_up(unsigned int cpu) +{ + struct sdei_event *event; + struct sdei_crosscall_args arg; + + /* re-register/enable private events */ + spin_lock(&sdei_list_lock); + list_for_each_entry(event, &sdei_list, list) { + if (event->type == SDEI_EVENT_TYPE_SHARED) + continue; + + if (event->reregister) { + CROSSCALL_INIT(arg, event); + /* call the cross-call function locally... */ + _local_event_register(&arg); + if (arg.first_error) + pr_err("Failed to re-register event %u: %d\n", + event->event_num, arg.first_error); + } + + if (event->reenable) { + CROSSCALL_INIT(arg, event); + _local_event_enable(&arg); + if (arg.first_error) + pr_err("Failed to re-enable event %u: %d\n", + event->event_num, arg.first_error); + } + } + spin_unlock(&sdei_list_lock); + + return sdei_unmask_local_cpu(); +} + /* When entering idle, mask/unmask events for this cpu */ static int sdei_pm_notifier(struct notifier_block *nb, unsigned long action, void *data) @@ -622,6 +801,7 @@ static int sdei_device_freeze(struct device *dev) { int err; + /* unregister private events */ cpuhp_remove_state(CPUHP_AP_ARM_SDEI_STARTING); err = sdei_unregister_shared(); From fa31ab77ced9fbab87fbac4fca3682009b7f9be2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:15 +0000 Subject: [PATCH 067/104] arm64: acpi: Remove __init from acpi_psci_use_hvc() for use by SDEI SDEI inherits the 'use hvc' bit that is also used by PSCI. PSCI does all its initialisation early, SDEI does its late. Remove the __init annotation from acpi_psci_use_hvc(). Acked-by: Lorenzo Pieralisi Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/kernel/acpi.c | 2 +- include/linux/psci.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index b3162715ed78..252396a96c78 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -117,7 +117,7 @@ bool __init acpi_psci_present(void) } /* Whether HVC must be used instead of SMC as the PSCI conduit */ -bool __init acpi_psci_use_hvc(void) +bool acpi_psci_use_hvc(void) { return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; } diff --git a/include/linux/psci.h b/include/linux/psci.h index 6306ab10af18..f724fd8c78e8 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -47,10 +47,11 @@ static inline int psci_dt_init(void) { return 0; } #if defined(CONFIG_ARM_PSCI_FW) && defined(CONFIG_ACPI) int __init psci_acpi_init(void); bool __init acpi_psci_present(void); -bool __init acpi_psci_use_hvc(void); +bool acpi_psci_use_hvc(void); #else static inline int psci_acpi_init(void) { return 0; } static inline bool acpi_psci_present(void) { return false; } +static inline bool acpi_psci_use_hvc(void) {return false; } #endif #endif /* __LINUX_PSCI_H */ From 677a60bd2003ff5517a0b502365112531446a3e3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:16 +0000 Subject: [PATCH 068/104] firmware: arm_sdei: Discover SDEI support via ACPI SDEI defines a new ACPI table to indicate the presence of the interface. The conduit is discovered in the same way as PSCI. For ACPI we need to create the platform device ourselves as SDEI doesn't have an entry in the DSDT. The SDEI platform device should be created after ACPI has been initialised so that we can parse the table, but before GHES devices are created, which may register SDE events if they use SDEI as their notification type. Reviewed-by: Lorenzo Pieralisi Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Catalin Marinas --- drivers/firmware/arm_sdei.c | 41 ++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 0c735363f54d..8f6563c595e5 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -908,6 +908,14 @@ static int sdei_get_conduit(struct platform_device *pdev) } pr_warn("invalid \"method\" property: %s\n", method); + } else if (IS_ENABLED(CONFIG_ACPI) && !acpi_disabled) { + if (acpi_psci_use_hvc()) { + sdei_firmware_call = &sdei_smccc_hvc; + return CONDUIT_HVC; + } else { + sdei_firmware_call = &sdei_smccc_smc; + return CONDUIT_SMC; + } } return CONDUIT_INVALID; @@ -1021,14 +1029,45 @@ static bool __init sdei_present_dt(void) return true; } +static bool __init sdei_present_acpi(void) +{ + acpi_status status; + struct platform_device *pdev; + struct acpi_table_header *sdei_table_header; + + if (acpi_disabled) + return false; + + status = acpi_get_table(ACPI_SIG_SDEI, 0, &sdei_table_header); + if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) { + const char *msg = acpi_format_exception(status); + + pr_info("Failed to get ACPI:SDEI table, %s\n", msg); + } + if (ACPI_FAILURE(status)) + return false; + + pdev = platform_device_register_simple(sdei_driver.driver.name, 0, NULL, + 0); + if (IS_ERR(pdev)) + return false; + + return true; +} + static int __init sdei_init(void) { - if (sdei_present_dt()) + if (sdei_present_dt() || sdei_present_acpi()) platform_driver_register(&sdei_driver); return 0; } +/* + * On an ACPI system SDEI needs to be ready before HEST:GHES tries to register + * its events. ACPI is initialised from a subsys_initcall(), GHES is initialised + * by device_initcall(). We want to be called in the middle. + */ subsys_initcall_sync(sdei_init); int sdei_event_handler(struct pt_regs *regs, From 83f8ee3a73f551aebb5b0bb029feaf6936615730 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:17 +0000 Subject: [PATCH 069/104] arm64: mmu: add the entry trampolines start/end section markers into sections.h SDEI needs to calculate an offset in the trampoline page too. Move the extern char[] to sections.h. This patch just moves code around. Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sections.h | 1 + arch/arm64/mm/mmu.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 941267caa39c..caab039d6305 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -28,5 +28,6 @@ extern char __initdata_begin[], __initdata_end[]; extern char __inittext_begin[], __inittext_end[]; extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; +extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; #endif /* __ASM_SECTIONS_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4071602031ed..62d7abb2f710 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -529,8 +529,6 @@ early_param("rodata", parse_rodata); #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { - extern char __entry_tramp_text_start[]; - pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); From 79e9aa59dc29a995921fb01e64cd36b73cf5abe0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Jan 2018 15:38:18 +0000 Subject: [PATCH 070/104] arm64: sdei: Add trampoline code for remapping the kernel When CONFIG_UNMAP_KERNEL_AT_EL0 is set the SDEI entry point and the rest of the kernel may be unmapped when we take an event. If this may be the case, use an entry trampoline that can switch to the kernel page tables. We can't use the provided PSTATE to determine whether to switch page tables as we may have interrupted the kernel's entry trampoline, (or a normal-priority event that interrupted the kernel's entry trampoline). Instead test for a user ASID in ttbr1_el1. Save a value in regs->addr_limit to indicate whether we need to restore the original ASID when returning from this event. This value is only used by do_page_fault(), which we don't call with the SDEI regs. Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu.h | 3 +- arch/arm64/include/asm/sdei.h | 6 +++ arch/arm64/kernel/entry.S | 98 +++++++++++++++++++++++++++++++---- arch/arm64/kernel/sdei.c | 20 ++++++- 4 files changed, 113 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 6dd83d75b82a..a050d4f3615d 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -17,7 +17,8 @@ #define __ASM_MMU_H #define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ -#define USER_ASID_FLAG (UL(1) << 48) +#define USER_ASID_BIT 48 +#define USER_ASID_FLAG (UL(1) << USER_ASID_BIT) #define TTBR_ASID_MASK (UL(0xffff) << 48) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h index d58a31ab525a..e073e6886685 100644 --- a/arch/arm64/include/asm/sdei.h +++ b/arch/arm64/include/asm/sdei.h @@ -23,6 +23,12 @@ extern unsigned long sdei_exit_mode; asmlinkage void __sdei_asm_handler(unsigned long event_num, unsigned long arg, unsigned long pc, unsigned long pstate); +/* and its CONFIG_UNMAP_KERNEL_AT_EL0 trampoline */ +asmlinkage void __sdei_asm_entry_trampoline(unsigned long event_num, + unsigned long arg, + unsigned long pc, + unsigned long pstate); + /* * The above entry point does the minimum to call C code. This function does * anything else, before calling the driver. diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 40bf5083d182..812416a09a9c 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1159,6 +1159,78 @@ NOKPROBE(ret_from_fork) #include #include +.macro sdei_handler_exit exit_mode + /* On success, this call never returns... */ + cmp \exit_mode, #SDEI_EXIT_SMC + b.ne 99f + smc #0 + b . +99: hvc #0 + b . +.endm + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * The regular SDEI entry point may have been unmapped along with the rest of + * the kernel. This trampoline restores the kernel mapping to make the x1 memory + * argument accessible. + * + * This clobbers x4, __sdei_handler() will restore this from firmware's + * copy. + */ +.ltorg +.pushsection ".entry.tramp.text", "ax" +ENTRY(__sdei_asm_entry_trampoline) + mrs x4, ttbr1_el1 + tbz x4, #USER_ASID_BIT, 1f + + tramp_map_kernel tmp=x4 + isb + mov x4, xzr + + /* + * Use reg->interrupted_regs.addr_limit to remember whether to unmap + * the kernel on exit. + */ +1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] + +#ifdef CONFIG_RANDOMIZE_BASE + adr x4, tramp_vectors + PAGE_SIZE + add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler + ldr x4, [x4] +#else + ldr x4, =__sdei_asm_handler +#endif + br x4 +ENDPROC(__sdei_asm_entry_trampoline) +NOKPROBE(__sdei_asm_entry_trampoline) + +/* + * Make the exit call and restore the original ttbr1_el1 + * + * x0 & x1: setup for the exit API call + * x2: exit_mode + * x4: struct sdei_registered_event argument from registration time. + */ +ENTRY(__sdei_asm_exit_trampoline) + ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] + cbnz x4, 1f + + tramp_unmap_kernel tmp=x4 + +1: sdei_handler_exit exit_mode=x2 +ENDPROC(__sdei_asm_exit_trampoline) +NOKPROBE(__sdei_asm_exit_trampoline) + .ltorg +.popsection // .entry.tramp.text +#ifdef CONFIG_RANDOMIZE_BASE +.pushsection ".rodata", "a" +__sdei_asm_trampoline_next_handler: + .quad __sdei_asm_handler +.popsection // .rodata +#endif /* CONFIG_RANDOMIZE_BASE */ +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + /* * Software Delegated Exception entry point. * @@ -1166,6 +1238,7 @@ NOKPROBE(ret_from_fork) * x1: struct sdei_registered_event argument from registration time. * x2: interrupted PC * x3: interrupted PSTATE + * x4: maybe clobbered by the trampoline * * Firmware has preserved x0->x17 for us, we must save/restore the rest to * follow SMC-CC. We save (or retrieve) all the registers as the handler may @@ -1231,10 +1304,11 @@ ENTRY(__sdei_asm_handler) msr sp_el0, x28 /* restore regs >x17 that we clobbered */ - ldp x28, x29, [x19, #SDEI_EVENT_INTREGS + 16 * 14] - ldp lr, x4, [x19, #SDEI_EVENT_INTREGS + S_LR] - mov sp, x4 - ldp x18, x19, [x19, #SDEI_EVENT_INTREGS + 16 * 9] + mov x4, x19 // keep x4 for __sdei_asm_exit_trampoline + ldp x28, x29, [x4, #SDEI_EVENT_INTREGS + 16 * 14] + ldp x18, x19, [x4, #SDEI_EVENT_INTREGS + 16 * 9] + ldp lr, x1, [x4, #SDEI_EVENT_INTREGS + S_LR] + mov sp, x1 mov x1, x0 // address to complete_and_resume /* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */ @@ -1243,14 +1317,16 @@ ENTRY(__sdei_asm_handler) mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME csel x0, x2, x3, ls - /* On success, this call never returns... */ ldr_l x2, sdei_exit_mode - cmp x2, #SDEI_EXIT_SMC - b.ne 1f - smc #0 - b . -1: hvc #0 - b . + +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 + sdei_handler_exit exit_mode=x2 +alternative_else_nop_endif + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline + br x5 +#endif ENDPROC(__sdei_asm_handler) NOKPROBE(__sdei_asm_handler) #endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index f9dffacaa5d6..6b8d90d5ceae 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include #include @@ -124,7 +126,18 @@ unsigned long sdei_arch_get_entry_point(int conduit) } sdei_exit_mode = (conduit == CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; - return (unsigned long)__sdei_asm_handler; + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + if (arm64_kernel_unmapped_at_el0()) { + unsigned long offset; + + offset = (unsigned long)__sdei_asm_entry_trampoline - + (unsigned long)__entry_tramp_text_start; + return TRAMP_VALIAS + offset; + } else +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + return (unsigned long)__sdei_asm_handler; + } /* @@ -138,11 +151,14 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, { u32 mode; int i, err = 0; - const int clobbered_registers = 4; + int clobbered_registers = 4; u64 elr = read_sysreg(elr_el1); u32 kernel_mode = read_sysreg(CurrentEL) | 1; /* +SPSel */ unsigned long vbar = read_sysreg(vbar_el1); + if (arm64_kernel_unmapped_at_el0()) + clobbered_registers++; + /* Retrieve the missing registers values */ for (i = 0; i < clobbered_registers; i++) { /* from within the handler, this call always succeeds */ From 9dfe4828aa32c49856dffd6cd31297f3466caa0d Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 11 Jan 2018 10:11:57 +0000 Subject: [PATCH 071/104] arm64: Re-order reserved_ttbr0 in linker script Currently one resolves the location of the reserved_ttbr0 for PAN by taking a positive offset from swapper_pg_dir. In a future patch we wish to extend the swapper s.t. its size is determined at link time rather than comile time, rendering SWAPPER_DIR_SIZE unsuitable for such a low level calculation. In this patch we re-arrange the order of the linker script s.t. instead one computes reserved_ttbr0 by subtracting RESERVED_TTBR0_SIZE from swapper_pg_dir. Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Signed-off-by: Steve Capper Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/asm-uaccess.h | 8 ++++---- arch/arm64/include/asm/uaccess.h | 4 ++-- arch/arm64/kernel/vmlinux.lds.S | 5 ++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index f4f234b6155e..8719ce122a38 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -13,11 +13,11 @@ */ #ifdef CONFIG_ARM64_SW_TTBR0_PAN .macro __uaccess_ttbr0_disable, tmp1 - mrs \tmp1, ttbr1_el1 // swapper_pg_dir - add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir - msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + sub \tmp1, \tmp1, #RESERVED_TTBR0_SIZE // reserved_ttbr0 just before swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb - sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE + add \tmp1, \tmp1, #RESERVED_TTBR0_SIZE bic \tmp1, \tmp1, #TTBR_ASID_MASK msr ttbr1_el1, \tmp1 // set reserved ASID isb diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 3821fab01d7d..01ade20d5456 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -108,8 +108,8 @@ static inline void __uaccess_ttbr0_disable(void) unsigned long ttbr; ttbr = read_sysreg(ttbr1_el1); - /* reserved_ttbr0 placed at the end of swapper_pg_dir */ - write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1); + /* reserved_ttbr0 placed before swapper_pg_dir */ + write_sysreg(ttbr - RESERVED_TTBR0_SIZE, ttbr0_el1); isb(); /* Set reserved ASID */ ttbr &= ~TTBR_ASID_MASK; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index ddfd3c0942f7..8e567de8f369 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -218,13 +218,12 @@ SECTIONS . = ALIGN(PAGE_SIZE); idmap_pg_dir = .; . += IDMAP_DIR_SIZE; - swapper_pg_dir = .; - . += SWAPPER_DIR_SIZE; - #ifdef CONFIG_ARM64_SW_TTBR0_PAN reserved_ttbr0 = .; . += RESERVED_TTBR0_SIZE; #endif + swapper_pg_dir = .; + . += SWAPPER_DIR_SIZE; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_pg_dir = .; From 1e1b8c04fa3451e2b7190930adae43c95f0fae31 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 11 Jan 2018 10:11:58 +0000 Subject: [PATCH 072/104] arm64: entry: Move the trampoline to be before PAN The trampoline page tables are positioned after the early page tables in the kernel linker script. As we are about to change the early page table logic to resolve the swapper size at link time as opposed to compile time, the SWAPPER_DIR_SIZE variable (currently used to locate the trampline) will be rendered unsuitable for low level assembler. This patch solves this issue by moving the trampoline before the PAN page tables. The offset to the trampoline from ttbr1 can then be expressed by: PAGE_SIZE + RESERVED_TTBR0_SIZE, which is available to the entry assembler. Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Signed-off-by: Steve Capper Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/vmlinux.lds.S | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 812416a09a9c..1e1d1842888d 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -990,7 +990,7 @@ __ni_sys_trace: .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 - sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) bic \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 @@ -1009,7 +1009,7 @@ alternative_else_nop_endif .macro tramp_unmap_kernel, tmp mrs \tmp, ttbr1_el1 - add \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + sub \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) orr \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp /* diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 8e567de8f369..4c7112a47469 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -218,6 +218,12 @@ SECTIONS . = ALIGN(PAGE_SIZE); idmap_pg_dir = .; . += IDMAP_DIR_SIZE; + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_pg_dir = .; + . += PAGE_SIZE; +#endif + #ifdef CONFIG_ARM64_SW_TTBR0_PAN reserved_ttbr0 = .; . += RESERVED_TTBR0_SIZE; @@ -225,11 +231,6 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - tramp_pg_dir = .; - . += PAGE_SIZE; -#endif - __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; From 0370b31e48454d8cf11120664aedd1c51b3004cb Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 11 Jan 2018 10:11:59 +0000 Subject: [PATCH 073/104] arm64: Extend early page table code to allow for larger kernels Currently the early assembler page table code assumes that precisely 1xpgd, 1xpud, 1xpmd are sufficient to represent the early kernel text mappings. Unfortunately this is rarely the case when running with a 16KB granule, and we also run into limits with 4KB granule when building much larger kernels. This patch re-writes the early page table logic to compute indices of mappings for each level of page table, and if multiple indices are required, the next-level page table is scaled up accordingly. Also the required size of the swapper_pg_dir is computed at link time to cover the mapping [KIMAGE_ADDR + VOFFSET, _end]. When KASLR is enabled, an extra page is set aside for each level that may require extra entries at runtime. Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Signed-off-by: Steve Capper Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kernel-pgtable.h | 47 +++++++- arch/arm64/include/asm/pgtable.h | 1 + arch/arm64/kernel/head.S | 144 +++++++++++++++++------- arch/arm64/kernel/vmlinux.lds.S | 1 + arch/arm64/mm/mmu.c | 3 +- 5 files changed, 156 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 77a27af01371..82386e860dd2 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -52,7 +52,52 @@ #define IDMAP_PGTABLE_LEVELS (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT)) #endif -#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) + +/* + * If KASLR is enabled, then an offset K is added to the kernel address + * space. The bottom 21 bits of this offset are zero to guarantee 2MB + * alignment for PA and VA. + * + * For each pagetable level of the swapper, we know that the shift will + * be larger than 21 (for the 4KB granule case we use section maps thus + * the smallest shift is actually 30) thus there is the possibility that + * KASLR can increase the number of pagetable entries by 1, so we make + * room for this extra entry. + * + * Note KASLR cannot increase the number of required entries for a level + * by more than one because it increments both the virtual start and end + * addresses equally (the extra entry comes from the case where the end + * address is just pushed over a boundary and the start address isn't). + */ + +#ifdef CONFIG_RANDOMIZE_BASE +#define EARLY_KASLR (1) +#else +#define EARLY_KASLR (0) +#endif + +#define EARLY_ENTRIES(vstart, vend, shift) (((vend) >> (shift)) \ + - ((vstart) >> (shift)) + 1 + EARLY_KASLR) + +#define EARLY_PGDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT)) + +#if SWAPPER_PGTABLE_LEVELS > 3 +#define EARLY_PUDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PUD_SHIFT)) +#else +#define EARLY_PUDS(vstart, vend) (0) +#endif + +#if SWAPPER_PGTABLE_LEVELS > 2 +#define EARLY_PMDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, SWAPPER_TABLE_SHIFT)) +#else +#define EARLY_PMDS(vstart, vend) (0) +#endif + +#define EARLY_PAGES(vstart, vend) ( 1 /* PGDIR page */ \ + + EARLY_PGDS((vstart), (vend)) /* each PGDIR needs a next level page table */ \ + + EARLY_PUDS((vstart), (vend)) /* each PUD needs a next level page table */ \ + + EARLY_PMDS((vstart), (vend))) /* each PMD needs a next level page table */ +#define SWAPPER_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR + TEXT_OFFSET, _end)) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) #ifdef CONFIG_ARM64_SW_TTBR0_PAN diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bfa237e892f1..54b0a8398055 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -706,6 +706,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, #endif extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; +extern pgd_t swapper_pg_end[]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 66f01869e97c..95748a00eb89 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -191,44 +191,109 @@ ENDPROC(preserve_boot_args) .endm /* - * Macro to populate the PGD (and possibily PUD) for the corresponding - * block entry in the next level (tbl) for the given virtual address. + * Macro to populate page table entries, these entries can be pointers to the next level + * or last level entries pointing to physical memory. * - * Preserves: tbl, next, virt - * Corrupts: ptrs_per_pgd, tmp1, tmp2 + * tbl: page table address + * rtbl: pointer to page table or physical memory + * index: start index to write + * eindex: end index to write - [index, eindex] written to + * flags: flags for pagetable entry to or in + * inc: increment to rtbl between each entry + * tmp1: temporary variable + * + * Preserves: tbl, eindex, flags, inc + * Corrupts: index, tmp1 + * Returns: rtbl */ - .macro create_pgd_entry, tbl, virt, ptrs_per_pgd, tmp1, tmp2 - create_table_entry \tbl, \virt, PGDIR_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2 -#if SWAPPER_PGTABLE_LEVELS > 3 - mov \ptrs_per_pgd, PTRS_PER_PUD - create_table_entry \tbl, \virt, PUD_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2 -#endif -#if SWAPPER_PGTABLE_LEVELS > 2 - mov \ptrs_per_pgd, PTRS_PER_PTE - create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, \ptrs_per_pgd, \tmp1, \tmp2 -#endif + .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 +.Lpe\@: phys_to_pte \rtbl, \tmp1 + orr \tmp1, \tmp1, \flags // tmp1 = table entry + str \tmp1, [\tbl, \index, lsl #3] + add \rtbl, \rtbl, \inc // rtbl = pa next level + add \index, \index, #1 + cmp \index, \eindex + b.ls .Lpe\@ .endm /* - * Macro to populate block entries in the page table for the start..end - * virtual range (inclusive). + * Compute indices of table entries from virtual address range. If multiple entries + * were needed in the previous page table level then the next page table level is assumed + * to be composed of multiple pages. (This effectively scales the end index). * - * Preserves: tbl, flags - * Corrupts: phys, start, end, tmp, pstate + * vstart: virtual address of start of range + * vend: virtual address of end of range + * shift: shift used to transform virtual address into index + * ptrs: number of entries in page table + * istart: index in table corresponding to vstart + * iend: index in table corresponding to vend + * count: On entry: how many extra entries were required in previous level, scales + * our end index. + * On exit: returns how many extra entries required for next page table level + * + * Preserves: vstart, vend, shift, ptrs + * Returns: istart, iend, count */ - .macro create_block_map, tbl, flags, phys, start, end, tmp - lsr \start, \start, #SWAPPER_BLOCK_SHIFT - and \start, \start, #PTRS_PER_PTE - 1 // table index - bic \phys, \phys, #SWAPPER_BLOCK_SIZE - 1 - lsr \end, \end, #SWAPPER_BLOCK_SHIFT - and \end, \end, #PTRS_PER_PTE - 1 // table end index -9999: phys_to_pte \phys, \tmp - orr \tmp, \tmp, \flags // table entry - str \tmp, [\tbl, \start, lsl #3] // store the entry - add \start, \start, #1 // next entry - add \phys, \phys, #SWAPPER_BLOCK_SIZE // next block - cmp \start, \end - b.ls 9999b + .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count + lsr \iend, \vend, \shift + mov \istart, \ptrs + sub \istart, \istart, #1 + and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1) + mov \istart, \ptrs + mul \istart, \istart, \count + add \iend, \iend, \istart // iend += (count - 1) * ptrs + // our entries span multiple tables + + lsr \istart, \vstart, \shift + mov \count, \ptrs + sub \count, \count, #1 + and \istart, \istart, \count + + sub \count, \iend, \istart + .endm + +/* + * Map memory for specified virtual address range. Each level of page table needed supports + * multiple entries. If a level requires n entries the next page table level is assumed to be + * formed from n pages. + * + * tbl: location of page table + * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) + * vstart: start address to map + * vend: end address to map - we map [vstart, vend] + * flags: flags to use to map last level entries + * phys: physical address corresponding to vstart - physical memory is contiguous + * pgds: the number of pgd entries + * + * Temporaries: istart, iend, tmp, count, sv - these need to be different registers + * Preserves: vstart, vend, flags + * Corrupts: tbl, rtbl, istart, iend, tmp, count, sv + */ + .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv + add \rtbl, \tbl, #PAGE_SIZE + mov \sv, \rtbl + mov \count, #0 + compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv + mov \sv, \rtbl + +#if SWAPPER_PGTABLE_LEVELS > 3 + compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv + mov \sv, \rtbl +#endif + +#if SWAPPER_PGTABLE_LEVELS > 2 + compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv +#endif + + compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count + bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1 + populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm /* @@ -246,14 +311,16 @@ __create_page_tables: * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + adrp x1, swapper_pg_end + sub x1, x1, x0 bl __inval_dcache_area /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + adrp x1, swapper_pg_end + sub x1, x1, x0 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -318,10 +385,10 @@ __create_page_tables: #endif 1: ldr_l x4, idmap_ptrs_per_pgd - create_pgd_entry x0, x3, x4, x5, x6 mov x5, x3 // __pa(__idmap_text_start) adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - create_block_map x0, x7, x3, x5, x6, x4 + + map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 /* * Map the kernel image (starting with PHYS_OFFSET). @@ -330,12 +397,12 @@ __create_page_tables: mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) add x5, x5, x23 // add KASLR displacement mov x4, PTRS_PER_PGD - create_pgd_entry x0, x5, x4, x3, x6 adrp x6, _end // runtime __pa(_end) adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text add x6, x6, x5 // runtime __va(_end) - create_block_map x0, x7, x3, x5, x6, x4 + + map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14 /* * Since the page tables have been populated with non-cacheable @@ -343,7 +410,8 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + adrp x1, swapper_pg_end + sub x1, x1, x0 dmb sy bl __inval_dcache_area diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 4c7112a47469..0221aca6493d 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -230,6 +230,7 @@ SECTIONS #endif swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; + swapper_pg_end = .; __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 62d7abb2f710..b44992ec9643 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -642,7 +642,8 @@ void __init paging_init(void) * allocated with it. */ memblock_free(__pa_symbol(swapper_pg_dir) + PAGE_SIZE, - SWAPPER_DIR_SIZE - PAGE_SIZE); + __pa_symbol(swapper_pg_end) - __pa_symbol(swapper_pg_dir) + - PAGE_SIZE); } /* From bb48711800e6d7aedbf4dfd3367e0ab1270a6bea Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 13 Dec 2017 14:19:37 -0800 Subject: [PATCH 074/104] arm64: cpu_errata: Add Kryo to Falkor 1003 errata The Kryo CPUs are also affected by the Falkor 1003 errata, so we need to do the same workaround on Kryo CPUs. The MIDR is slightly more complicated here, where the PART number is not always the same when looking at all the bits from 15 to 4. Drop the lower 8 bits and just look at the top 4 to see if it's '2' and then consider those as Kryo CPUs. This covers all the combinations without having to list them all out. Fixes: 38fd94b0275c ("arm64: Work around Falkor erratum 1003") Acked-by: Will Deacon Signed-off-by: Stephen Boyd Signed-off-by: Catalin Marinas --- Documentation/arm64/silicon-errata.txt | 2 +- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/cpu_errata.c | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 304bf22bb83c..b9d93e981a05 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -72,6 +72,6 @@ stable kernels. | Hisilicon | Hip0{6,7} | #161010701 | N/A | | Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 | | | | | | -| Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | +| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | | Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | | Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 | diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index cce5735a677c..2f8d39ed9c2e 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -94,6 +94,7 @@ #define BRCM_CPU_PART_VULCAN 0x516 #define QCOM_CPU_PART_FALKOR_V1 0x800 +#define QCOM_CPU_PART_KRYO 0x200 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) @@ -106,6 +107,7 @@ #define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2) #define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) +#define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 70e5f1826fd9..90a9e465339c 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -30,6 +30,20 @@ is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) entry->midr_range_max); } +static bool __maybe_unused +is_kryo_midr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u32 model; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + model = read_cpuid_id(); + model &= MIDR_IMPLEMENTOR_MASK | (0xf00 << MIDR_PARTNUM_SHIFT) | + MIDR_ARCHITECTURE_MASK; + + return model == entry->midr_model; +} + static bool has_mismatched_cache_line_size(const struct arm64_cpu_capabilities *entry, int scope) @@ -290,6 +304,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_CPU_VAR_REV(0, 0), MIDR_CPU_VAR_REV(0, 0)), }, + { + .desc = "Qualcomm Technologies Kryo erratum 1003", + .capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003, + .def_scope = SCOPE_LOCAL_CPU, + .midr_model = MIDR_QCOM_KRYO, + .matches = is_kryo_midr, + }, #endif #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 { From 67948af41f2e6818edeeba5182811c704d484949 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 9 Jan 2018 16:12:18 +0000 Subject: [PATCH 075/104] arm64: capabilities: Handle duplicate entries for a capability Sometimes a single capability could be listed multiple times with differing matches(), e.g, CPU errata for different MIDR versions. This breaks verify_local_cpu_feature() and this_cpu_has_cap() as we stop checking for a capability on a CPU with the first entry in the given table, which is not sufficient. Make sure we run the checks for all entries of the same capability. We do this by fixing __this_cpu_has_cap() to run through all the entries in the given table for a match and reuse it for verify_local_cpu_feature(). Cc: Mark Rutland Cc: Will Deacon Acked-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 44 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9ef84d0def9a..9f4491d16cfb 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1137,6 +1137,26 @@ static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) cap_set_elf_hwcap(hwcaps); } +/* + * Check if the current CPU has a given feature capability. + * Should be called from non-preemptible context. + */ +static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, + unsigned int cap) +{ + const struct arm64_cpu_capabilities *caps; + + if (WARN_ON(preemptible())) + return false; + + for (caps = cap_array; caps->desc; caps++) + if (caps->capability == cap && + caps->matches && + caps->matches(caps, SCOPE_LOCAL_CPU)) + return true; + return false; +} + void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info) { @@ -1200,8 +1220,9 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) } static void -verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) +verify_local_cpu_features(const struct arm64_cpu_capabilities *caps_list) { + const struct arm64_cpu_capabilities *caps = caps_list; for (; caps->matches; caps++) { if (!cpus_have_cap(caps->capability)) continue; @@ -1209,7 +1230,7 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) * If the new CPU misses an advertised feature, we cannot proceed * further, park the cpu. */ - if (!caps->matches(caps, SCOPE_LOCAL_CPU)) { + if (!__this_cpu_has_cap(caps_list, caps->capability)) { pr_crit("CPU%d: missing feature: %s\n", smp_processor_id(), caps->desc); cpu_die_early(); @@ -1291,25 +1312,6 @@ static void __init mark_const_caps_ready(void) static_branch_enable(&arm64_const_caps_ready); } -/* - * Check if the current CPU has a given feature capability. - * Should be called from non-preemptible context. - */ -static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, - unsigned int cap) -{ - const struct arm64_cpu_capabilities *caps; - - if (WARN_ON(preemptible())) - return false; - - for (caps = cap_array; caps->desc; caps++) - if (caps->capability == cap && caps->matches) - return caps->matches(caps, SCOPE_LOCAL_CPU); - - return false; -} - extern const struct arm64_cpu_capabilities arm64_errata[]; bool this_cpu_has_cap(unsigned int cap) From a22fde8e979494f326e88ef0728367e009260d75 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 15 Jan 2018 10:51:16 +0000 Subject: [PATCH 076/104] perf: dsu: Use signed field for dsu_pmu->num_counters We set dsu_pmu->num_counters to -1, when the DSU is allocated but not initialised when none of the CPUs are active in the DSU. However, we use an unsigned field for num_counters. Switch this to a signed field. Fixes: 7520fa99246d ("perf: ARM DynamIQ Shared Unit PMU support") Reported-by: Dan Carpenter Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Suzuki K Poulose Signed-off-by: Catalin Marinas --- drivers/perf/arm_dsu_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 37c0526c93d5..93c50e377507 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -120,7 +120,7 @@ struct dsu_pmu { cpumask_t associated_cpus; cpumask_t active_cpu; struct hlist_node cpuhp_node; - u8 num_counters; + s8 num_counters; int irq; DECLARE_BITMAP(cpmceid_bitmap, DSU_PMU_MAX_COMMON_EVENTS); }; From 2f7aacf795e0192648b51674aef90e755e02408c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 15 Jan 2018 10:41:53 +0000 Subject: [PATCH 077/104] firmware: arm_sdei: Fix return value check in sdei_present_dt() In case of error, the function of_platform_device_create() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Fixes: 677a60bd2003 ("firmware: arm_sdei: Discover SDEI support via ACPI") Acked-by: James Morse Signed-off-by: Wei Yongjun Signed-off-by: Catalin Marinas --- drivers/firmware/arm_sdei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 8f6563c595e5..1ea71640fdc2 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -1023,7 +1023,7 @@ static bool __init sdei_present_dt(void) pdev = of_platform_device_create(np, sdei_driver.driver.name, NULL); of_node_put(np); - if (IS_ERR(pdev)) + if (!pdev) return false; return true; From 071929dbdd865f779a89ba3f1e06ba8d17dd3743 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 19 Dec 2017 11:28:10 -0800 Subject: [PATCH 078/104] arm64: Stop printing the virtual memory layout Printing kernel addresses should be done in limited circumstances, mostly for debugging purposes. Printing out the virtual memory layout at every kernel bootup doesn't really fall into this category so delete the prints. There are other ways to get the same information. Acked-by: Kees Cook Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5960bef0170d..672094ed7e07 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -599,49 +599,6 @@ void __init mem_init(void) mem_init_print_info(NULL); -#define MLK(b, t) b, t, ((t) - (b)) >> 10 -#define MLM(b, t) b, t, ((t) - (b)) >> 20 -#define MLG(b, t) b, t, ((t) - (b)) >> 30 -#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K) - - pr_notice("Virtual kernel memory layout:\n"); -#ifdef CONFIG_KASAN - pr_notice(" kasan : 0x%16lx - 0x%16lx (%6ld GB)\n", - MLG(KASAN_SHADOW_START, KASAN_SHADOW_END)); -#endif - pr_notice(" modules : 0x%16lx - 0x%16lx (%6ld MB)\n", - MLM(MODULES_VADDR, MODULES_END)); - pr_notice(" vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n", - MLG(VMALLOC_START, VMALLOC_END)); - pr_notice(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(_text, _etext)); - pr_notice(" .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(__start_rodata, __init_begin)); - pr_notice(" .init : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(__init_begin, __init_end)); - pr_notice(" .data : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(_sdata, _edata)); - pr_notice(" .bss : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(__bss_start, __bss_stop)); - pr_notice(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n", - MLK(FIXADDR_START, FIXADDR_TOP)); - pr_notice(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n", - MLM(PCI_IO_START, PCI_IO_END)); -#ifdef CONFIG_SPARSEMEM_VMEMMAP - pr_notice(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n", - MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE)); - pr_notice(" 0x%16lx - 0x%16lx (%6ld MB actual)\n", - MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), - (unsigned long)virt_to_page(high_memory))); -#endif - pr_notice(" memory : 0x%16lx - 0x%16lx (%6ld MB)\n", - MLM(__phys_to_virt(memblock_start_of_DRAM()), - (unsigned long)high_memory)); - -#undef MLK -#undef MLM -#undef MLK_ROUNDUP - /* * Check boundaries twice: Some fundamental inconsistencies can be * detected at build time already. From 6a205420758af5625429a2e77a2654abbf59ac09 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 15 Jan 2018 15:23:48 +0000 Subject: [PATCH 079/104] arm64: fix ID map extension to 52 bits Commit fa2a8445b1d3 added support for extending the ID map to 52 bits, but accidentally dropped a required change to __cpu_uses_extended_idmap. As a result, the kernel fails to boot when VA_BITS = 48 and the ID map text is in 52-bit physical memory, because we reduce TCR.T0SZ to cover the ID map, but then never set it back to VA_BITS. Add back the change, and also clean up some double parentheses. Fixes: fa2a8445b1d3 ("arm64: allow ID map to be extended to 52 bits") Reviewed-by: Suzuki K Poulose Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 8f80fcb49252..5aee19905556 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -72,8 +72,7 @@ extern u64 idmap_ptrs_per_pgd; static inline bool __cpu_uses_extended_idmap(void) { - return (!IS_ENABLED(CONFIG_ARM64_VA_BITS_48) && - unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS))); + return unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS)); } /* @@ -82,7 +81,7 @@ static inline bool __cpu_uses_extended_idmap(void) */ static inline bool __cpu_uses_extended_idmap_level(void) { - return ARM64_HW_PGTABLE_LEVELS((64 - idmap_t0sz)) > CONFIG_PGTABLE_LEVELS; + return ARM64_HW_PGTABLE_LEVELS(64 - idmap_t0sz) > CONFIG_PGTABLE_LEVELS; } /* From 98732d1b189b626a7593afd02dc3d2dc3f6c545a Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 15 Jan 2018 15:23:49 +0000 Subject: [PATCH 080/104] KVM: arm/arm64: fix HYP ID map extension to 52 bits Commit fa2a8445b1d3 incorrectly masks the index of the HYP ID map pgd entry, causing a non-VHE kernel to hang during boot. This happens when VA_BITS=48 and the ID map text is in 52-bit physical memory. In this case we don't need an extra table level but need more entries in the top-level table, so we need to map into hyp_pgd and need to use __kvm_idmap_ptrs_per_pgd to mask in the extra bits. However, __create_hyp_mappings currently masks by PTRS_PER_PGD instead. Fix it so that we always use __kvm_idmap_ptrs_per_pgd for the HYP ID map. This ensures that we use the larger mask for the top-level ID map table when it has more entries. In all other cases, PTRS_PER_PGD is used as normal. Fixes: fa2a8445b1d3 ("arm64: allow ID map to be extended to 52 bits") Acked-by: Marc Zyngier Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- virt/kvm/arm/mmu.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 761787befd3b..876caf531d32 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -623,21 +623,15 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, return 0; } -static int __create_hyp_mappings(pgd_t *pgdp, +static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; pud_t *pud; - unsigned long addr, next, ptrs_per_pgd = PTRS_PER_PGD; + unsigned long addr, next; int err = 0; - /* - * If it's not the hyp_pgd, fall back to the kvm idmap layout. - */ - if (pgdp != hyp_pgd) - ptrs_per_pgd = __kvm_idmap_ptrs_per_pgd(); - mutex_lock(&kvm_hyp_pgd_mutex); addr = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -705,8 +699,8 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) int err; phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); - err = __create_hyp_mappings(hyp_pgd, virt_addr, - virt_addr + PAGE_SIZE, + err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, + virt_addr, virt_addr + PAGE_SIZE, __phys_to_pfn(phys_addr), prot); if (err) @@ -737,7 +731,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)) return -EINVAL; - return __create_hyp_mappings(hyp_pgd, start, end, + return __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, start, end, __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } @@ -1743,7 +1737,7 @@ static int kvm_map_idmap_text(pgd_t *pgd) int err; /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(pgd, + err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), hyp_idmap_start, hyp_idmap_end, __phys_to_pfn(hyp_idmap_start), PAGE_HYP_EXEC); From 39610a68d9a9a9c6e71be731d071c405e4f2434b Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 15 Jan 2018 15:23:50 +0000 Subject: [PATCH 081/104] arm64: fix comment above tcr_compute_pa_size The 'pos' argument is used to select where in TCR to write the value: the IPS or PS bitfield. Fixes: 787fd1d019b2 ("arm64: limit PA size to supported range") Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 5dc4856f3bb9..1645e0d3a2c1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -360,7 +360,7 @@ alternative_endif * ID_AA64MMFR0_EL1.PARange value * * tcr: register with the TCR_ELx value to be updated - * pos: PARange bitfield position + * pos: IPS or PS bitfield position * tmp{0,1}: temporary registers */ .macro tcr_compute_pa_size, tcr, pos, tmp0, tmp1 From 894cfd149289c8c9c99211b58a2e3ae7027fff7e Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 29 Nov 2017 15:39:49 -0800 Subject: [PATCH 082/104] arm64: Inform user if software PAN is in use It isn't entirely obvious if we're using software PAN because we don't say anything about it in the boot log. But if we're using hardware PAN we'll print a nice CPU feature message indicating it. Add a print for software PAN too so we know if it's being used or not. Signed-off-by: Stephen Boyd Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f4491d16cfb..a11311397430 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1276,6 +1276,9 @@ static void verify_local_cpu_capabilities(void) if (system_supports_sve()) verify_sve_features(); + + if (system_uses_ttbr0_pan()) + pr_info("Emulating Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); } void check_local_cpu_capabilities(void) From 29d9bef13085e8477f4c9e21aca342edd50847d0 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Wed, 10 Jan 2018 19:07:26 +0000 Subject: [PATCH 083/104] arm64: Correct type for PUD macros The PUD macros (PUD_TABLE_BIT, PUD_TYPE_MASK, PUD_TYPE_SECT) use the pgdval_t even when pudval_t is available. Even though the underlying type for both (u64) is the same it is confusing and may lead to issues in the future. Fix this by using pudval_t to define the PUD_* macros. Fixes: 084bd29810a56 ("ARM64: mm: HugeTLB support.") Fixes: 206a2a73a62d3 ("arm64: mm: Create gigabyte kernel logical mappings where possible") Signed-off-by: Punit Agrawal Cc: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index e1f6679d763e..f42836da8723 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -118,9 +118,9 @@ * Level 1 descriptor (PUD). */ #define PUD_TYPE_TABLE (_AT(pudval_t, 3) << 0) -#define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) -#define PUD_TYPE_MASK (_AT(pgdval_t, 3) << 0) -#define PUD_TYPE_SECT (_AT(pgdval_t, 1) << 0) +#define PUD_TABLE_BIT (_AT(pudval_t, 1) << 1) +#define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0) +#define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0) /* * Level 2 descriptor (PMD). From 0abdeff598a66e2bf9bfcb016eb159b11fc2887a Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Dec 2017 18:34:38 +0000 Subject: [PATCH 084/104] arm64: fpsimd: Fix state leakage when migrating after sigreturn When refactoring the sigreturn code to handle SVE, I changed the sigreturn implementation to store the new FPSIMD state from the user sigframe into task_struct before reloading the state into the CPU regs. This makes it easier to convert the data for SVE when needed. However, it turns out that the fpsimd_state structure passed into fpsimd_update_current_state is not fully initialised, so assigning the structure as a whole corrupts current->thread.fpsimd_state.cpu with uninitialised data. This means that if the garbage data written to .cpu happens to be a valid cpu number, and the task is subsequently migrated to the cpu identified by the that number, and then tries to enter userspace, the CPU FPSIMD regs will be assumed to be correct for the task and not reloaded as they should be. This can result in returning to userspace with the FPSIMD registers containing data that is stale or that belongs to another task or to the kernel. Knowingly handing around a kernel structure that is incompletely initialised with user data is a potential source of mistakes, especially across source file boundaries. To help avoid a repeat of this issue, this patch adapts the relevant internal API to hand around the user-accessible subset only: struct user_fpsimd_state. To avoid future surprises, this patch also converts all uses of struct fpsimd_state that really only access the user subset, to use struct user_fpsimd_state. A few missing consts are added to function prototypes for good measure. Thanks to Will for spotting the cause of the bug here. Reported-by: Geert Uytterhoeven Signed-off-by: Dave Martin Cc: Will Deacon Cc: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fpsimd.h | 2 +- arch/arm64/kernel/fpsimd.c | 4 ++-- arch/arm64/kernel/signal.c | 7 ++++--- arch/arm64/kernel/signal32.c | 5 +++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 74f34392a531..8857a0f0d0f7 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -71,7 +71,7 @@ extern void fpsimd_flush_thread(void); extern void fpsimd_signal_preserve_current_state(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); -extern void fpsimd_update_current_state(struct fpsimd_state *state); +extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); extern void fpsimd_flush_task_state(struct task_struct *target); extern void sve_flush_cpu_state(void); diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 540a1e010eb5..55fb544072f6 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1036,14 +1036,14 @@ void fpsimd_restore_current_state(void) * flag that indicates that the FPSIMD register contents are the most recent * FPSIMD state of 'current' */ -void fpsimd_update_current_state(struct fpsimd_state *state) +void fpsimd_update_current_state(struct user_fpsimd_state const *state) { if (!system_supports_fpsimd()) return; local_bh_disable(); - current->thread.fpsimd_state = *state; + current->thread.fpsimd_state.user_fpsimd = *state; if (system_supports_sve() && test_thread_flag(TIF_SVE)) fpsimd_to_sve(current); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b120111a46be..f60c052e8d1c 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -178,7 +178,8 @@ static void __user *apply_user_offset( static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { - struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; + struct user_fpsimd_state const *fpsimd = + ¤t->thread.fpsimd_state.user_fpsimd; int err; /* copy the FP and status/control registers */ @@ -195,7 +196,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) static int restore_fpsimd_context(struct fpsimd_context __user *ctx) { - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; __u32 magic, size; int err = 0; @@ -266,7 +267,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) { int err; unsigned int vq; - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; struct sve_context sve; if (__copy_from_user(&sve, user->sve, sizeof(sve))) diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 22711ee8e36c..a124140c0926 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -228,7 +228,8 @@ union __fpsimd_vreg { static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) { - struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; + struct user_fpsimd_state const *fpsimd = + ¤t->thread.fpsimd_state.user_fpsimd; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr, fpexc; @@ -277,7 +278,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) { - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr; From edf298cfce47ab7279d03b5203ae2ef3a58e49db Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:38:54 +0000 Subject: [PATCH 085/104] arm64: cpufeature: __this_cpu_has_cap() shouldn't stop early this_cpu_has_cap() tests caps->desc not caps->matches, so it stops walking the list when it finds a 'silent' feature, instead of walking to the end of the list. Prior to v4.6's 644c2ae198412 ("arm64: cpufeature: Test 'matches' pointer to find the end of the list") we always tested desc to find the end of a capability list. This was changed for dubious things like PAN_NOT_UAO. v4.7's e3661b128e53e ("arm64: Allow a capability to be checked on single CPU") added this_cpu_has_cap() using the old desc style test. CC: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Acked-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a11311397430..630a40ec1332 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1149,9 +1149,8 @@ static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, if (WARN_ON(preemptible())) return false; - for (caps = cap_array; caps->desc; caps++) + for (caps = cap_array; caps->matches; caps++) if (caps->capability == cap && - caps->matches && caps->matches(caps, SCOPE_LOCAL_CPU)) return true; return false; From 7a00d68ebe5f07cb1db17e7fedfd031f0d87e8bb Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:38:55 +0000 Subject: [PATCH 086/104] arm64: sysreg: Move to use definitions for all the SCTLR bits __cpu_setup() configures SCTLR_EL1 using some hard coded hex masks, and el2_setup() duplicates some this when setting RES1 bits. Lets make this the same as KVM's hyp_init, which uses named bits. First, we add definitions for all the SCTLR_EL{1,2} bits, the RES{1,0} bits, and those we want to set or clear. Add a build_bug checks to ensures all bits are either set or clear. This means we don't need to preserve endian-ness configuration generated elsewhere. Finally, move the head.S and proc.S users of these hard-coded masks over to the macro versions. Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 69 +++++++++++++++++++++++++++++++-- arch/arm64/kernel/head.S | 13 ++----- arch/arm64/mm/proc.S | 24 +----------- 3 files changed, 69 insertions(+), 37 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 54e99af043c6..1a8108f84932 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -20,6 +20,7 @@ #ifndef __ASM_SYSREG_H #define __ASM_SYSREG_H +#include #include /* @@ -398,25 +399,81 @@ /* Common SCTLR_ELx flags. */ #define SCTLR_ELx_EE (1 << 25) +#define SCTLR_ELx_WXN (1 << 19) #define SCTLR_ELx_I (1 << 12) #define SCTLR_ELx_SA (1 << 3) #define SCTLR_ELx_C (1 << 2) #define SCTLR_ELx_A (1 << 1) #define SCTLR_ELx_M 1 -#define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ - (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \ - (1 << 29)) - #define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ SCTLR_ELx_SA | SCTLR_ELx_I) +/* SCTLR_EL2 specific flags. */ +#define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ + (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \ + (1 << 29)) +#define SCTLR_EL2_RES0 ((1 << 6) | (1 << 7) | (1 << 8) | (1 << 9) | \ + (1 << 10) | (1 << 13) | (1 << 14) | (1 << 15) | \ + (1 << 17) | (1 << 20) | (1 << 21) | (1 << 24) | \ + (1 << 26) | (1 << 27) | (1 << 30) | (1 << 31)) + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define ENDIAN_SET_EL2 SCTLR_ELx_EE +#define ENDIAN_CLEAR_EL2 0 +#else +#define ENDIAN_SET_EL2 0 +#define ENDIAN_CLEAR_EL2 SCTLR_ELx_EE +#endif + +/* SCTLR_EL2 value used for the hyp-stub */ +#define SCTLR_EL2_SET (ENDIAN_SET_EL2 | SCTLR_EL2_RES1) +#define SCTLR_EL2_CLEAR (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ + SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_WXN | \ + ENDIAN_CLEAR_EL2 | SCTLR_EL2_RES0) + +/* Check all the bits are accounted for */ +#define SCTLR_EL2_BUILD_BUG_ON_MISSING_BITS BUILD_BUG_ON((SCTLR_EL2_SET ^ SCTLR_EL2_CLEAR) != ~0) + + /* SCTLR_EL1 specific flags. */ #define SCTLR_EL1_UCI (1 << 26) +#define SCTLR_EL1_E0E (1 << 24) #define SCTLR_EL1_SPAN (1 << 23) +#define SCTLR_EL1_NTWE (1 << 18) +#define SCTLR_EL1_NTWI (1 << 16) #define SCTLR_EL1_UCT (1 << 15) +#define SCTLR_EL1_DZE (1 << 14) +#define SCTLR_EL1_UMA (1 << 9) #define SCTLR_EL1_SED (1 << 8) +#define SCTLR_EL1_ITD (1 << 7) #define SCTLR_EL1_CP15BEN (1 << 5) +#define SCTLR_EL1_SA0 (1 << 4) + +#define SCTLR_EL1_RES1 ((1 << 11) | (1 << 20) | (1 << 22) | (1 << 28) | \ + (1 << 29)) +#define SCTLR_EL1_RES0 ((1 << 6) | (1 << 10) | (1 << 13) | (1 << 17) | \ + (1 << 21) | (1 << 27) | (1 << 30) | (1 << 31)) + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define ENDIAN_SET_EL1 (SCTLR_EL1_E0E | SCTLR_ELx_EE) +#define ENDIAN_CLEAR_EL1 0 +#else +#define ENDIAN_SET_EL1 0 +#define ENDIAN_CLEAR_EL1 (SCTLR_EL1_E0E | SCTLR_ELx_EE) +#endif + +#define SCTLR_EL1_SET (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |\ + SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I |\ + SCTLR_EL1_DZE | SCTLR_EL1_UCT | SCTLR_EL1_NTWI |\ + SCTLR_EL1_NTWE | SCTLR_EL1_SPAN | ENDIAN_SET_EL1 |\ + SCTLR_EL1_UCI | SCTLR_EL1_RES1) +#define SCTLR_EL1_CLEAR (SCTLR_ELx_A | SCTLR_EL1_CP15BEN | SCTLR_EL1_ITD |\ + SCTLR_EL1_UMA | SCTLR_ELx_WXN | ENDIAN_CLEAR_EL1 |\ + SCTLR_EL1_RES0) + +/* Check all the bits are accounted for */ +#define SCTLR_EL1_BUILD_BUG_ON_MISSING_BITS BUILD_BUG_ON((SCTLR_EL1_SET ^ SCTLR_EL1_CLEAR) != ~0) /* id_aa64isar0 */ #define ID_AA64ISAR0_FHM_SHIFT 48 @@ -593,6 +650,7 @@ #else +#include #include asm( @@ -649,6 +707,9 @@ static inline void config_sctlr_el1(u32 clear, u32 set) { u32 val; + SCTLR_EL2_BUILD_BUG_ON_MISSING_BITS; + SCTLR_EL1_BUILD_BUG_ON_MISSING_BITS; + val = read_sysreg(sctlr_el1); val &= ~clear; val |= set; diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 95748a00eb89..c3b241b8b659 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -492,17 +492,13 @@ ENTRY(el2_setup) mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq 1f - mrs x0, sctlr_el1 -CPU_BE( orr x0, x0, #(3 << 24) ) // Set the EE and E0E bits for EL1 -CPU_LE( bic x0, x0, #(3 << 24) ) // Clear the EE and E0E bits for EL1 + mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) msr sctlr_el1, x0 mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 isb ret -1: mrs x0, sctlr_el2 -CPU_BE( orr x0, x0, #(1 << 25) ) // Set the EE bit for EL2 -CPU_LE( bic x0, x0, #(1 << 25) ) // Clear the EE bit for EL2 +1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) msr sctlr_el2, x0 #ifdef CONFIG_ARM64_VHE @@ -618,10 +614,7 @@ install_el2_stub: * requires no configuration, and all non-hyp-specific EL2 setup * will be done via the _EL1 system register aliases in __cpu_setup. */ - /* sctlr_el1 */ - mov x0, #0x0800 // Set/clear RES{1,0} bits -CPU_BE( movk x0, #0x33d0, lsl #16 ) // Set EE and E0E on BE systems -CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems + mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) msr sctlr_el1, x0 /* Coprocessor traps. */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 5a59eea49395..ec4c3d82b4d6 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -226,11 +226,7 @@ ENTRY(__cpu_setup) /* * Prepare SCTLR */ - adr x5, crval - ldp w5, w6, [x5] - mrs x0, sctlr_el1 - bic x0, x0, x5 // clear bits - orr x0, x0, x6 // set bits + mov_q x0, SCTLR_EL1_SET /* * Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for * both user and kernel. @@ -259,21 +255,3 @@ ENTRY(__cpu_setup) msr tcr_el1, x10 ret // return to head.S ENDPROC(__cpu_setup) - - /* - * We set the desired value explicitly, including those of the - * reserved bits. The values of bits EE & E0E were set early in - * el2_setup, which are left untouched below. - * - * n n T - * U E WT T UD US IHBS - * CE0 XWHW CZ ME TEEA S - * .... .IEE .... NEAI TE.I ..AD DEN0 ACAM - * 0011 0... 1101 ..0. ..0. 10.. .0.. .... < hardware reserved - * .... .1.. .... 01.1 11.1 ..01 0.01 1101 < software settings - */ - .type crval, #object -crval: - .word 0xfcffffff // clear - .word 0x34d5d91d // set - .popsection From 64c02720ea3598bf5143b672274d923a941b8053 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 15 Jan 2018 19:38:56 +0000 Subject: [PATCH 087/104] arm64: cpufeature: Detect CPU RAS Extentions ARM's v8.2 Extentions add support for Reliability, Availability and Serviceability (RAS). On CPUs with these extensions system software can use additional barriers to isolate errors and determine if faults are pending. Add cpufeature detection. Platform level RAS support may require additional firmware support. Reviewed-by: Suzuki K Poulose Signed-off-by: Xie XiuQi [Rebased added config option, reworded commit message] Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 16 ++++++++++++++++ arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/sysreg.h | 2 ++ arch/arm64/kernel/cpufeature.c | 13 +++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 664fadc2aa2e..1d51c8edf34b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1062,6 +1062,22 @@ config ARM64_PMEM operations if DC CVAP is not supported (following the behaviour of DC CVAP itself if the system does not define a point of persistence). +config ARM64_RAS_EXTN + bool "Enable support for RAS CPU Extensions" + default y + help + CPUs that support the Reliability, Availability and Serviceability + (RAS) Extensions, part of ARMv8.2 are able to track faults and + errors, classify them and report them to software. + + On CPUs with these extensions system software can use additional + barriers to determine if faults are pending and read the + classification from a new set of registers. + + Selecting this feature will allow the kernel to use these barriers + and access the new registers if the system supports the extension. + Platform RAS features may additionally depend on firmware support. + endmenu config ARM64_SVE diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 7049b4802587..bb263820de13 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -44,7 +44,8 @@ #define ARM64_UNMAP_KERNEL_AT_EL0 23 #define ARM64_HARDEN_BRANCH_PREDICTOR 24 #define ARM64_HARDEN_BP_POST_GUEST_EXIT 25 +#define ARM64_HAS_RAS_EXTN 26 -#define ARM64_NCAPS 26 +#define ARM64_NCAPS 27 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 1a8108f84932..321622e9f9c3 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -498,6 +498,7 @@ #define ID_AA64PFR0_CSV3_SHIFT 60 #define ID_AA64PFR0_CSV2_SHIFT 56 #define ID_AA64PFR0_SVE_SHIFT 32 +#define ID_AA64PFR0_RAS_SHIFT 28 #define ID_AA64PFR0_GIC_SHIFT 24 #define ID_AA64PFR0_ASIMD_SHIFT 20 #define ID_AA64PFR0_FP_SHIFT 16 @@ -507,6 +508,7 @@ #define ID_AA64PFR0_EL0_SHIFT 0 #define ID_AA64PFR0_SVE 0x1 +#define ID_AA64PFR0_RAS_V1 0x1 #define ID_AA64PFR0_FP_NI 0xf #define ID_AA64PFR0_FP_SUPPORTED 0x0 #define ID_AA64PFR0_ASIMD_NI 0xf diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 630a40ec1332..116a5c9443a7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -149,6 +149,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), @@ -1028,6 +1029,18 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .enable = sve_kernel_enable, }, #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_RAS_EXTN + { + .desc = "RAS Extension Support", + .capability = ARM64_HAS_RAS_EXTN, + .def_scope = SCOPE_SYSTEM, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_RAS_SHIFT, + .min_field_value = ID_AA64PFR0_RAS_V1, + }, +#endif /* CONFIG_ARM64_RAS_EXTN */ {}, }; From 6bf0dcfd713563bd2e13ceb53217305c28a8aa5f Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:38:57 +0000 Subject: [PATCH 088/104] arm64: kernel: Survive corrected RAS errors notified by SError Prior to v8.2, SError is an uncontainable fatal exception. The v8.2 RAS extensions use SError to notify software about RAS errors, these can be contained by the Error Syncronization Barrier. An ACPI system with firmware-first may use SError as its 'SEI' notification. Future patches may add code to 'claim' this SError as a notification. Other systems can distinguish these RAS errors from the SError ESR and use the AET bits and additional data from RAS-Error registers to handle the error. Future patches may add this kernel-first handling. Without support for either of these we will panic(), even if we received a corrected error. Add code to decode the severity of RAS errors. We can safely ignore contained errors where the CPU can continue to make progress. For all other errors we continue to panic(). Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/esr.h | 13 ++++++++ arch/arm64/include/asm/traps.h | 54 ++++++++++++++++++++++++++++++++++ arch/arm64/kernel/traps.c | 51 ++++++++++++++++++++++++++++---- 3 files changed, 113 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 014d7d8edcf9..c367838700fa 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -86,6 +86,18 @@ #define ESR_ELx_WNR_SHIFT (6) #define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) +/* Asynchronous Error Type */ +#define ESR_ELx_IDS_SHIFT (24) +#define ESR_ELx_IDS (UL(1) << ESR_ELx_IDS_SHIFT) +#define ESR_ELx_AET_SHIFT (10) +#define ESR_ELx_AET (UL(0x7) << ESR_ELx_AET_SHIFT) + +#define ESR_ELx_AET_UC (UL(0) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEU (UL(1) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEO (UL(2) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UER (UL(3) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) + /* Shared ISS field definitions for Data/Instruction aborts */ #define ESR_ELx_SET_SHIFT (11) #define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) @@ -100,6 +112,7 @@ #define ESR_ELx_FSC (0x3F) #define ESR_ELx_FSC_TYPE (0x3C) #define ESR_ELx_FSC_EXTABT (0x10) +#define ESR_ELx_FSC_SERROR (0x11) #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index 1696f9de9359..178e338d2889 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -19,6 +19,7 @@ #define __ASM_TRAP_H #include +#include #include struct pt_regs; @@ -66,4 +67,57 @@ static inline int in_entry_text(unsigned long ptr) return ptr >= (unsigned long)&__entry_text_start && ptr < (unsigned long)&__entry_text_end; } + +/* + * CPUs with the RAS extensions have an Implementation-Defined-Syndrome bit + * to indicate whether this ESR has a RAS encoding. CPUs without this feature + * have a ISS-Valid bit in the same position. + * If this bit is set, we know its not a RAS SError. + * If its clear, we need to know if the CPU supports RAS. Uncategorized RAS + * errors share the same encoding as an all-zeros encoding from a CPU that + * doesn't support RAS. + */ +static inline bool arm64_is_ras_serror(u32 esr) +{ + WARN_ON(preemptible()); + + if (esr & ESR_ELx_IDS) + return false; + + if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) + return true; + else + return false; +} + +/* + * Return the AET bits from a RAS SError's ESR. + * + * It is implementation defined whether Uncategorized errors are containable. + * We treat them as Uncontainable. + * Non-RAS SError's are reported as Uncontained/Uncategorized. + */ +static inline u32 arm64_ras_serror_get_severity(u32 esr) +{ + u32 aet = esr & ESR_ELx_AET; + + if (!arm64_is_ras_serror(esr)) { + /* Not a RAS error, we can't interpret the ESR. */ + return ESR_ELx_AET_UC; + } + + /* + * AET is RES0 if 'the value returned in the DFSC field is not + * [ESR_ELx_FSC_SERROR]' + */ + if ((esr & ESR_ELx_FSC) != ESR_ELx_FSC_SERROR) { + /* No severity information : Uncategorized */ + return ESR_ELx_AET_UC; + } + + return aet; +} + +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr); +void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr); #endif diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 3d3588fcd1c7..bbb0fde2780e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -662,17 +662,58 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs) } #endif -asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) +void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) { - nmi_enter(); - console_verbose(); pr_crit("SError Interrupt on CPU%d, code 0x%08x -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); - __show_regs(regs); + if (regs) + __show_regs(regs); - panic("Asynchronous SError Interrupt"); + nmi_panic(regs, "Asynchronous SError Interrupt"); + + cpu_park_loop(); + unreachable(); +} + +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) +{ + u32 aet = arm64_ras_serror_get_severity(esr); + + switch (aet) { + case ESR_ELx_AET_CE: /* corrected error */ + case ESR_ELx_AET_UEO: /* restartable, not yet consumed */ + /* + * The CPU can make progress. We may take UEO again as + * a more severe error. + */ + return false; + + case ESR_ELx_AET_UEU: /* Uncorrected Unrecoverable */ + case ESR_ELx_AET_UER: /* Uncorrected Recoverable */ + /* + * The CPU can't make progress. The exception may have + * been imprecise. + */ + return true; + + case ESR_ELx_AET_UC: /* Uncontainable or Uncategorized error */ + default: + /* Error has been silently propagated */ + arm64_serror_panic(regs, esr); + } +} + +asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) +{ + nmi_enter(); + + /* non-RAS errors are not containable */ + if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) + arm64_serror_panic(regs, esr); + + nmi_exit(); } void __pte_error(const char *file, int line, unsigned long val) From f751daa4f9d3da07e2777ea0c1ba2d58ff2c860f Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:38:58 +0000 Subject: [PATCH 089/104] arm64: Unconditionally enable IESB on exception entry/return for firmware-first ARM v8.2 has a feature to add implicit error synchronization barriers whenever the CPU enters or returns from an exception level. Add this to the features we always enable. CPUs that don't support this feature will treat the bit as RES0. This feature causes RAS errors that are not yet visible to software to become pending SErrors. We expect to have firmware-first RAS support so synchronised RAS errors will be take immediately to EL3. Any system without firmware-first handling of errors will take the SError either immediatly after exception return, or when we unmask SError after entry.S's work. Adding IESB to the ELx flags causes it to be enabled by KVM and kexec too. Platform level RAS support may require additional firmware support. Cc: Christoffer Dall Suggested-by: Will Deacon Link: https://www.spinics.net/lists/kvm-arm/msg28192.html Acked-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 321622e9f9c3..1281bc8263c2 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -399,6 +399,7 @@ /* Common SCTLR_ELx flags. */ #define SCTLR_ELx_EE (1 << 25) +#define SCTLR_ELx_IESB (1 << 21) #define SCTLR_ELx_WXN (1 << 19) #define SCTLR_ELx_I (1 << 12) #define SCTLR_ELx_SA (1 << 3) @@ -406,8 +407,8 @@ #define SCTLR_ELx_A (1 << 1) #define SCTLR_ELx_M 1 -#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ - SCTLR_ELx_SA | SCTLR_ELx_I) +#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ + SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB) /* SCTLR_EL2 specific flags. */ #define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ @@ -415,8 +416,8 @@ (1 << 29)) #define SCTLR_EL2_RES0 ((1 << 6) | (1 << 7) | (1 << 8) | (1 << 9) | \ (1 << 10) | (1 << 13) | (1 << 14) | (1 << 15) | \ - (1 << 17) | (1 << 20) | (1 << 21) | (1 << 24) | \ - (1 << 26) | (1 << 27) | (1 << 30) | (1 << 31)) + (1 << 17) | (1 << 20) | (1 << 24) | (1 << 26) | \ + (1 << 27) | (1 << 30) | (1 << 31)) #ifdef CONFIG_CPU_BIG_ENDIAN #define ENDIAN_SET_EL2 SCTLR_ELx_EE @@ -427,7 +428,7 @@ #endif /* SCTLR_EL2 value used for the hyp-stub */ -#define SCTLR_EL2_SET (ENDIAN_SET_EL2 | SCTLR_EL2_RES1) +#define SCTLR_EL2_SET (SCTLR_ELx_IESB | ENDIAN_SET_EL2 | SCTLR_EL2_RES1) #define SCTLR_EL2_CLEAR (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_WXN | \ ENDIAN_CLEAR_EL2 | SCTLR_EL2_RES0) @@ -453,7 +454,7 @@ #define SCTLR_EL1_RES1 ((1 << 11) | (1 << 20) | (1 << 22) | (1 << 28) | \ (1 << 29)) #define SCTLR_EL1_RES0 ((1 << 6) | (1 << 10) | (1 << 13) | (1 << 17) | \ - (1 << 21) | (1 << 27) | (1 << 30) | (1 << 31)) + (1 << 27) | (1 << 30) | (1 << 31)) #ifdef CONFIG_CPU_BIG_ENDIAN #define ENDIAN_SET_EL1 (SCTLR_EL1_E0E | SCTLR_ELx_EE) @@ -466,8 +467,8 @@ #define SCTLR_EL1_SET (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |\ SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I |\ SCTLR_EL1_DZE | SCTLR_EL1_UCT | SCTLR_EL1_NTWI |\ - SCTLR_EL1_NTWE | SCTLR_EL1_SPAN | ENDIAN_SET_EL1 |\ - SCTLR_EL1_UCI | SCTLR_EL1_RES1) + SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\ + ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_RES1) #define SCTLR_EL1_CLEAR (SCTLR_ELx_A | SCTLR_EL1_CP15BEN | SCTLR_EL1_ITD |\ SCTLR_EL1_UMA | SCTLR_ELx_WXN | ENDIAN_CLEAR_EL1 |\ SCTLR_EL1_RES0) From 68ddbf09ec5a888ec850edd7e7438d2daf069c56 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:38:59 +0000 Subject: [PATCH 090/104] arm64: kernel: Prepare for a DISR user KVM would like to consume any pending SError (or RAS error) after guest exit. Today it has to unmask SError and use dsb+isb to synchronise the CPU. With the RAS extensions we can use ESB to synchronise any pending SError. Add the necessary macros to allow DISR to be read and converted to an ESR. We clear the DISR register when we enable the RAS cpufeature, and the kernel has not executed any ESB instructions. Any value we find in DISR must have belonged to firmware. Executing an ESB instruction is the only way to update DISR, so we can expect firmware to have handled any deferred SError. By the same logic we clear DISR in the idle path. Reviewed-by: Suzuki K Poulose Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 7 +++++++ arch/arm64/include/asm/esr.h | 7 +++++++ arch/arm64/include/asm/exception.h | 14 ++++++++++++++ arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kernel/cpufeature.c | 9 +++++++++ arch/arm64/mm/proc.S | 5 +++++ 7 files changed, 44 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 1645e0d3a2c1..794fe8122602 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -108,6 +108,13 @@ dmb \opt .endm +/* + * RAS Error Synchronization barrier + */ + .macro esb + hint #16 + .endm + /* * NOP sequence */ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index c367838700fa..803443d74926 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -140,6 +140,13 @@ #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) #define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) +#define DISR_EL1_IDS (UL(1) << 24) +/* + * DISR_EL1 and ESR_ELx share the bottom 13 bits, but the RES0 bits may mean + * different things in the future... + */ +#define DISR_EL1_ESR_MASK (ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC) + /* ESR value templates for specific events */ /* BRK instruction trap from AArch64 state */ diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 0c2eec490abf..bc30429d8e91 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -18,6 +18,8 @@ #ifndef __ASM_EXCEPTION_H #define __ASM_EXCEPTION_H +#include + #include #define __exception __attribute__((section(".exception.text"))) @@ -27,4 +29,16 @@ #define __exception_irq_entry __exception #endif +static inline u32 disr_to_esr(u64 disr) +{ + unsigned int esr = ESR_ELx_EC_SERROR << ESR_ELx_EC_SHIFT; + + if ((disr & DISR_EL1_IDS) == 0) + esr |= (disr & DISR_EL1_ESR_MASK); + else + esr |= (disr & ESR_ELx_ISS_MASK); + + return esr; +} + #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 023cacb946c3..cee4ae25a5d1 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -216,6 +216,7 @@ static inline void spin_lock_prefetch(const void *ptr) int cpu_enable_pan(void *__unused); int cpu_enable_cache_maint_trap(void *__unused); +int cpu_clear_disr(void *__unused); /* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */ #define SVE_SET_VL(arg) sve_set_current_vl(arg) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 1281bc8263c2..b5d543fc677d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -279,6 +279,7 @@ #define SYS_AMAIR_EL1 sys_reg(3, 0, 10, 3, 0) #define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0) +#define SYS_DISR_EL1 sys_reg(3, 0, 12, 1, 1) #define SYS_ICC_IAR0_EL1 sys_reg(3, 0, 12, 8, 0) #define SYS_ICC_EOIR0_EL1 sys_reg(3, 0, 12, 8, 1) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 116a5c9443a7..5612d6f46331 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1039,6 +1039,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_RAS_SHIFT, .min_field_value = ID_AA64PFR0_RAS_V1, + .enable = cpu_clear_disr, }, #endif /* CONFIG_ARM64_RAS_EXTN */ {}, @@ -1470,3 +1471,11 @@ static int __init enable_mrs_emulation(void) } core_initcall(enable_mrs_emulation); + +int cpu_clear_disr(void *__unused) +{ + /* Firmware may have left a deferred SError in this register. */ + write_sysreg_s(0, SYS_DISR_EL1); + + return 0; +} diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index ec4c3d82b4d6..05e4ae934b23 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -132,6 +132,11 @@ alternative_endif ubfx x11, x11, #1, #1 msr oslar_el1, x11 reset_pmuserenr_el0 x0 // Disable PMU access from EL0 + +alternative_if ARM64_HAS_RAS_EXTN + msr_s SYS_DISR_EL1, xzr +alternative_else_nop_endif + isb ret ENDPROC(cpu_do_resume) From 4f5abad9e826bd579b0661efa32682d9c9bc3fa8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:39:00 +0000 Subject: [PATCH 091/104] KVM: arm/arm64: mask/unmask daif around VHE guests Non-VHE systems take an exception to EL2 in order to world-switch into the guest. When returning from the guest KVM implicitly restores the DAIF flags when it returns to the kernel at EL1. With VHE none of this exception-level jumping happens, so KVMs world-switch code is exposed to the host kernel's DAIF values, and KVM spills the guest-exit DAIF values back into the host kernel. On entry to a guest we have Debug and SError exceptions unmasked, KVM has switched VBAR but isn't prepared to handle these. On guest exit Debug exceptions are left disabled once we return to the host and will stay this way until we enter user space. Add a helper to mask/unmask DAIF around VHE guests. The unmask can only happen after the hosts VBAR value has been synchronised by the isb in __vhe_hyp_call (via kvm_call_hyp()). Masking could be as late as setting KVMs VBAR value, but is kept here for symmetry. Acked-by: Marc Zyngier Signed-off-by: James Morse Reviewed-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm/include/asm/kvm_host.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 10 ++++++++++ virt/kvm/arm/arm.c | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index a9f7d3f47134..b86fc4162539 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -301,4 +301,6 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, /* All host FP/SIMD state is restored on guest exit, so nothing to save: */ static inline void kvm_fpsimd_flush_cpu_state(void) {} +static inline void kvm_arm_vhe_guest_enter(void) {} +static inline void kvm_arm_vhe_guest_exit(void) {} #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7ee72b402907..dcdd08edf5a5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -398,4 +399,13 @@ static inline void kvm_fpsimd_flush_cpu_state(void) sve_flush_cpu_state(); } +static inline void kvm_arm_vhe_guest_enter(void) +{ + local_daif_mask(); +} + +static inline void kvm_arm_vhe_guest_exit(void) +{ + local_daif_restore(DAIF_PROCCTX_NOIRQ); +} #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 2fc6009a766c..38e81631fc91 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -704,9 +704,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ trace_kvm_entry(*vcpu_pc(vcpu)); guest_enter_irqoff(); + if (has_vhe()) + kvm_arm_vhe_guest_enter(); ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); + if (has_vhe()) + kvm_arm_vhe_guest_exit(); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; /* From 4715c14bc136687bb79d12e24aafdc0f38786eb7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:39:01 +0000 Subject: [PATCH 092/104] KVM: arm64: Set an impdef ESR for Virtual-SError using VSESR_EL2. Prior to v8.2's RAS Extensions, the HCR_EL2.VSE 'virtual SError' feature generated an SError with an implementation defined ESR_EL1.ISS, because we had no mechanism to specify the ESR value. On Juno this generates an all-zero ESR, the most significant bit 'ISV' is clear indicating the remainder of the ISS field is invalid. With the RAS Extensions we have a mechanism to specify this value, and the most significant bit has a new meaning: 'IDS - Implementation Defined Syndrome'. An all-zero SError ESR now means: 'RAS error: Uncategorized' instead of 'no valid ISS'. Add KVM support for the VSESR_EL2 register to specify an ESR value when HCR_EL2.VSE generates a virtual SError. Change kvm_inject_vabt() to specify an implementation-defined value. We only need to restore the VSESR_EL2 value when HCR_EL2.VSE is set, KVM save/restores this bit during __{,de}activate_traps() and hardware clears the bit once the guest has consumed the virtual-SError. Future patches may add an API (or KVM CAP) to pend a virtual SError with a specified ESR. Cc: Dongjiu Geng Reviewed-by: Marc Zyngier Signed-off-by: James Morse Reviewed-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kvm/hyp/switch.c | 3 +++ arch/arm64/kvm/inject_fault.c | 13 ++++++++++++- 5 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 5f28dfa14cee..6d3614795197 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -64,6 +64,11 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr) vcpu->arch.hcr_el2 = hcr; } +static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr) +{ + vcpu->arch.vsesr_el2 = vsesr; +} + static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu) { return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index dcdd08edf5a5..3014b39b8fe2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -280,6 +280,9 @@ struct kvm_vcpu_arch { /* Detect first run of a vcpu */ bool has_run_once; + + /* Virtual SError ESR to restore when HCR_EL2.VSE is set */ + u64 vsesr_el2; }; #define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b5d543fc677d..52cfdc216bcf 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -355,6 +355,7 @@ #define SYS_DACR32_EL2 sys_reg(3, 4, 3, 0, 0) #define SYS_IFSR32_EL2 sys_reg(3, 4, 5, 0, 1) +#define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3) #define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0) #define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 324f4202cdd5..b425b8aab45b 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -94,6 +94,9 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) write_sysreg(val, hcr_el2); + if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (val & HCR_VSE)) + write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2); + /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); /* diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 8ecbcb40e317..60666a056944 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -164,14 +164,25 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu) inject_undef64(vcpu); } +static void pend_guest_serror(struct kvm_vcpu *vcpu, u64 esr) +{ + vcpu_set_vsesr(vcpu, esr); + vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE); +} + /** * kvm_inject_vabt - inject an async abort / SError into the guest * @vcpu: The VCPU to receive the exception * * It is assumed that this code is called from the VCPU thread and that the * VCPU therefore is not currently executing guest code. + * + * Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with + * the remaining ISS all-zeros so that this error is not interpreted as an + * uncategorized RAS error. Without the RAS Extensions we can't specify an ESR + * value, so the CPU generates an imp-def value. */ void kvm_inject_vabt(struct kvm_vcpu *vcpu) { - vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE); + pend_guest_serror(vcpu, ESR_ELx_ISV); } From c773ae2b34760a1ae409614aa31cdded81a645a5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:39:02 +0000 Subject: [PATCH 093/104] KVM: arm64: Save/Restore guest DISR_EL1 If we deliver a virtual SError to the guest, the guest may defer it with an ESB instruction. The guest reads the deferred value via DISR_EL1, but the guests view of DISR_EL1 is re-mapped to VDISR_EL2 when HCR_EL2.AMO is set. Add the KVM code to save/restore VDISR_EL2, and make it accessible to userspace as DISR_EL1. Signed-off-by: James Morse Reviewed-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kvm/hyp/sysreg-sr.c | 6 ++++++ arch/arm64/kvm/sys_regs.c | 1 + 4 files changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3014b39b8fe2..84fcb2a896a1 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -121,6 +121,7 @@ enum vcpu_sysreg { PAR_EL1, /* Physical Address Register */ MDSCR_EL1, /* Monitor Debug System Control Register */ MDCCINT_EL1, /* Monitor Debug Comms Channel Interrupt Enable Reg */ + DISR_EL1, /* Deferred Interrupt Status Register */ /* Performance Monitors Registers */ PMCR_EL0, /* Control Register */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 52cfdc216bcf..53d99c0944b1 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -358,6 +358,7 @@ #define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3) #define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0) +#define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1) #define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x) #define SYS_ICH_AP0R0_EL2 __SYS__AP0Rx_EL2(0) #define SYS_ICH_AP0R1_EL2 __SYS__AP0Rx_EL2(1) diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index c54cc2afb92b..2c17afd2be96 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -66,6 +66,9 @@ static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr); ctxt->gp_regs.regs.pc = read_sysreg_el2(elr); ctxt->gp_regs.regs.pstate = read_sysreg_el2(spsr); + + if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) + ctxt->sys_regs[DISR_EL1] = read_sysreg_s(SYS_VDISR_EL2); } static hyp_alternate_select(__sysreg_call_save_host_state, @@ -119,6 +122,9 @@ static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr); write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); + + if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) + write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2); } static hyp_alternate_select(__sysreg_call_restore_host_state, diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1830ebc227d1..9edf4ac8a320 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1169,6 +1169,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, + { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, From c60590b552bdf682043579b9b965e6224fbf65d9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:39:03 +0000 Subject: [PATCH 094/104] KVM: arm64: Save ESR_EL2 on guest SError When we exit a guest due to an SError the vcpu fault info isn't updated with the ESR. Today this is only done for traps. The v8.2 RAS Extensions define ISS values for SError. Update the vcpu's fault_info with the ESR on SError so that handle_exit() can determine if this was a RAS SError and decode its severity. Acked-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/kvm/hyp/switch.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index b425b8aab45b..036e1f3d77a6 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -239,11 +239,12 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) { - u64 esr = read_sysreg_el2(esr); - u8 ec = ESR_ELx_EC(esr); + u8 ec; + u64 esr; u64 hpfar, far; - vcpu->arch.fault.esr_el2 = esr; + esr = vcpu->arch.fault.esr_el2; + ec = ESR_ELx_EC(esr); if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) return true; @@ -336,6 +337,8 @@ again: exit_code = __guest_enter(vcpu, host_ctxt); /* And we're baaack! */ + if (ARM_EXCEPTION_CODE(exit_code) != ARM_EXCEPTION_IRQ) + vcpu->arch.fault.esr_el2 = read_sysreg_el2(esr); /* * We're using the raw exception code in order to only process * the trap if no SError is pending. We will come back to the From 3368bd809764d3ef0810e16c1e1531fec32e8d8e Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:39:04 +0000 Subject: [PATCH 095/104] KVM: arm64: Handle RAS SErrors from EL1 on guest exit We expect to have firmware-first handling of RAS SErrors, with errors notified via an APEI method. For systems without firmware-first, add some minimal handling to KVM. There are two ways KVM can take an SError due to a guest, either may be a RAS error: we exit the guest due to an SError routed to EL2 by HCR_EL2.AMO, or we take an SError from EL2 when we unmask PSTATE.A from __guest_exit. For SError that interrupt a guest and are routed to EL2 the existing behaviour is to inject an impdef SError into the guest. Add code to handle RAS SError based on the ESR. For uncontained and uncategorized errors arm64_is_fatal_ras_serror() will panic(), these errors compromise the host too. All other error types are contained: For the fatal errors the vCPU can't make progress, so we inject a virtual SError. We ignore contained errors where we can make progress as if we're lucky, we may not hit them again. If only some of the CPUs support RAS the guest will see the cpufeature sanitised version of the id registers, but we may still take RAS SError on this CPU. Move the SError handling out of handle_exit() into a new handler that runs before we can be preempted. This allows us to use this_cpu_has_cap(), via arm64_is_ras_serror(). Acked-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/handle_exit.c | 18 +++++++++++++++++- virt/kvm/arm/arm.c | 3 +++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index b86fc4162539..acbf9ec7b396 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -238,6 +238,9 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); +static inline void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, + int exception_index) {} + static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 84fcb2a896a1..abcfd164e690 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -347,6 +347,8 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); +void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, + int exception_index); int kvm_perf_init(void); int kvm_perf_teardown(void); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 304203fa9e33..6a5a5db4292f 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -29,12 +29,19 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); +static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr) +{ + if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr)) + kvm_inject_vabt(vcpu); +} + static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret; @@ -252,7 +259,6 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_IRQ: return 1; case ARM_EXCEPTION_EL1_SERROR: - kvm_inject_vabt(vcpu); /* We may still need to return for single-step */ if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS) && kvm_arm_handle_step_debug(vcpu, run)) @@ -275,3 +281,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, return 0; } } + +/* For exit types that need handling before we can be preempted */ +void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, + int exception_index) +{ + exception_index = ARM_EXCEPTION_CODE(exception_index); + + if (exception_index == ARM_EXCEPTION_EL1_SERROR) + kvm_handle_guest_serror(vcpu, kvm_vcpu_get_hsr(vcpu)); +} diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 38e81631fc91..15bf026eb182 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -763,6 +763,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, run, ret); + preempt_enable(); ret = handle_exit(vcpu, run, ret); From 0067df413bd9d7e9ee3a78ece1e1a93535378862 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jan 2018 19:39:05 +0000 Subject: [PATCH 096/104] KVM: arm64: Handle RAS SErrors from EL2 on guest exit We expect to have firmware-first handling of RAS SErrors, with errors notified via an APEI method. For systems without firmware-first, add some minimal handling to KVM. There are two ways KVM can take an SError due to a guest, either may be a RAS error: we exit the guest due to an SError routed to EL2 by HCR_EL2.AMO, or we take an SError from EL2 when we unmask PSTATE.A from __guest_exit. The current SError from EL2 code unmasks SError and tries to fence any pending SError into a single instruction window. It then leaves SError unmasked. With the v8.2 RAS Extensions we may take an SError for a 'corrected' error, but KVM is only able to handle SError from EL2 if they occur during this single instruction window... The RAS Extensions give us a new instruction to synchronise and consume SErrors. The RAS Extensions document (ARM DDI0587), '2.4.1 ESB and Unrecoverable errors' describes ESB as synchronising SError interrupts generated by 'instructions, translation table walks, hardware updates to the translation tables, and instruction fetches on the same PE'. This makes ESB equivalent to KVMs existing 'dsb, mrs-daifclr, isb' sequence. Use the alternatives to synchronise and consume any SError using ESB instead of unmasking and taking the SError. Set ARM_EXIT_WITH_SERROR_BIT in the exit_code so that we can restart the vcpu if it turns out this SError has no impact on the vcpu. Reviewed-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kvm/handle_exit.c | 14 +++++++++++++- arch/arm64/kvm/hyp/entry.S | 13 +++++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 6d3614795197..e002ab7f919a 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -176,6 +176,11 @@ static inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu) return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8; } +static inline u64 kvm_vcpu_get_disr(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.fault.disr_el1; +} + static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_xVC_IMM_MASK; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index abcfd164e690..4485ae8e98de 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -90,6 +90,7 @@ struct kvm_vcpu_fault_info { u32 esr_el2; /* Hyp Syndrom Register */ u64 far_el2; /* Hyp Fault Address Register */ u64 hpfar_el2; /* Hyp IPA Fault Address Register */ + u64 disr_el1; /* Deferred [SError] Status Register */ }; /* diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 1dcc493f5765..1303e04110cd 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -132,6 +132,7 @@ int main(void) BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); + DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs)); DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs)); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 6a5a5db4292f..c09fc5a576c7 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -249,7 +250,6 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, *vcpu_pc(vcpu) -= adj; } - kvm_inject_vabt(vcpu); return 1; } @@ -286,6 +286,18 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index) { + if (ARM_SERROR_PENDING(exception_index)) { + if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) { + u64 disr = kvm_vcpu_get_disr(vcpu); + + kvm_handle_guest_serror(vcpu, disr_to_esr(disr)); + } else { + kvm_inject_vabt(vcpu); + } + + return; + } + exception_index = ARM_EXCEPTION_CODE(exception_index); if (exception_index == ARM_EXCEPTION_EL1_SERROR) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index fe4678f20a85..fdd1068ee3a5 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -124,6 +124,17 @@ ENTRY(__guest_exit) // Now restore the host regs restore_callee_saved_regs x2 +alternative_if ARM64_HAS_RAS_EXTN + // If we have the RAS extensions we can consume a pending error + // without an unmask-SError and isb. + esb + mrs_s x2, SYS_DISR_EL1 + str x2, [x1, #(VCPU_FAULT_DISR - VCPU_CONTEXT)] + cbz x2, 1f + msr_s SYS_DISR_EL1, xzr + orr x0, x0, #(1< Date: Mon, 15 Jan 2018 19:39:06 +0000 Subject: [PATCH 097/104] KVM: arm64: Emulate RAS error registers and set HCR_EL2's TERR & TEA ARMv8.2 adds a new bit HCR_EL2.TEA which routes synchronous external aborts to EL2, and adds a trap control bit HCR_EL2.TERR which traps all Non-secure EL1&0 error record accesses to EL2. This patch enables the two bits for the guest OS, guaranteeing that KVM takes external aborts and traps attempts to access the physical error registers. ERRIDR_EL1 advertises the number of error records, we return zero meaning we can treat all the other registers as RAZ/WI too. Signed-off-by: Dongjiu Geng [removed specific emulation, use trap_raz_wi() directly for everything, rephrased parts of the commit message] Signed-off-by: James Morse Reviewed-by: Christoffer Dall Reviewed-by: Marc Zyngier Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_arm.h | 2 ++ arch/arm64/include/asm/kvm_emulate.h | 7 +++++++ arch/arm64/include/asm/sysreg.h | 10 ++++++++++ arch/arm64/kvm/sys_regs.c | 10 ++++++++++ 4 files changed, 29 insertions(+) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 715d395ef45b..b0c84171e6a3 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -23,6 +23,8 @@ #include /* Hyp Configuration Register (HCR) bits */ +#define HCR_TEA (UL(1) << 37) +#define HCR_TERR (UL(1) << 36) #define HCR_E2H (UL(1) << 34) #define HCR_ID (UL(1) << 33) #define HCR_CD (UL(1) << 32) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index e002ab7f919a..413dc82b1e89 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -50,6 +50,13 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; if (is_kernel_in_hyp_mode()) vcpu->arch.hcr_el2 |= HCR_E2H; + if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) vcpu->arch.hcr_el2 &= ~HCR_RW; } diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 53d99c0944b1..0e1960c59197 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -176,6 +176,16 @@ #define SYS_AFSR0_EL1 sys_reg(3, 0, 5, 1, 0) #define SYS_AFSR1_EL1 sys_reg(3, 0, 5, 1, 1) #define SYS_ESR_EL1 sys_reg(3, 0, 5, 2, 0) + +#define SYS_ERRIDR_EL1 sys_reg(3, 0, 5, 3, 0) +#define SYS_ERRSELR_EL1 sys_reg(3, 0, 5, 3, 1) +#define SYS_ERXFR_EL1 sys_reg(3, 0, 5, 4, 0) +#define SYS_ERXCTLR_EL1 sys_reg(3, 0, 5, 4, 1) +#define SYS_ERXSTATUS_EL1 sys_reg(3, 0, 5, 4, 2) +#define SYS_ERXADDR_EL1 sys_reg(3, 0, 5, 4, 3) +#define SYS_ERXMISC0_EL1 sys_reg(3, 0, 5, 5, 0) +#define SYS_ERXMISC1_EL1 sys_reg(3, 0, 5, 5, 1) + #define SYS_FAR_EL1 sys_reg(3, 0, 6, 0, 0) #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9edf4ac8a320..50a43c7b97ca 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1159,6 +1159,16 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, + + { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi }, + { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi }, + { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, From 6b88a32c7af68895134872cdec3b6bfdb532d94e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 10 Jan 2018 13:18:30 +0000 Subject: [PATCH 098/104] arm64: kpti: Fix the interaction between ASID switching and software PAN With ARM64_SW_TTBR0_PAN enabled, the exception entry code checks the active ASID to decide whether user access was enabled (non-zero ASID) when the exception was taken. On return from exception, if user access was previously disabled, it re-instates TTBR0_EL1 from the per-thread saved value (updated in switch_mm() or efi_set_pgd()). Commit 7655abb95386 ("arm64: mm: Move ASID from TTBR0 to TTBR1") makes a TTBR0_EL1 + ASID switching non-atomic. Subsequently, commit 27a921e75711 ("arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN") changes the __uaccess_ttbr0_disable() function and asm macro to first write the reserved TTBR0_EL1 followed by the ASID=0 update in TTBR1_EL1. If an exception occurs between these two, the exception return code will re-instate a valid TTBR0_EL1. Similar scenario can happen in cpu_switch_mm() between setting the reserved TTBR0_EL1 and the ASID update in cpu_do_switch_mm(). This patch reverts the entry.S check for ASID == 0 to TTBR0_EL1 and disables the interrupts around the TTBR0_EL1 and ASID switching code in __uaccess_ttbr0_disable(). It also ensures that, when returning from the EFI runtime services, efi_set_pgd() doesn't leave a non-zero ASID in TTBR1_EL1 by using uaccess_ttbr0_{enable,disable}. The accesses to current_thread_info()->ttbr0 are updated to use READ_ONCE/WRITE_ONCE. As a safety measure, __uaccess_ttbr0_enable() always masks out any existing non-zero ASID TTBR1_EL1 before writing in the new ASID. Fixes: 27a921e75711 ("arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN") Acked-by: Will Deacon Reported-by: Ard Biesheuvel Tested-by: Ard Biesheuvel Reviewed-by: James Morse Tested-by: James Morse Co-developed-by: Marc Zyngier Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/asm-uaccess.h | 12 +++++++----- arch/arm64/include/asm/efi.h | 12 +++++++----- arch/arm64/include/asm/mmu_context.h | 3 ++- arch/arm64/include/asm/uaccess.h | 9 ++++++--- arch/arm64/kernel/entry.S | 2 +- arch/arm64/lib/clear_user.S | 2 +- arch/arm64/lib/copy_from_user.S | 2 +- arch/arm64/lib/copy_in_user.S | 2 +- arch/arm64/lib/copy_to_user.S | 2 +- arch/arm64/mm/cache.S | 2 +- arch/arm64/mm/proc.S | 3 +++ arch/arm64/xen/hypercall.S | 2 +- 12 files changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 8719ce122a38..4128bec033f6 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -14,11 +14,11 @@ #ifdef CONFIG_ARM64_SW_TTBR0_PAN .macro __uaccess_ttbr0_disable, tmp1 mrs \tmp1, ttbr1_el1 // swapper_pg_dir + bic \tmp1, \tmp1, #TTBR_ASID_MASK sub \tmp1, \tmp1, #RESERVED_TTBR0_SIZE // reserved_ttbr0 just before swapper_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb add \tmp1, \tmp1, #RESERVED_TTBR0_SIZE - bic \tmp1, \tmp1, #TTBR_ASID_MASK msr ttbr1_el1, \tmp1 // set reserved ASID isb .endm @@ -35,9 +35,11 @@ isb .endm - .macro uaccess_ttbr0_disable, tmp1 + .macro uaccess_ttbr0_disable, tmp1, tmp2 alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption __uaccess_ttbr0_disable \tmp1 + restore_irq \tmp2 alternative_else_nop_endif .endm @@ -49,7 +51,7 @@ alternative_if_not ARM64_HAS_PAN alternative_else_nop_endif .endm #else - .macro uaccess_ttbr0_disable, tmp1 + .macro uaccess_ttbr0_disable, tmp1, tmp2 .endm .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 @@ -59,8 +61,8 @@ alternative_else_nop_endif /* * These macros are no-ops when UAO is present. */ - .macro uaccess_disable_not_uao, tmp1 - uaccess_ttbr0_disable \tmp1 + .macro uaccess_disable_not_uao, tmp1, tmp2 + uaccess_ttbr0_disable \tmp1, \tmp2 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(1) alternative_else_nop_endif diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index c4cd5081d78b..8389050328bb 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -121,19 +121,21 @@ static inline void efi_set_pgd(struct mm_struct *mm) if (mm != current->active_mm) { /* * Update the current thread's saved ttbr0 since it is - * restored as part of a return from exception. Set - * the hardware TTBR0_EL1 using cpu_switch_mm() - * directly to enable potential errata workarounds. + * restored as part of a return from exception. Enable + * access to the valid TTBR0_EL1 and invoke the errata + * workaround directly since there is no return from + * exception when invoking the EFI run-time services. */ update_saved_ttbr0(current, mm); - cpu_switch_mm(mm->pgd, mm); + uaccess_ttbr0_enable(); + post_ttbr_update_workaround(); } else { /* * Defer the switch to the current thread's TTBR0_EL1 * until uaccess_enable(). Restore the current * thread's saved ttbr0 corresponding to its active_mm */ - cpu_set_reserved_ttbr0(); + uaccess_ttbr0_disable(); update_saved_ttbr0(current, current->active_mm); } } diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 5aee19905556..8d3331985d2e 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -184,7 +184,7 @@ static inline void update_saved_ttbr0(struct task_struct *tsk, else ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48; - task_thread_info(tsk)->ttbr0 = ttbr; + WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr); } #else static inline void update_saved_ttbr0(struct task_struct *tsk, @@ -239,6 +239,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev,next) switch_mm(prev, next, current) void verify_cpu_asid_bits(void); +void post_ttbr_update_workaround(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 01ade20d5456..59fda5292936 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -105,16 +105,18 @@ static inline void set_fs(mm_segment_t fs) #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void __uaccess_ttbr0_disable(void) { - unsigned long ttbr; + unsigned long flags, ttbr; + local_irq_save(flags); ttbr = read_sysreg(ttbr1_el1); + ttbr &= ~TTBR_ASID_MASK; /* reserved_ttbr0 placed before swapper_pg_dir */ write_sysreg(ttbr - RESERVED_TTBR0_SIZE, ttbr0_el1); isb(); /* Set reserved ASID */ - ttbr &= ~TTBR_ASID_MASK; write_sysreg(ttbr, ttbr1_el1); isb(); + local_irq_restore(flags); } static inline void __uaccess_ttbr0_enable(void) @@ -127,10 +129,11 @@ static inline void __uaccess_ttbr0_enable(void) * roll-over and an update of 'ttbr0'. */ local_irq_save(flags); - ttbr0 = current_thread_info()->ttbr0; + ttbr0 = READ_ONCE(current_thread_info()->ttbr0); /* Restore active ASID */ ttbr1 = read_sysreg(ttbr1_el1); + ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */ ttbr1 |= ttbr0 & TTBR_ASID_MASK; write_sysreg(ttbr1, ttbr1_el1); isb(); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 1e1d1842888d..b34e717d7597 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -204,7 +204,7 @@ alternative_if ARM64_HAS_PAN alternative_else_nop_endif .if \el != 0 - mrs x21, ttbr1_el1 + mrs x21, ttbr0_el1 tst x21, #TTBR_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 8f9c4641e706..3d69a8d41fa5 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -50,7 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 - uaccess_disable_not_uao x2 + uaccess_disable_not_uao x2, x3 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 69d86a80f3e2..20305d485046 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -67,7 +67,7 @@ ENTRY(__arch_copy_from_user) uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index e442b531252a..fbb090f431a5 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -68,7 +68,7 @@ ENTRY(raw_copy_in_user) uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 ret ENDPROC(raw_copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 318f15d5c336..fda6172d6b88 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -66,7 +66,7 @@ ENTRY(__arch_copy_to_user) uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 ret ENDPROC(__arch_copy_to_user) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 6cd20a8c0952..91464e7f77cc 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -72,7 +72,7 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU isb mov x0, #0 1: - uaccess_ttbr0_disable x1 + uaccess_ttbr0_disable x1, x2 ret 9: mov x0, #-EFAULT diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 05e4ae934b23..c6a12073ef46 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -153,6 +153,9 @@ ENDPROC(cpu_do_resume) ENTRY(cpu_do_switch_mm) mrs x2, ttbr1_el1 mmid x1, x1 // get mm->context.id +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + bfi x0, x1, #48, #16 // set the ASID field in TTBR0 +#endif bfi x2, x1, #48, #16 // set the ASID msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set) isb diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index acdbd2c9e899..c5f05c4a4d00 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -107,6 +107,6 @@ ENTRY(privcmd_call) /* * Disable userspace access from kernel once the hyp call completed. */ - uaccess_ttbr0_disable x6 + uaccess_ttbr0_disable x6, x7 ret ENDPROC(privcmd_call); From e9eaa8052fe71b95f4fea6072fa3e0b2cf0b620f Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Thu, 18 Jan 2018 19:13:11 +0000 Subject: [PATCH 099/104] arm64: mm: ignore memory above supported physical address size When booting a kernel without 52-bit PA support (e.g. a kernel with 4k pages) on a system with 52-bit memory, the kernel will currently try to use the 52-bit memory and crash. Fix this by ignoring any memory higher than what the kernel supports. Fixes: f77d281713d4 ("arm64: enable 52-bit physical address support") Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 672094ed7e07..285745b2ca38 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -366,6 +366,9 @@ void __init arm64_memblock_init(void) /* Handle linux,usable-memory-range property */ fdt_enforce_memory_region(); + /* Remove memory above our supported physical address size */ + memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX); + /* * Ensure that the linear region takes up exactly half of the kernel * virtual address space. This way, we can distinguish a linear address From a8e4c0a919ae310944ed2c9ace11cf3ccd8a609b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 19 Jan 2018 15:42:09 +0000 Subject: [PATCH 100/104] arm64: Move BP hardening to check_and_switch_context We call arm64_apply_bp_hardening() from post_ttbr_update_workaround, which has the unexpected consequence of being triggered on every exception return to userspace when ARM64_SW_TTBR0_PAN is selected, even if no context switch actually occured. This is a bit suboptimal, and it would be more logical to only invalidate the branch predictor when we actually switch to a different mm. In order to solve this, move the call to arm64_apply_bp_hardening() into check_and_switch_context(), where we're guaranteed to pick a different mm context. Acked-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas --- arch/arm64/mm/context.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index ff99a880a730..301417ae2ba8 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -234,6 +234,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: + + arm64_apply_bp_hardening(); + /* * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when * emulating PAN. @@ -249,8 +252,6 @@ asmlinkage void post_ttbr_update_workaround(void) "ic iallu; dsb nsh; isb", ARM64_WORKAROUND_CAVIUM_27456, CONFIG_CAVIUM_ERRATUM_27456)); - - arm64_apply_bp_hardening(); } static int asids_init(void) From 55b35d070c2534dfb714b883f3c3ae05d02032da Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 17 Jan 2018 17:42:20 +0000 Subject: [PATCH 101/104] arm64: Run enable method for errata work arounds on late CPUs When a CPU is brought up after we have finalised the system wide capabilities (i.e, features and errata), we make sure the new CPU doesn't need a new errata work around which has not been detected already. However we don't run enable() method on the new CPU for the errata work arounds already detected. This could cause the new CPU running without potential work arounds. It is upto the "enable()" method to decide if this CPU should do something about the errata. Fixes: commit 6a6efbb45b7d95c84 ("arm64: Verify CPU errata work arounds on hotplugged CPU") Cc: Will Deacon Cc: Mark Rutland Cc: Andre Przywara Cc: Dave Martin Signed-off-by: Suzuki K Poulose Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpu_errata.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 90a9e465339c..54e41dfe41f6 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -373,15 +373,18 @@ void verify_local_cpu_errata_workarounds(void) { const struct arm64_cpu_capabilities *caps = arm64_errata; - for (; caps->matches; caps++) - if (!cpus_have_cap(caps->capability) && - caps->matches(caps, SCOPE_LOCAL_CPU)) { + for (; caps->matches; caps++) { + if (cpus_have_cap(caps->capability)) { + if (caps->enable) + caps->enable((void *)caps); + } else if (caps->matches(caps, SCOPE_LOCAL_CPU)) { pr_crit("CPU%d: Requires work around for %s, not detected" " at boot time\n", smp_processor_id(), caps->desc ? : "an erratum"); cpu_die_early(); } + } } void update_cpu_errata_workarounds(void) From f3d795d9b360523beca6d13ba64c2c532f601149 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 19 Jan 2018 04:22:47 -0800 Subject: [PATCH 102/104] arm64: Branch predictor hardening for Cavium ThunderX2 Use PSCI based mitigation for speculative execution attacks targeting the branch predictor. We use the same mechanism as the one used for Cortex-A CPUs, we expect the PSCI version call to have a side effect of clearing the BTBs. Acked-by: Will Deacon Signed-off-by: Jayachandran C Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpu_errata.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 54e41dfe41f6..ed6881882231 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -359,6 +359,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_HARDEN_BP_POST_GUEST_EXIT, MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), + .enable = enable_psci_bp_hardening, + }, #endif { } From 0ba2e29c7fc1d58a90fab614d41bf487e28e3840 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 19 Jan 2018 04:22:48 -0800 Subject: [PATCH 103/104] arm64: Turn on KPTI only on CPUs that need it Whitelist Broadcom Vulcan/Cavium ThunderX2 processors in unmap_kernel_at_el0(). These CPUs are not vulnerable to CVE-2017-5754 and do not need KPTI when KASLR is off. Acked-by: Will Deacon Signed-off-by: Jayachandran C Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5612d6f46331..4b6f9051cf0c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -868,6 +868,13 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) return true; + /* Don't force KPTI for CPUs that are not vulnerable */ + switch (read_cpuid_id() & MIDR_CPU_MODEL_MASK) { + case MIDR_CAVIUM_THUNDERX2: + case MIDR_BRCM_VULCAN: + return false; + } + /* Defer to CPU feature registers */ return !cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV3_SHIFT); From ec89ab50a03a33a4a648869e868b1964354fb2d1 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Wed, 24 Jan 2018 08:27:08 +0000 Subject: [PATCH 104/104] arm64: Fix TTBR + PAN + 52-bit PA logic in cpu_do_switch_mm In cpu_do_switch_mm(.) with ARM64_SW_TTBR0_PAN=y we apply phys_to_ttbr to a value that already has an ASID inserted into the upper bits. For 52-bit PA configurations this then can give us TTBR0_EL1 registers that cause translation table walks to attempt to access non-zero PA[51:48] spuriously. Ultimately leading to a Synchronous External Abort on level 1 translation. This patch re-arranges the logic in cpu_do_switch_mm(.) such that phys_to_ttbr is called before the ASID is inserted into the TTBR0 value. Fixes: 6b88a32c7af6 ("arm64: kpti: Fix the interaction between ASID switching and software PAN") Acked-by: Suzuki K Poulose Tested-by: Kristina Martsenko Reviewed-by: Kristina Martsenko Signed-off-by: Steve Capper Signed-off-by: Catalin Marinas --- arch/arm64/mm/proc.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index c6a12073ef46..9f177aac6390 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -153,14 +153,14 @@ ENDPROC(cpu_do_resume) ENTRY(cpu_do_switch_mm) mrs x2, ttbr1_el1 mmid x1, x1 // get mm->context.id + phys_to_ttbr x0, x3 #ifdef CONFIG_ARM64_SW_TTBR0_PAN - bfi x0, x1, #48, #16 // set the ASID field in TTBR0 + bfi x3, x1, #48, #16 // set the ASID field in TTBR0 #endif bfi x2, x1, #48, #16 // set the ASID msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set) isb - phys_to_ttbr x0, x2 - msr ttbr0_el1, x2 // now update TTBR0 + msr ttbr0_el1, x3 // now update TTBR0 isb b post_ttbr_update_workaround // Back to C code... ENDPROC(cpu_do_switch_mm)